def merge_data_and_labels(labels_file, data_file, out_file):
    """Pair each sentence in data_file with the label at the same position
    in labels_file and write the result to out_file, one tab-separated
    sentence/label pair per line (UTF-8).
    """
    # Load both data sets; only the sentences of data_file and the labels
    # of labels_file are used.
    X_labels, Y_labels = load_data(labels_file)
    X_data, Y_data = load_data(data_file)

    with codecs.open(out_file, "wb", "utf-8") as f:
        for i, sentence in enumerate(X_data):
            f.write(u"{}\t{}\n".format(sentence.decode("utf-8"), Y_labels[i]))
def get_emotion_body_part_pairs(file_name):
    """Count co-occurrences of emotion labels and body-part labels.

    Loads the labeled data set in file_name, splits each label string on
    '_', and for every sentence containing at least one emotion label and
    at least one body-part label counts each (emotion, body part) pair.

    Returns a tuple (emotions, emotions2body) where emotions is a Counter
    keyed on emotion label (incremented once per emotion/body-part pair)
    and emotions2body maps each emotion label to a Counter of body parts.
    """
    X_data, Y_data = load_data(file_name)

    emotions2body = {}
    emotions = Counter()

    for label_str in Y_data:
        labelset = label_str.split('_')
        body_parts = [lb for lb in labelset if lb in heem_body_part_labels]
        emotion_lbls = [lb for lb in labelset if lb in heem_emotion_labels]

        # Only sentences that carry both kinds of label contribute counts.
        if not (body_parts and emotion_lbls):
            continue

        for em in emotion_lbls:
            counter = emotions2body.get(em)
            if not counter:
                counter = Counter()
                emotions2body[em] = counter
            for bp in body_parts:
                counter[bp] += 1
                # Note: incremented per pair, not per sentence.
                emotions[em] += 1

    return emotions, emotions2body
# Script body: count emotion / concept-type label pairs per input file and
# write the counts to a file of the same name in output_dir.
args = parser.parse_args()
input_dir = args.input_dir
output_dir = args.output_dir

# Create the output directory if it does not exist yet.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# NOTE(review): `pairs` is created once, outside the file loop, so each
# output file contains counts accumulated over ALL files processed so far,
# not just its own — confirm this cumulative behavior is intended.
pairs = Counter()
text_files = glob.glob(os.path.join(input_dir, "*.txt"))
for i, text_file in enumerate(text_files):
    # Python 2 print statement: progress indicator.
    print "({} of {}) {}".format((i + 1), len(text_files), text_file)
    # NOTE(review): assumes a fixed-width, 13-character id right before the
    # ".txt" extension in the file path — verify against the corpus naming.
    text_id = text_file[-17:-4]
    X_data, Y_data = load_data(text_file)
    out_file = os.path.join(output_dir, os.path.basename(text_file))
    for j, predicted in enumerate(Y_data):
        # Unique labels for this sentence, with the "None" placeholder removed.
        lbs = set(predicted.split("_")) - {"None"}
        emotion_labels = [l for l in lbs if l in heem_emotion_labels]
        ct_labels = [l for l in lbs if l in heem_concept_type_labels]
        # Count every emotion x concept-type combination in the sentence.
        if emotion_labels and ct_labels:
            for e in emotion_labels:
                for ct in ct_labels:
                    pairs["{}\t{}".format(e, ct)] += 1
    # Write text_id, pair, frequency (tab-separated), most frequent first.
    with codecs.open(out_file, "wb", "utf-8") as f:
        for pair, freq in pairs.most_common():
            f.write("{}\t{}\t{}\n".format(text_id, pair, freq))
# Script body: gather per-text statistics (line count, number of emotional
# sentences, average labels per sentence), join them with the time period
# from the corpus metadata file, and write the result as CSV.
parser.add_argument('in_dir', help='the directory containing the'
                    ' files with the correct labels.')
parser.add_argument('out_file', help='csv file to write the output to')
args = parser.parse_args()
corpus = args.corpus
in_dir = args.in_dir
out_file = args.out_file

# Columns of the statistics DataFrame; index holds the text ids.
data = {'#lines': [], '#emotional': [], 'avg_labels': []}
index = []

# get # of lines
text_files = glob.glob('{}/*.txt'.format(in_dir))
for t in text_files:
    # Text id = file name without the .txt extension.
    text_id = os.path.basename(t).replace('.txt', '')
    index.append(text_id)

    X_data, Y_data = load_data(t)

    data['#lines'].append(count_lines(t))
    data['#emotional'].append(num_emotional_sentences(Y_data))
    data['avg_labels'].append(average_number_of_labels(Y_data))

df = pd.DataFrame(data=data, index=index)

# get time period
# NOTE(review): `corpus` is rebound from the metadata file path (str) to the
# parsed DataFrame here — works, but consider a second name for clarity.
corpus = pd.read_csv(corpus, sep='\t', header=None, index_col=0)
corpus.loc[:, 'period'] = corpus.apply(get_tp, axis=1)

# write result to file (aligned on text id via the shared index)
result = pd.concat([df, corpus['period']], axis=1)
result.to_csv(out_file)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input_dir',
                        help='the directory where the input text files can be found.')
    parser.add_argument('output_dir',
                        help='the directory where the output files should be written.')
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Translate the HEEM labels of every .txt file in input_dir to English
    # (unknown labels become 'None') and write each result to a file with
    # the same name in output_dir.
    for text_file in [t for t in os.listdir(input_dir) if t.endswith('.txt')]:
        x_data, y_data = load_data(os.path.join(input_dir, text_file))

        with codecs.open(os.path.join(output_dir, text_file), 'wb', 'utf-8') as f:
            for sentence, label_str in zip(x_data, y_data):
                translated = [heem_labels_en.get(lb, 'None')
                              for lb in label_str.split('_')]
                f.write(u'{}\t{}\n'.format(sentence.decode('utf-8'),
                                           '_'.join(translated)))