def convert_samples(filenames):
    """Tally how many samples carry each topic across all input JSON files.

    NOTE(review): the source chunk is truncated — the ``def`` header and the
    ``stats_dict = {}`` initialisation were not visible and are reconstructed
    here from the call site below (``convert_samples(filenames)``) and from
    the counting logic; confirm against the original file.

    Args:
        filenames: paths of JSON files, each presumably holding a list of
            dicts with a 'topic' key (loaded via the project helper
            ``json_load``) — verify against the data files.

    Returns:
        list[dict]: one ``{"topic": ..., "amount": ...}`` entry per distinct
        topic, sorted by descending count.
    """
    stats_dict = {}
    for filename in filenames:
        samples = json_load(filename)
        for sample in tqdm(samples):
            topic = sample['topic']
            # dict.get replaces the original if/else counting branches.
            stats_dict[topic] = stats_dict.get(topic, 0) + 1
    # The original rebuilt and re-sorted the output inside the per-file loop
    # and discarded every pass but the last; hoisting it out produces the
    # same final return value without the redundant work.
    return [{"topic": topic, "amount": amount}
            for topic, amount in sorted(stats_dict.items(),
                                        key=lambda item: item[1],
                                        reverse=True)]


if __name__ == '__main__':
    # argv layout: <num_inputs> <input_1> ... <input_n> <output_file>
    input_num = int(sys.argv[1])
    filenames = []
    for i in range(1, input_num + 1):
        print(sys.argv[i + 1])
        filenames.append(sys.argv[i + 1])
    output_file = sys.argv[input_num + 2]
    json_dump(convert_samples(filenames), output_file)
import sys
from tqdm import tqdm
from data.utils import json_load, json_dump
''' cmd args data\empatheticdialogue\train_json.json data\empatheticdialogue\train_json_classification.json data\empatheticdialogue\test_json.json data\empatheticdialogue\test_json_classification.json data\empatheticdialogue\valid_json.json data\empatheticdialogue\valid_json_classification.json '''


def convert_samples(filename):
    """Flatten a dialogue JSON file into one record per utterance.

    Each input sample carries a 'context' label and a 'content' list of
    turns; every turn becomes its own ``{'text': ..., 'topic': ...}`` dict,
    with the sample's 'context' reused as the topic label.
    """
    dialogues = json_load(filename)
    return [
        {'text': utterance, 'topic': dialogue['context']}
        for dialogue in tqdm(dialogues)
        for utterance in dialogue['content']
    ]


if __name__ == '__main__':
    source_path = sys.argv[1]
    target_path = sys.argv[2]
    json_dump(convert_samples(source_path), target_path)
# NOTE(review): this chunk is a whitespace-mangled fragment that begins
# mid-function — the enclosing `def get_samples(...)` header, the loop over
# the aligned text/act/emotion inputs, and the definitions of `acts_dict`,
# `emotions_dict` and `get_text` are not visible here, so the code is left
# byte-identical rather than reconstructed.
# What the visible code does: splits each dialogue on ' __eou__ ' into
# turns, zips them with per-turn act/emotion codes mapped through
# acts_dict/emotions_dict, collects them into `samples`, and finally sorts
# samples by dialogue length. The __main__ section wires six argv paths
# (presumably DailyDialog-style text/act/emotion files — verify) and dumps
# the result as JSON.
# NOTE(review): `t.rstrip('__eou__')` treats its argument as a CHARACTER SET
# ('_', 'e', 'o', 'u'), not a suffix — text legitimately ending in any of
# those characters gets eaten. Likely latent bug; confirm intent before use.
txt, act, emotion = txt.split( ' __eou__ '), act.split(), emotion.split() content = [] for t, a, e in zip(txt, act, emotion): utterance = { 'text': t.rstrip('__eou__').strip(), 'act': acts_dict[int(a)], 'emotion': emotions_dict[int(e)] } content.append(utterance) sample = {'topic': topic, 'length': len(content), 'content': content} samples.append(sample) samples.sort(key=lambda x: x['length']) return samples if __name__ == '__main__': overall_filename, topic_filename = sys.argv[1], sys.argv[2] topics_list = { o: int(t) for o, t in zip(get_text(overall_filename), get_text(topic_filename)) } input_text, input_act, input_emotion = sys.argv[3], sys.argv[4], sys.argv[ 5] output_filename = sys.argv[6] samples = get_samples(input_text, input_act, input_emotion, topics_list) json_dump(samples, output_filename)
def clean(s):
    """Normalise whitespace in *s*; return ``(cleaned_text, word_count)``.

    NOTE(review): the source chunk begins mid-function — the ``def clean``
    header (and possibly earlier substitution steps) was truncated; the
    header is reconstructed from the call site in get_samples(). Verify
    against the original file.
    """
    # Collapse "dot(s) + newline(s) + comma" sequences into ".\n".
    s = re.sub(r'[.]+[\n]+[,]', ".\n", s)
    # split()/join collapses every whitespace run to a single space.
    words = s.split()
    return ' '.join(words), len(words)


def get_samples(filename, word_threshold=100):
    """Read a CSV of articles and keep short overviews as neutral samples.

    Expected columns: overview,headline,text,sectionLabel,title. Only the
    first column (overview) is used; rows whose cleaned overview has fewer
    than *word_threshold* words become ``{'emotion': 'neutral',
    'content': ...}`` records.
    """
    samples = []
    # newline='' is the csv-module-recommended way to open CSV files: it
    # stops universal-newline translation from corrupting quoted fields
    # that contain embedded line breaks.
    with open(filename, 'r', encoding="utf8", newline='') as f:
        data = csv.reader(f)
        # overview,headline,text,sectionLabel,title
        # next(data, None) instead of next(data): an empty file no longer
        # raises StopIteration.
        header = next(data, None)
        for row in tqdm(data):
            # currently using overview (not sure which is the best choice)
            # some seems to have repetition
            content, length = clean(row[0].strip())
            if length < word_threshold:
                samples.append({'emotion': 'neutral', 'content': content})
    return samples


if __name__ == '__main__':
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    json_dump(get_samples(input_file), output_file)
# NOTE(review): this whitespace-mangled chunk begins with a dangling
# `return output` — the tail of an earlier function that is not visible
# here (presumably `convert_samples_with_filter`, which __main__ calls
# below — verify). The code is left byte-identical rather than
# reconstructed.
# What the visible code does: `convert_samples` seeds the RNG with a fixed
# value (random.seed(9)) so the split is reproducible, then assigns each
# utterance of every sample to one of two lists with an ~80/20
# random.random() split, and returns [larger_split, smaller_split].
# __main__ instead uses `convert_samples_with_filter` (defined out of
# view) with a fixed topic filter list; the direct `convert_samples` call
# is kept commented out as an alternative entry point.
return output def convert_samples(filename): samples = json_load(filename) output = [] output2 = [] random.seed(9) for sample in tqdm(samples): topic = sample['topic'] content = sample['content'] for turn in content: if random.random() >= 0.2: output.append({'text': turn['text'], 'topic': topic}) else: output2.append({'text': turn['text'], 'topic': topic}) result = [output, output2] return result if __name__ == '__main__': input_file = sys.argv[1] output_file = sys.argv[2] filter_list = ['Politics', 'Attitude & Emotion', 'Relationship', 'Health'] json_dump(convert_samples_with_filter(input_file, filter_list), output_file) # json_dump(convert_samples(input_file), output_file)