in_dict["q_types"] = list(q_types) in_dict["q_is_paraphrase"] = i["is_paraphrase"] list_for_analysis.append(in_dict) # Store dict for analysis print ("Created the list of " + spl + " for analysis with the length of %d" % len(list_for_analysis)) f_pickle = open("gen_data/" + spl + "_list_for_analysis.pickle", "wb") pickle.dump(list_for_analysis, f_pickle, protocol=2) f_pickle.close() print ("Created the list of " + spl + " of raw data with the length of %d" % len(data_splits[spl])) # Store raw dict (all paragraphs ordered) f_pickle = open("gen_data/" + spl + "_raw_list.pickle", "wb") pickle.dump(data_splits[spl], f_pickle, protocol=2) f_pickle.close() print ("\n") for i in ["train", "validate", "test"]: file_name = i + ".txt" dep_file_name = i + "_to_parse.txt" print ("Packing %s set with the size of %d samples to file: %s and dep data to file %s" % (i, len(data_splits[i]), file_name, dep_file_name)) data_generator.generate_txt_file(data_splits[i], i + ".txt") dep_data_generator.generate_file(data_splits[i], i + "_to_parse.txt") # pprint(p_data) # print ("Length of returned data: %d" % len(p_data))
####
# INPUT FOR THE SCRIPT
####
main_data_file = "data/data_all.json"
####

# This is the main data (from Tim)
with open(main_data_file) as data_file:
    data = json.load(data_file)

# Segment it into JSON
segmented_data = parser.parse_data(data)

# Collect the annotated paragraphs from every batch CSV file
annotated_paragraphs = []
batch_files = ["batch1.csv", "batch2.csv", "batch3.csv", "batch4.csv", "batch5.csv",
               "batch100.csv", "batch101.csv",
               "batch1sq.csv", "batch2sq.csv", "batch3sq.csv", "batch4sq.csv", "batch5sq.csv",
               "batch100sq.csv", "batch101sq.csv"]

for batch_file in batch_files:
    annotated_paragraphs.extend(csv_parser.extract_batch_file(segmented_data, batch_file))

print("Extracted %d paragraphs (questions)" % len(annotated_paragraphs))

# Generate the text file and the data to be dependency-parsed
# print(annotated_paragraphs[0])
data_generator.generate_txt_file(annotated_paragraphs, "test.txt")
dep_data_generator.generate_file(annotated_paragraphs, "data_to_parse.txt")
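# A commented-out sketch (illustrative only, not part of the original pipeline)
# for sanity-checking the files written above; os.path.getsize is standard
# library and the file names match the two generate calls.
# import os
# for out_file in ["test.txt", "data_to_parse.txt"]:
#     print("%s: %d bytes" % (out_file, os.path.getsize(out_file)))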