Пример #1
0
            in_dict["q_types"] = list(q_types)

        in_dict["q_is_paraphrase"] = i["is_paraphrase"]

        list_for_analysis.append(in_dict)

    # Store the analysis records collected for this split.
    # The `with` block closes the file even if pickle.dump raises, so a failed
    # dump no longer leaks an open handle.
    # NOTE(review): protocol=2 is kept unchanged; it is the highest pickle
    # protocol Python 2 can read -- presumably chosen for that reason, confirm.
    print ("Created the list of %s for analysis with the length of %d" % (spl, len(list_for_analysis)))
    with open("gen_data/" + spl + "_list_for_analysis.pickle", "wb") as f_pickle:
        pickle.dump(list_for_analysis, f_pickle, protocol=2)

    print ("Created the list of %s of raw data with the length of %d" % (spl, len(data_splits[spl])))
    # Store the raw list for this split (all paragraphs ordered).
    with open("gen_data/" + spl + "_raw_list.pickle", "wb") as f_pickle:
        pickle.dump(data_splits[spl], f_pickle, protocol=2)

print("\n")

# For each split, report what is about to be written, then produce both
# output files: "<split>.txt" and "<split>_to_parse.txt".
for split_name in ("train", "validate", "test"):
    file_name = split_name + ".txt"
    dep_file_name = split_name + "_to_parse.txt"
    split_samples = data_splits[split_name]

    print("Packing %s set with the size of %d samples to file: %s and dep data to file %s"
          % (split_name, len(split_samples), file_name, dep_file_name))

    # Reuse the names computed above so the log line and the files written
    # always refer to the same paths.
    data_generator.generate_txt_file(split_samples, file_name)
    dep_data_generator.generate_file(split_samples, dep_file_name)


# pprint(p_data)
# print ("Length of returned data: %d" % len(p_data))
Пример #2
0
# ---------------------------------------------------------------------------
# Script input: path of the JSON file that holds the complete data set.
# ---------------------------------------------------------------------------
main_data_file = "data/data_all.json"


# Read the main data (from Tim) from disk.
with open(main_data_file) as json_source:
    data = json.load(json_source)

# Hand the loaded data to the project parser to get its segmented form.
segmented_data = parser.parse_data(data)

# Batch CSV files to read: every batch id first in its plain form
# ("batchN.csv"), then again with the "sq" suffix ("batchNsq.csv").
batch_files = ["batch%d%s.csv" % (batch_id, suffix)
               for suffix in ("", "sq")
               for batch_id in (1, 2, 3, 4, 5, 100, 101)]

# Collect what csv_parser extracts from each batch file, in list order.
annotated_paragraphs = []
for batch_name in batch_files:
    batch_paragraphs = csv_parser.extract_batch_file(segmented_data, batch_name)
    annotated_paragraphs.extend(batch_paragraphs)

paragraph_count = len(annotated_paragraphs)
print("Extracted %d paragraphs (questions)" % paragraph_count)

# Write the extracted paragraphs out twice: once through data_generator and
# once through dep_data_generator.
# Debug aid: print (annotated_paragraphs[0])

txt_output_name = "test.txt"
to_parse_output_name = "data_to_parse.txt"

data_generator.generate_txt_file(annotated_paragraphs, txt_output_name)
dep_data_generator.generate_file(annotated_paragraphs, to_parse_output_name)