import codecs
import json

import nltk
import numpy as np
from tqdm import tqdm

# make_path, normalize_text, and counter2ordered_dict are project-local
# helpers assumed to be defined elsewhere in this repo.


def write_example(e, filename):
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        for k in e:
            # Only dump fields with simple, printable types.
            if isinstance(e[k], (np.ndarray, list, int, float, str)):
                fh.write(str(k) + "\n")
                fh.write(str(e[k]) + "\n\n")
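

# Hypothetical usage sketch (not part of the original module): write_example
# dumps each simple-typed field of a feature dict, one "key\nvalue\n\n" block
# per field; values of other types are silently skipped.
def _demo_write_example():
    example = {
        "question": "Who wrote Hamlet?",  # str
        "answer_start": 42,               # int
        "token_ids": [101, 2040, 2626],   # list
    }
    write_example(example, "debug/example_0.txt")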


def squad2sentences(input_path, output_path, paragraphs_path,
                    max_length=100, min_length=5,
                    max_plength=400, min_plength=5):
    make_path(output_path)
    make_path(paragraphs_path)
    outfile = open(output_path, 'w', encoding='utf8')
    outfile_p = open(paragraphs_path, 'w', encoding='utf8')
    with codecs.open(input_path, "r", encoding='utf8') as infile:
        source = json.load(infile)
        pid = 0
        sid = 0
        for article in tqdm(source["data"]):
            for para in article["paragraphs"]:
                context = para["context"]
                p = context
                # Skip paragraphs outside the allowed length range.
                len_p = len(p.split())
                if len_p >= max_plength or len_p <= min_plength:
                    continue
                p = normalize_text(p)
                outfile_p.write(
                    str(pid) + "\t" +
                    p.rstrip().replace("\n", "\\n") + "\n")
                sentences = nltk.sent_tokenize(context)
                for s in sentences:
                    # Skip sentences outside the allowed length range.
                    len_s = len(s.split())
                    if len_s >= max_length or len_s <= min_length:
                        continue
                    s = normalize_text(s)
                    outfile.write(
                        str(pid) + "\t" + str(sid) + "\t" +
                        s.rstrip().replace("\n", "\\n") + "\n")
                    sid += 1
                pid += 1
    outfile.close()
    outfile_p.close()
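

# Hypothetical usage sketch: squad2sentences expects the official SQuAD JSON
# layout, i.e. {"data": [{"paragraphs": [{"context": ...}, ...]}, ...]}, and
# emits two TSV files: "pid\tparagraph" and "pid\tsid\tsentence". The paths
# below are placeholders.
def _demo_squad2sentences():
    squad2sentences(
        input_path="data/squad/train-v2.0.json",
        output_path="output/squad_sentences.txt",
        paragraphs_path="output/squad_paragraphs.txt")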


def wiki2sentences(input_path, output_path, paragraphs_path,
                   max_length=100, min_length=5,
                   max_plength=400, min_plength=5):
    make_path(output_path)
    make_path(paragraphs_path)
    outfile = open(output_path, 'w', encoding='utf8')
    outfile_p = open(paragraphs_path, 'w', encoding='utf8')
    with codecs.open(input_path, encoding='utf8') as infile:
        data = json.load(infile)
        pid = 0
        sid = 0
        for k in data:
            paragraph_list = data[k]
            for p in paragraph_list:
                # Skip paragraphs outside the allowed length range.
                len_p = len(p.split())
                if len_p >= max_plength or len_p <= min_plength:
                    continue
                p = normalize_text(p)
                outfile_p.write(
                    str(pid) + "\t" +
                    p.rstrip().replace("\n", "\\n") + "\n")
                sentences = nltk.sent_tokenize(p)
                for s in sentences:
                    # Skip sentences outside the allowed length range.
                    len_s = len(s.split())
                    if len_s >= max_length or len_s <= min_length:
                        continue
                    s = normalize_text(s)
                    outfile.write(
                        str(pid) + "\t" + str(sid) + "\t" +
                        s.rstrip().replace("\n", "\\n") + "\n")
                    sid += 1
                pid += 1
    outfile.close()
    outfile_p.close()
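

# Hypothetical usage sketch: wiki2sentences expects a JSON object that maps a
# document key to a list of paragraph strings, e.g.
# {"Alan_Turing": ["Alan Turing was an English mathematician. ...", ...]}.
# The paths below are placeholders.
def _demo_wiki2sentences():
    wiki2sentences(
        input_path="data/wiki/paragraphs.json",
        output_path="output/wiki_sentences.txt",
        paragraphs_path="output/wiki_paragraphs.txt")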


def write_2d_list(list_2d, filename):
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        fh.writelines('\t'.join(str(j) for j in i) + '\n' for i in list_2d)
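

# Hypothetical usage sketch: each inner list becomes one tab-separated line,
# so the rows below yield "token\tpos\tner" and "Turing\tNNP\tPERSON".
def _demo_write_2d_list():
    rows = [["token", "pos", "ner"],
            ["Turing", "NNP", "PERSON"]]
    write_2d_list(rows, "output/rows.tsv")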


def write_counter(c, filename):
    ordered_c = counter2ordered_dict(c)
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        for k in ordered_c:
            fh.write(str(k) + " " + str(ordered_c[k]) + "\n")


def write_dict(d, filename):
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        for k in d:
            fh.write(str(k) + " " + str(d[k]) + "\n")
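

# Hypothetical usage sketch: write_counter orders entries through the
# counter2ordered_dict helper (presumably by descending count) before writing,
# while write_dict keeps the dict's own iteration order; both emit one
# "key value" line per entry.
def _demo_write_vocab():
    from collections import Counter
    counts = Counter(["the", "the", "cat"])
    write_counter(counts, "output/vocab_counts.txt")  # e.g. "the 2\ncat 1"
    write_dict({"the": 0, "cat": 1}, "output/word2id.txt")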