Example #1
import codecs

import numpy as np

# `make_path` is assumed to be a project helper that creates the parent
# directories of `filename` if they do not exist.


def write_example(e, filename):
    """Write each scalar, list, or ndarray value of dict `e` to a UTF-8 text file."""
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        for k in e:
            # Only write values whose str() form is readable in a text dump.
            if isinstance(e[k], (np.ndarray, list, int, float, str)):
                fh.write(str(k) + "\n")
                fh.write(str(e[k]) + "\n\n")
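A minimal usage sketch; the output path and dict contents below are hypothetical, and the assumed make_path helper must be in scope:

import numpy as np

example = {
    "tokens": ["hello", "world"],   # list value
    "embedding": np.zeros(3),       # ndarray value
    "length": 2,                    # int value
}
write_example(example, "out/example.txt")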
Example #2
import codecs
import json

import nltk
from tqdm import tqdm

# `make_path` and `normalize_text` are assumed to be project helpers that
# create parent directories and normalize whitespace/unicode, respectively.


def squad2sentences(input_path,
                    output_path,
                    paragraphs_path,
                    max_length=100,
                    min_length=5,
                    max_plength=400,
                    min_plength=5):
    """Split SQuAD contexts into sentences and write two TSV files:
    paragraphs as (pid, text) lines and sentences as (pid, sid, text) lines."""
    make_path(output_path)
    make_path(paragraphs_path)
    outfile = open(output_path, 'w', encoding='utf8')
    outfile_p = open(paragraphs_path, 'w', encoding='utf8')
    with codecs.open(input_path, "r", encoding='utf8') as infile:
        source = json.load(infile)
        pid = 0
        sid = 0
        for article in tqdm(source["data"]):
            for para in article["paragraphs"]:
                context = para["context"]
                p = context
                # Skip paragraphs outside the allowed word-count range.
                len_p = len(p.split())
                if len_p >= max_plength or len_p <= min_plength:
                    continue
                p = normalize_text(p)
                outfile_p.write(
                    str(pid) + "\t" + p.rstrip().replace("\n", "\\n") + "\n")
                sentences = nltk.sent_tokenize(context)
                for s in sentences:
                    # Skip sentences outside the allowed word-count range.
                    len_s = len(s.split())
                    if len_s >= max_length or len_s <= min_length:
                        continue
                    s = normalize_text(s)
                    outfile.write(
                        str(pid) + "\t" + str(sid) + "\t" +
                        s.rstrip().replace("\n", "\\n") + "\n")
                    sid += 1
                pid += 1
    # `infile` is already closed by the `with` block; close the output handles.
    outfile.close()
    outfile_p.close()
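A minimal invocation sketch with hypothetical paths; note that nltk.sent_tokenize requires the punkt tokenizer model, which can be fetched once with nltk.download:

import nltk

nltk.download("punkt")  # one-time download of the sentence tokenizer model
squad2sentences("data/train-v1.1.json",  # SQuAD-format JSON (hypothetical path)
                "out/sentences.tsv",     # (pid, sid, sentence) lines
                "out/paragraphs.tsv")    # (pid, paragraph) lines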
Example #3
import codecs
import json

import nltk

# As in Example #2, `make_path` and `normalize_text` are assumed project helpers.


def wiki2sentences(input_path,
                   output_path,
                   paragraphs_path,
                   max_length=100,
                   min_length=5,
                   max_plength=400,
                   min_plength=5):
    """Split wiki paragraphs (a JSON dict of key -> list of paragraph strings)
    into sentences, writing the same TSV layout as squad2sentences."""
    make_path(output_path)
    make_path(paragraphs_path)
    outfile = open(output_path, 'w', encoding='utf8')
    outfile_p = open(paragraphs_path, 'w', encoding='utf8')
    with codecs.open(input_path, encoding='utf8') as infile:
        data = json.load(infile)
    pid = 0
    sid = 0
    for k in data:
        paragraph_list = data[k]
        for p in paragraph_list:
            # Skip paragraphs outside the allowed word-count range.
            len_p = len(p.split())
            if len_p >= max_plength or len_p <= min_plength:
                continue
            p = normalize_text(p)
            outfile_p.write(
                str(pid) + "\t" + p.rstrip().replace("\n", "\\n") + "\n")
            sentences = nltk.sent_tokenize(p)
            for s in sentences:
                # Skip sentences outside the allowed word-count range.
                len_s = len(s.split())
                if len_s >= max_length or len_s <= min_length:
                    continue
                s = normalize_text(s)
                outfile.write(
                    str(pid) + "\t" + str(sid) + "\t" +
                    s.rstrip().replace("\n", "\\n") + "\n")
                sid += 1
            pid += 1
    # `infile` was already closed when the `with` block ended above.
    outfile.close()
    outfile_p.close()
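A sketch of the input shape this function appears to expect, with hypothetical paths: a JSON object mapping each key (e.g. an article title) to a list of paragraph strings. With the default filters, paragraphs of 5 words or fewer are dropped:

import json

wiki = {"Some Article": [
    "A first paragraph that is long enough to pass the length filter.",
    "A second paragraph, also longer than five words.",
]}
with open("data/wiki.json", "w", encoding="utf8") as f:
    json.dump(wiki, f)

wiki2sentences("data/wiki.json", "out/sent.tsv", "out/para.tsv")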
Example #4
import codecs


def write_2d_list(list_2d, filename):
    # Write each row of a 2-D list as one tab-separated line.
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        fh.writelines('\t'.join(str(j) for j in i) + '\n' for i in list_2d)
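A quick usage sketch with a hypothetical output path:

rows = [["token", "count"], ["hello", 3], ["world", 1]]
write_2d_list(rows, "out/rows.tsv")  # writes three tab-separated lines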
Example #5
import codecs


def write_counter(c, filename):
    # `counter2ordered_dict` is an assumed project helper that returns the
    # counter's items as an OrderedDict, presumably most common first.
    ordered_c = counter2ordered_dict(c)
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        for k in ordered_c:
            fh.write(str(k) + " " + str(ordered_c[k]) + "\n")
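A usage sketch, assuming counter2ordered_dict orders entries by descending count; the output path is hypothetical:

from collections import Counter

word_counts = Counter(["a", "b", "a", "c", "a", "b"])
write_counter(word_counts, "out/vocab_counts.txt")  # first line would be "a 3"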
Example #6
import codecs


def write_dict(d, filename):
    # Write one "key value" pair per line, in the dict's iteration order.
    make_path(filename)
    with codecs.open(filename, mode="w", encoding="utf-8") as fh:
        for k in d:
            fh.write(str(k) + " " + str(d[k]) + "\n")
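A usage sketch with a hypothetical path:

vocab = {"hello": 0, "world": 1}
write_dict(vocab, "out/vocab.txt")  # writes "hello 0" and "world 1"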