'body': body_unlemm,
            TEXT_RAW_FIELD_NAME: text_raw
        }
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME,
                              TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

        doc_str = json.dumps(doc) + '\n'
        return doc_str


proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)
ln = 0
for doc_str in pool.imap(DocParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1
    if doc_str is not None:
        out_file.write(doc_str)
    else:
        # print('Misformatted line %d ignoring:' % ln)
        # print(line.replace('\t', '<field delimiter>'))
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Processed %d docs' % ln)

print('Processed %d docs' % ln)

inp_file.close()
out_file.close()
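
The fragment above assembles one JSON document per input record and calls add_retokenized_field to attach a BERT-tokenized copy of the raw-text field. Judging by the expanded pattern used in the answer-processing example further down (compute get_retokenized and assign the result to doc[TEXT_BERT_TOKENIZED_NAME]), a minimal equivalent could look like the sketch below. This is an illustrative stand-in, not the project's actual helper, and it assumes the get_retokenized function used elsewhere in these examples (a sketch of that helper follows a later example).

def add_retokenized_field_sketch(doc, src_field, dst_field, tokenizer):
    # If a tokenizer is supplied, re-tokenize the source field and store the
    # whitespace-joined token string under the destination field name.
    if tokenizer is not None and src_field in doc:
        doc[dst_field] = get_retokenized(tokenizer, doc[src_field])
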
        print(f"Ignoring query found in the specified query files. Raw query: '{query}', lemmatized query: '{query_lemmas}'")

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:

        qrelList.append(QrelEntry(queryId=qid, docId=did, relGrade=1))

        # Entries are sorted by the query ID
        if prevQid != qid:
            doc = {DOCID_FIELD: qid,
                   TEXT_FIELD_NAME: query_lemmas,
                   TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                   TEXT_RAW_FIELD_NAME: query.lower()}
            addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

            docStr = json.dumps(doc) + '\n'
            outFileQueries.write(docStr)

    prevQid = qid

    if ln % REPORT_QTY == 0:
        print('Processed %d input lines' % ln)

print('Processed %d input lines' % ln)

writeQrels(qrelList, outFileQrelsName)

inpFile.close()
outFileQueries.close()
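
writeQrels and QrelEntry are not shown in this excerpt. Assuming QrelEntry simply exposes the queryId, docId, and relGrade attributes it is constructed with above, and assuming the standard TREC qrel line format 'queryId 0 docId relGrade' that the other examples produce via gen_qrel_str or explicit f-strings, a minimal stand-in for the writer could be:

def writeQrelsSketch(qrelList, fileName):
    # One TREC-style qrel line per entry: <queryId> 0 <docId> <relGrade>.
    with open(fileName, 'w') as f:
        for e in qrelList:
            f.write(f'{e.queryId} 0 {e.docId} {e.relGrade}\n')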

Example #3
    if len(fields) != 2:
        print('Misformatted line %d ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    query_lemmas, query_unlemm = nlp.procText(query)

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {
            DOCID_FIELD: did,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)

    if ln % REPORT_QTY == 0:
        print('Processed %d queries' % ln)

print('Processed %d queries' % ln)

inpFile.close()
outFile.close()
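
nlp.procText(query) is expected to return a pair of whitespace-joined token strings: a lemmatized and a non-lemmatized version of the query. The parser object behind nlp is not part of this excerpt; purely as an illustration, a spaCy-based stand-in might look like the following (the real parser may filter tokens differently, e.g. drop stopwords):

import spacy

class SimpleTextProcSketch:
    def __init__(self, model_name='en_core_web_sm'):
        # Hypothetical stand-in for the text parser used in these examples.
        self.nlp = spacy.load(model_name, disable=['ner'])

    def procText(self, text):
        doc = self.nlp(text)
        lemmas = ' '.join(t.lemma_.lower() for t in doc if not t.is_punct)
        unlemm = ' '.join(t.text.lower() for t in doc if not t.is_punct)
        return lemmas, unlemm
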
            # Doing it after lower-casing
            answ_bert_tok = None
            if bertTokenizer:
                answ_bert_tok = getRetokenized(bertTokenizer, answ)

            doc = {DOCID_FIELD: aid,
                   TEXT_FIELD_NAME: answ_lemmas,
                   TEXT_UNLEMM_FIELD_NAME: answ_unlemm,
                   TEXT_RAW_FIELD_NAME: answ}

            if answ_bert_tok is not None:
                doc[TEXT_BERT_TOKENIZED_NAME] = answ_bert_tok

            docStr = json.dumps(doc) + '\n'
            dataAnswFile.write(docStr)

            relGrade = MAX_RELEV_GRADE - int(i != rec.bestAnswerId)
            qrelFile.write(genQrelStr(qid, aid, relGrade) + '\n')

            if biQuestFiles and biAnswFiles:
                biQuestFiles[TEXT_FIELD_NAME].write(question_lemmas + '\n')
                biQuestFiles[TEXT_UNLEMM_FIELD_NAME].write(question_lemmas + '\n')

                biAnswFiles[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
                biAnswFiles[TEXT_UNLEMM_FIELD_NAME].write(answ_lemmas + '\n')

                if bertTokenizer is not None:
                    biQuestFiles[TEXT_BERT_TOKENIZED_NAME].write(question_bert_tok + '\n')
                    biAnswFiles[TEXT_BERT_TOKENIZED_NAME].write(answ_bert_tok + '\n')
        return json.dumps(doc)


inp_file = FileWrapper(args.input_file)
out_file = FileWrapper(args.out_file, 'w')

proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)
ln = 0
ln_ign = 0
for doc_str in pool.imap(PassParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1

    if doc_str is not None:
        if doc_str:
            out_file.write(doc_str + '\n')
        else:
            ln_ign += 1
    else:
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Read %d passages, processed %d passages' % (ln, ln - ln_ign))

print('Processed %d passages' % ln)

inp_file.close()
out_file.close()
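
The loop above relies on a three-way convention for the worker's return value: a JSON string for a passage that should be written, an empty string for a line that is valid but deliberately skipped, and None for a misformatted line. A schematic callable compatible with multiprocessing.Pool.imap could look like the sketch below; the tab-separated input layout and the field names are assumptions, not the real worker's code.

import json

class PassParseWorkerSketch:
    def __call__(self, line):
        line = line.strip()
        if not line:
            return ''                  # valid but empty: counted as ignored
        fields = line.split('\t')      # assumed TSV layout: <passage id> <text>
        if len(fields) != 2:
            return None                # tells the caller the line is misformatted
        pid, text = fields
        # Placeholder field names; the real worker builds a richer document.
        return json.dumps({'id': pid, 'text': text})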

    query_bert_tok = None

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {
            DOCID_FIELD: query_idx,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]

        docStr = json.dumps(doc) + '\n'
        outQueries.write(docStr)

        for entry in fields["positive_ctxs"]:
            psgId = get_passage_id(entry)
            outQrels.write(f'{query_idx} 0 {psgId} 1\n')
            if biQuestFiles and biAnswFiles:
                title_text = entry["title"]
                if title_text:
                    _, title_unlemm = nlp.procText(title_text)
                    biQuestFiles[TITLE_UNLEMM_FIELD_NAME].write(query_unlemm + '\n')
                    biAnswFiles[TITLE_UNLEMM_FIELD_NAME].write(title_unlemm + '\n')

                for ctx_sent in sentSplit(entry["text"]):
                    ctx_sent = str(ctx_sent)
                    ctx_sent_lc = ctx_sent.lower()
                    # This can sometimes be a false positive, b/c it doesn't
    query_toks = query_lemmas.split()
    if len(query_toks) >= min_query_tok_qty:
        doc = {
            DOCID_FIELD: query_idx,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query_orig,
            ANSWER_LIST_FIELD_NAME: answer_list
        }
        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME,
                              TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]

        doc_str = json.dumps(doc) + '\n'
        out_queries.write(doc_str)

        for entry in fields["positive_ctxs"]:
            psg_id = get_passage_id(entry)
            add_qrel_entry(qrel_dict=glob_qrel_dict,
                           qid=query_idx,
                           did=psg_id,
                           grade=1)
            if bi_quest_files and bi_answ_files:
                title_text = entry["title"]
                if title_text:
                    _, title_unlemm = nlp.proc_text(title_text)
                    bi_quest_files[TITLE_UNLEMM_FIELD_NAME].write(
                        query_unlemm + '\n')
                    bi_answ_files[TITLE_UNLEMM_FIELD_NAME].write(title_unlemm +
                                                                 '\n')
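
add_qrel_entry is not shown in this excerpt. It presumably accumulates entries in glob_qrel_dict keyed by the (query id, document id) pair so that repeated positive contexts do not yield duplicate qrel lines; the merging policy below (keep the higher grade) is a guess, and QrelEntry is assumed to have the query_id, doc_id, and rel_grade fields used in a later example.

def add_qrel_entry_sketch(qrel_dict, qid, did, grade):
    # Keep at most one entry per (query, document) pair, preferring the higher grade.
    key = (qid, did)
    prev = qrel_dict.get(key)
    if prev is None or prev.rel_grade < grade:
        qrel_dict[key] = QrelEntry(query_id=qid, doc_id=did, rel_grade=grade)
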
Example #8
                            bertTokenizer)
        return json.dumps(doc)


inpFile = FileWrapper(args.input_file)
outFile = FileWrapper(args.out_file, 'w')

proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)
ln = 0
ln_ign = 0
for docStr in pool.imap(PassParseWorker(), inpFile, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1

    if docStr is not None:
        if docStr:
            # Non-empty result: a JSON passage entry to keep
            outFile.write(docStr + '\n')
        else:
            # Empty string: the worker deliberately skipped this passage
            ln_ign += 1
    else:
        # None signals a misformatted input line
        print('Ignoring misformatted line %d' % ln)

    if ln % REPORT_QTY == 0:
        print('Read %d passages, processed %d passages' % (ln, ln - ln_ign))

print('Processed %d passages' % ln)

inpFile.close()
outFile.close()
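
get_retokenized(bertTokenizer, text), used by several of these examples, most likely just runs the (already lower-cased) text through a HuggingFace-style tokenizer and joins the resulting sub-word pieces with spaces. A one-line sketch under that assumption:

def get_retokenized_sketch(tokenizer, text):
    # Join sub-word tokens (e.g. WordPiece pieces) into one space-separated string.
    return ' '.join(tokenizer.tokenize(text))
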
    doc = json.loads(line)
    textRaw = doc[TEXT_RAW_FIELD_NAME]

    docSents = []

    for oneSent in nlp(textRaw).sents:
        oneSent = replaceCharsNL(str(oneSent)).strip()
        if args.lower_case:
            oneSent = oneSent.lower()
        if oneSent:
            docSents.append(oneSent)

    # Work hard to not write empty documents, b/c it'll upset the pregenerator
    if docSents:
        for oneSent in docSents:
            outFile.write(oneSent + '\n')
        outFile.write('\n')

    docQty += 1
    setQty += 1
    if docQty % REPORT_QTY == 0:
        print('Processed %d docs' % docQty)

    if setQty >= args.max_set_size:
        setQty = 0
        setId += 1
        print('Starting set %d' % setId)
        outFile.close()
        outFile = FileWrapper(outFileName(args.output_pref, setId), 'w')

print('Processed %d docs' % docQty)
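
Iterating over nlp(textRaw).sents requires a spaCy pipeline that sets sentence boundaries. How this script constructs nlp is not shown; one lightweight possibility (an assumption, the script may instead load a full model) is a blank pipeline with the rule-based sentencizer:

import spacy

nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')   # rule-based sentence boundary detection (spaCy v3 API)

for sent in nlp('First sentence. Second sentence.').sents:
    print(sent.text)
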
Example #10
print('The list of queries to ignore has %d entries' % len(ignore_queries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

out_file_queries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON),
                               'w')

read_qty = 0
wrote_qty = 0

for e in read_queries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    read_qty += 1
    if TEXT_FIELD_NAME not in e:
        continue

    text = e[TEXT_FIELD_NAME]
    if text in ignore_queries:
        print(
            f"Ignoring query found in the specified query files: {text}"
        )
        continue

    wrote_qty += 1
    out_file_queries.write(json.dumps(e) + '\n')

ignored_qty = read_qty - wrote_qty
print(f'Wrote {wrote_qty} queries, ignored {ignored_qty} queries')

out_file_queries.close()
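
read_queries is the counterpart of the writer above: queries are stored one JSON object per line, so a minimal reader consistent with that format (the real helper may go through FileWrapper and perform extra validation) is:

import json

def read_queries_sketch(file_name):
    with open(file_name) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)
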
            answ_bert_tok = None
            if bert_tokenizer:
                answ_bert_tok = get_retokenized(bert_tokenizer, answ_lc)

            doc = {
                DOCID_FIELD: aid,
                TEXT_FIELD_NAME: answ_lemmas,
                TEXT_UNLEMM_FIELD_NAME: answ_unlemm,
                TEXT_RAW_FIELD_NAME: answ_orig
            }

            if answ_bert_tok is not None:
                doc[TEXT_BERT_TOKENIZED_NAME] = answ_bert_tok

            doc_str = json.dumps(doc) + '\n'
            data_answ_file.write(doc_str)

            rel_grade = MAX_RELEV_GRADE - int(i != rec.best_answer_id)
            qrel_file.write(gen_qrel_str(qid, aid, rel_grade) + '\n')

            if bi_quest_files and bi_answ_files:
                bi_quest_files[TEXT_FIELD_NAME].write(question_lemmas + '\n')
                bi_quest_files[TEXT_UNLEMM_FIELD_NAME].write(question_lemmas +
                                                             '\n')

                bi_answ_files[TEXT_FIELD_NAME].write(answ_lemmas + '\n')
                bi_answ_files[TEXT_UNLEMM_FIELD_NAME].write(answ_lemmas + '\n')

                if bert_tokenizer is not None:
                    bi_quest_files[TEXT_BERT_TOKENIZED_NAME].write(
                        question_bert_tok + '\n')
        qrel_list.append(QrelEntry(query_id=qid, doc_id=did, rel_grade=1))

        # Entries are sorted by the query ID
        if prev_qid != qid:
            doc = {
                DOCID_FIELD: qid,
                TEXT_FIELD_NAME: query_lemmas,
                TEXT_UNLEMM_FIELD_NAME: query_unlemm,
                TEXT_RAW_FIELD_NAME: query_orig
            }
            add_retokenized_field(doc, TEXT_RAW_FIELD_NAME,
                                  TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)

            doc_str = json.dumps(doc) + '\n'
            out_file_queries.write(doc_str)
            gen_query_qty += 1
            if gen_query_qty >= max_query_qty:
                break

    prev_qid = qid

    if ln % REPORT_QTY == 0:
        print('Processed %d input lines' % ln)

print('Processed %d input lines' % ln)

write_qrels(qrel_list, out_file_qrels_name)

inp_file.close()
out_file_queries.close()
    doc = json.loads(line)
    text_raw = doc[TEXT_RAW_FIELD_NAME]

    doc_sents = []

    for one_sent in nlp(text_raw).sents:
        one_sent = replace_chars_nl(str(one_sent)).strip()
        if args.lower_case:
            one_sent = one_sent.lower()
        if one_sent:
            doc_sents.append(one_sent)

    # Work hard to not write empty documents, b/c it'll upset the pregenerator
    if doc_sents:
        for one_sent in doc_sents:
            out_file.write(one_sent + '\n')
        out_file.write('\n')

    doc_qty += 1
    set_qty += 1
    if doc_qty % REPORT_QTY == 0:
        print('Processed %d docs' % doc_qty)

    if set_qty >= args.max_set_size:
        set_qty = 0
        set_id += 1
        print('Starting set %d' % set_id)
        out_file.close()
        out_file = FileWrapper(out_file_name(args.output_pref, set_id), 'w')

print('Processed %d docs' % doc_qty)
Example #14
print('The list of queries to ignore has %d entries' % len(ignoreQueries))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

readQty = 0
wroteQty = 0

for e in readQueries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    readQty += 1
    if TEXT_FIELD_NAME not in e:
        continue

    text = e[TEXT_FIELD_NAME]
    if text in ignoreQueries:
        print(f"Ignoring query found in the specified query files: {text}")
        continue

    wroteQty += 1
    outFileQueries.write(json.dumps(e) + '\n')


ignoredQty = readQty - wroteQty
print(f'Wrote {wroteQty} queries, ignored {ignoredQty} queries')

outFileQueries.close()
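
Finally, FileWrapper appears throughout these examples both as an iterable source of input lines and as a writable sink with write() and close(). A minimal stand-in consistent with that usage is sketched below; the transparent gzip handling is an assumption, not a documented feature.

import gzip

class FileWrapperSketch:
    def __init__(self, file_name, flags='r'):
        # Open .gz files transparently in text mode, plain files otherwise.
        if file_name.endswith('.gz'):
            self.fd = gzip.open(file_name, flags + 't')
        else:
            self.fd = open(file_name, flags)

    def __iter__(self):
        return iter(self.fd)

    def write(self, s):
        self.fd.write(s)

    def close(self):
        self.fd.close()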