class DocParseWorker:
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel, stopWords, keepOnlyAlphaNum=True, lowerCase=True)

    def __call__(self, line):

        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields

        title_lemmas, title_unlemm = self.nlp.procText(title)
        body_lemmas, body_unlemm = self.nlp.procText(body)

        text = title_lemmas + ' ' + body_lemmas
        text = text.strip()
        text_raw = (title.strip() + ' ' + body.strip()).lower()
        doc = {DOCID_FIELD: did,
               TEXT_FIELD_NAME: text,
               TITLE_UNLEMM_FIELD_NAME: title_unlemm,
               'body': body_unlemm,
               TEXT_RAW_FIELD_NAME: text_raw}
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        return docStr
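
For readers without the FlexNeuART helpers at hand, the snippet below is a hedged sketch of what addRetokenizedField plausibly does in these examples (judging from the explicit getRetokenized usage in Example #7 further down): if a tokenizer is supplied, the raw-text field is re-tokenized and the space-joined tokens are stored under the destination key. The function name and behavior here are assumptions, not the library's actual code.

# Hedged sketch of addRetokenizedField (assumed behavior, not the real helper):
# re-tokenize doc[src_field] with a (BERT) tokenizer and store the result
# as a space-separated string under dst_field; do nothing without a tokenizer.
def add_retokenized_field_sketch(doc, src_field, dst_field, tokenizer):
    if tokenizer is not None:
        doc[dst_field] = ' '.join(tokenizer.tokenize(doc[src_field]))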
Example #2
class PassParseWorker:
    def __init__(self, stopWords, spacyModel):
        self.nlp = SpacyTextParser(spacyModel,
                                   stopWords,
                                   keepOnlyAlphaNum=True,
                                   lowerCase=True)

    def __call__(self, line):

        if not line:
            return None

        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = self.nlp.procText(body)

        doc = {
            DOCID_FIELD: pid,
            TEXT_FIELD_NAME: text,
            TEXT_UNLEMM_FIELD_NAME: text_unlemm,
            TEXT_RAW_FIELD_NAME: body.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        return json.dumps(doc) + '\n'
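
Both worker classes expose __call__(line) returning either a JSONL string or None, which suggests they are meant to be mapped over input lines in parallel. Below is a minimal, hypothetical driver (not the original script) that streams a TSV file through such a worker with multiprocessing.Pool.imap and writes whatever it returns; it assumes the worker object, including its spaCy pipeline, is picklable.

# Hypothetical driver (a sketch, not the original script): feed TSV lines
# through a parse worker in parallel and write the JSONL strings it returns.
import multiprocessing

def run_parse_worker(worker, inp_path, out_path, num_procs=4):
    with open(inp_path) as inp, open(out_path, 'w') as out:
        with multiprocessing.Pool(num_procs) as pool:
            # imap preserves input order; workers return None for bad lines
            for doc_str in pool.imap(worker, inp, chunksize=64):
                if doc_str is not None:
                    out.write(doc_str)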
Example #3
class PassParseWorker:
    def __init__(self, stopWords, spacyModel):
        # Lower-cased, alphanumeric-only tokens; POS tagging is enabled
        self.textProcessor = SpacyTextParser(spacyModel,
                                             stopWords,
                                             keepOnlyAlphaNum=True,
                                             lowerCase=True,
                                             enablePOS=True)

    def __call__(self, line):

        if not line:
            return None

        line = line.strip()
        if not line:
            return None

        fields = line.split('\t')
        if ' '.join(fields) == 'id text title':
            return ''

        assert len(fields) == 3, f"Wrong format, line: {line}"
        passId, rawText, title = fields

        textLemmas, textUnlemm = self.textProcessor.procText(rawText)
        titleLemmas, titleUnlemm = self.textProcessor.procText(title)

        doc = {
            DOCID_FIELD: passId,
            TEXT_FIELD_NAME: titleLemmas + ' ' + textLemmas,
            TITLE_UNLEMM_FIELD_NAME: titleUnlemm,
            TEXT_UNLEMM_FIELD_NAME: textUnlemm,
            TEXT_RAW_FIELD_NAME: titleUnlemm + ' ' + rawText.lower()
        }

        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)
        return json.dumps(doc)
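
Throughout these examples, procText returns a pair of space-joined token strings: lemmatized and unlemmatized. As a rough, assumption-laden stand-in (SpacyTextParser's real implementation differs in detail), such a pair can be produced with a plain spaCy pipeline like this:

# Rough stand-in for the (lemmas, unlemmatized) pair returned by procText;
# this is a sketch under stated assumptions, not SpacyTextParser's code.
import spacy

def proc_text_sketch(nlp, text, stop_words):
    lemmas, unlemm = [], []
    for tok in nlp(text.lower()):
        if not (tok.is_alpha or tok.is_digit):   # keepOnlyAlphaNum analogue
            continue
        if tok.text in stop_words:               # drop stop words
            continue
        lemmas.append(tok.lemma_)
        unlemm.append(tok.text)
    return ' '.join(lemmas), ' '.join(unlemm)

# nlp = spacy.load('en_core_web_sm')  # example model name; any pipeline works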
Example #4
# Input file is a TSV file
ln = 0
for line in inpFile:
    ln += 1
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    query_lemmas, query_unlemm = nlp.procText(query)

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {
            DOCID_FIELD: did,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)
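
Each converted query ends up as one JSON object per output line. A generic way to read such a JSONL file back for spot-checking (a sketch; the file name is a placeholder and the field constants come from the surrounding scripts):

# Generic JSONL reader for inspecting the converted output (sketch only).
import json

def iter_jsonl(path):
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# for entry in iter_jsonl('queries.jsonl'):
#     print(entry[DOCID_FIELD], entry[TEXT_FIELD_NAME])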
Example #5
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    for doc in tqdm(inp_data, desc='converting documents'):
        e = {
            DOCID_FIELD: doc[DOCID_FIELD],
            TEXT_RAW_FIELD_NAME: doc[TEXT_RAW_FIELD_NAME]
        }

        title_lemmas, _ = nlp.procText(doc[TITLE_FIELD_NAME])
        author_lemmas, _ = nlp.procText(doc[AUTHOR_FIELD_NAME])
        venue_lemmas, _ = nlp.procText(doc[VENUE_FIELD_NAME])
        body_lemmas, _ = nlp.procText(doc[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = ' '.join(
            [title_lemmas, author_lemmas, venue_lemmas, body_lemmas])
        e[TITLE_FIELD_NAME] = title_lemmas
        e[AUTHOR_FIELD_NAME] = author_lemmas
        e[VENUE_FIELD_NAME] = venue_lemmas
        e[BODY_FIED_NAME] = body_lemmas

        addRetokenizedField(e, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bert_tokenizer)

        outf.write(json.dumps(e) + '\n')
Example #6
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    qid = 0
    for query in tqdm(inp_data, desc='converting queries'):
        # Cranfield query IDs are all wrong and don't match QRELs:
        # in QRELs, a query ID is simply the query's ordinal number,
        # so we generate IDs sequentially here
        qid += 1

        e = {
            DOCID_FIELD: str(qid),
            TEXT_RAW_FIELD_NAME: query[TEXT_RAW_FIELD_NAME]
        }

        body_lemmas, body_unlemm = nlp.procText(query[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = body_lemmas
        e[BODY_FIED_NAME] = body_unlemm

        addRetokenizedField(e, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bert_tokenizer)

        outf.write(json.dumps(e) + '\n')
Example #7
    for fn in bitext_fields:
        biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
        biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
for recStr in SimpleXmlRecIterator(inpFileName, 'document'):
    ln += 1
    try:
        rec = procYahooAnswersRecord(recStr)
        if len(rec.answerList) == 0:  # Ignore questions without answers
            continue

        question = (rec.subject + ' ' + rec.content).strip()
        qid = rec.uri

        question_lemmas, question_unlemm = nlp.procText(question)

        question = question.lower()  # lower-case the raw text only after NLP processing

        question_bert_tok = None
        if bertTokenizer:
            question_bert_tok = getRetokenized(bertTokenizer, question)

        doc = {DOCID_FIELD: qid,
               TEXT_FIELD_NAME: question_lemmas,
               TEXT_UNLEMM_FIELD_NAME: question_unlemm,
               TEXT_RAW_FIELD_NAME: question}
        if question_bert_tok is not None:
            doc[TEXT_BERT_TOKENIZED_NAME] = question_bert_tok
        docStr = json.dumps(doc) + '\n'
        dataQuestFile.write(docStr)
    except Exception as e:
        # Report and skip records that fail to parse
        print(f'Error parsing record #{ln}: {e}')
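
The per-field question/answer files opened at the top of this example are parallel ("bitext") outputs: the same field is written on matching line numbers in the question file and in the answer file. A hedged sketch of that write step (the original script's exact code is not shown in this excerpt):

# Hedged sketch of writing one aligned bitext pair (not the original code):
# line i of the question file corresponds to line i of the answer file.
def write_bitext_pair(bi_quest_files, bi_answ_files, field, quest_text, answ_text):
    bi_quest_files[field].write(quest_text + '\n')
    bi_answ_files[field].write(answ_text + '\n')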
Example #8
biQuestFiles = {}
biAnswFiles = {}

if outBitextDir:
    if not os.path.exists(outBitextDir):
        os.makedirs(outBitextDir)

    for fn in bitext_fields:
        biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
        biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

for query_idx, fields in tqdm.tqdm(enumerate(dpr_json_reader(inpFile))):
    query = fields["question"]
    answer_list_lc = [s.lower() for s in fields["answers"]]
    query_lemmas, query_unlemm = nlp.procText(query)
    query_bert_tok = None

    query_toks = query_lemmas.split()
    if len(query_toks) >= minQueryTokQty:
        doc = {
            DOCID_FIELD: query_idx,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bertTokenizer)
        if TEXT_BERT_TOKENIZED_NAME in doc:
            query_bert_tok = doc[TEXT_BERT_TOKENIZED_NAME]

        docStr = json.dumps(doc) + '\n'
Example #9
doc_id_prev = None
docid_to_preds = {}
predicted_queries = []

for doc_id, predicted_queries_partial in tqdm(zip(
        FileWrapper(args.doc_ids_path), FileWrapper(args.predictions_path)),
                                              desc='reading predictions'):
    doc_id = doc_id.strip()
    if doc_id_prev is not None and doc_id_prev != doc_id:
        if predicted_queries and doc_id_prev is not None:
            docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()
        predicted_queries = []

    doc_id_prev = doc_id
    predicted_queries.append(predicted_queries_partial)

# Not forgetting about the last batch
if predicted_queries and doc_id_prev is not None:
    docid_to_preds[doc_id_prev] = ' '.join(predicted_queries)

with FileWrapper(args.output, 'w') as outf:
    for doce in tqdm(jsonlGen(args.input), desc='adding doc2query fields'):
        doc_id = doce[DOCID_FIELD]
        if doc_id in docid_to_preds:
            text, text_unlemm = nlp.procText(docid_to_preds[doc_id])
            doce[TEXT_FIELD_NAME] = doce[TEXT_FIELD_NAME] + ' ' + text
            doce[DOC2QUERY_FIELD_TEXT] = text
            doce[DOC2QUERY_FIELD_TEXT_UNLEMM] = text_unlemm
        else:
            print(f'WARNING: no predictions for {doc_id}')

        outf.write(json.dumps(doce) + '\n')
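
The grouping logic in the first loop of this example can be repackaged as a small, self-contained helper for illustration (same idea; whitespace is stripped consistently here):

# Restatement of the grouping loop above: consecutive prediction lines
# that share a doc ID are concatenated into one string per document.
def group_predictions(doc_ids, predictions):
    grouped, buf, prev = {}, [], None
    for doc_id, pred in zip(doc_ids, predictions):
        doc_id = doc_id.strip()
        if prev is not None and prev != doc_id:
            grouped[prev] = ' '.join(buf).strip()
            buf = []
        prev = doc_id
        buf.append(pred)
    # Not forgetting about the last batch
    if buf and prev is not None:
        grouped[prev] = ' '.join(buf).strip()
    return grouped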