Example #1
    def __call__(self, line):

        if not line:
            return None

        line = line.strip()
        if not line:
            return None

        fields = line.split('\t')
        # Skip the TSV header line ('id text title')
        if ' '.join(fields) == 'id text title':
            return ''

        assert len(fields) == 3, f"Wrong format, line: {line}"
        passId, rawText, title = fields

        # If a passage-ID filter is given, keep only whitelisted passages
        if fltPassIds is not None:
            if passId not in fltPassIds:
                return ''

        textLemmas, textUnlemm = textProcessor.procText(rawText)
        titleLemmas, titleUnlemm = textProcessor.procText(title)

        doc = {
            DOCID_FIELD: passId,
            TEXT_FIELD_NAME: titleLemmas + ' ' + textLemmas,
            TITLE_UNLEMM_FIELD_NAME: titleUnlemm,
            TEXT_UNLEMM_FIELD_NAME: textUnlemm,
            TEXT_RAW_FIELD_NAME: titleUnlemm + ' ' + rawText.lower()
        }

        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)
        return json.dumps(doc)
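The helper addRetokenizedField is not shown in these examples. Below is a minimal sketch of what it presumably does, assuming a HuggingFace-style BERT tokenizer; the function name, field names, and tokenizer choice are illustrative assumptions, not the project's actual implementation.

from transformers import AutoTokenizer  # one possible tokenizer library (assumption)

def add_retokenized_field(doc, src_field, dst_field, tokenizer):
    # Tokenize doc[src_field] with a BERT (sub-word) tokenizer and store the
    # space-joined tokens under doc[dst_field]; do nothing if the source field is absent.
    if src_field in doc:
        doc[dst_field] = ' '.join(tokenizer.tokenize(doc[src_field]))

# Usage sketch:
# bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# add_retokenized_field(doc, 'text_raw', 'text_bert_tok', bert_tokenizer)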
Example #2
    def __call__(self, line):

        if not line:
            return None
        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 4:
            return None

        did, url, title, body = fields

        # Split the URL into word-like tokens before lemmatization
        url_pretok = pretokenizeUrl(url)

        url_lemmas, url_unlemm = nlp.procText(url_pretok)
        title_lemmas, title_unlemm = nlp.procText(title)
        body_lemmas, body_unlemm = nlp.procText(body)

        text = title_lemmas + ' ' + body_lemmas
        text = text.strip()
        text_raw = (title.strip() + ' ' + body.strip()).lower()
        doc = {
            DOCID_FIELD: did,
            'url': url_lemmas,
            'url_unlemm': url_unlemm,
            TEXT_FIELD_NAME: text,
            TITLE_FIELD_NAME: title_lemmas,
            TITLE_UNLEMM_FIELD_NAME: title_unlemm,
            'body': body_unlemm,
            TEXT_RAW_FIELD_NAME: text_raw
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        return docStr
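pretokenizeUrl is likewise not shown. A minimal sketch, assuming the goal is simply to turn a URL into word-like tokens that the text processor can lemmatize; the regex and lower-casing here are illustrative assumptions.

import re

def pretokenize_url(url):
    # Replace every run of non-alphanumeric characters (://, /, ., -, ?, =, &, ...)
    # with a single space, so the URL becomes a sequence of word-like tokens.
    return re.sub(r'[^A-Za-z0-9]+', ' ', url).strip().lower()

# pretokenize_url('https://example.com/deep-learning?id=42')
# -> 'https example com deep learning id 42'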
Example #3
    def __call__(self, line):

        if not line:
            return None

        line = line[:maxDocSize]  # cut documents that are too long!
        fields = line.split('\t')
        if len(fields) != 2:
            return None

        pid, body = fields

        text, text_unlemm = self.nlp.procText(body)

        doc = {
            DOCID_FIELD: pid,
            TEXT_FIELD_NAME: text,
            TEXT_UNLEMM_FIELD_NAME: text_unlemm,
            TEXT_RAW_FIELD_NAME: body.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        return json.dumps(doc) + '\n'
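All three parsers are callable objects that map one input line to one JSON string (or None/'' for lines that should be skipped), which makes them easy to run in parallel. Below is a minimal driver sketch assuming a multiprocessing pool; the stub parser, file paths, and field names are illustrative only, not the project's actual conversion script.

import json
import multiprocessing

class StubParser:
    # Hypothetical stand-in for one of the __call__ parsers above.
    def __call__(self, line):
        line = line.strip()
        if not line:
            return None
        pid, body = line.split('\t', 1)
        return json.dumps({'DOCNO': pid, 'text': body.lower()}) + '\n'

def convert_collection(inp_path, out_path, parser, proc_qty=4):
    with open(inp_path) as inp_file, open(out_path, 'w') as out_file, \
         multiprocessing.Pool(proc_qty) as pool:
        # Each worker converts one TSV line to a JSONL record; None and ''
        # results (empty lines, headers, filtered IDs) are skipped.
        for doc_str in pool.imap(parser, inp_file, chunksize=64):
            if doc_str:
                out_file.write(doc_str)

# convert_collection('collection.tsv', 'collection.jsonl', StubParser())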
Example #4
    if len(fields) != 2:
        print('Misformatted line %d, ignoring:' % ln)
        print(line.replace('\t', '<field delimiter>'))
        continue

    did, query = fields

    query_lemmas, query_unlemm = nlp.procText(query)

    query_toks = query_lemmas.split()
    # Keep only queries that have at least minQueryTokQty lemmatized tokens
    if len(query_toks) >= minQueryTokQty:
        doc = {
            DOCID_FIELD: did,
            TEXT_FIELD_NAME: query_lemmas,
            TEXT_UNLEMM_FIELD_NAME: query_unlemm,
            TEXT_RAW_FIELD_NAME: query.lower()
        }
        addRetokenizedField(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME,
                            bertTokenizer)

        docStr = json.dumps(doc) + '\n'
        outFile.write(docStr)

    if ln % REPORT_QTY == 0:
        print('Processed %d queries' % ln)

print('Processed %d queries' % ln)

inpFile.close()
outFile.close()
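Example #4 is a fragment of the sequential query-conversion loop: inpFile, outFile, ln, line, and fields are defined outside the excerpt. A minimal sketch of the setup it presumably relies on, with assumed paths and loop details:

inpFile = open('queries.tsv')           # assumed input path
outFile = open('queries.jsonl', 'w')    # assumed output path

for ln, line in enumerate(inpFile, start=1):
    line = line.strip()
    if not line:
        continue
    fields = line.split('\t')
    # ... the body shown in Example #4 goes here ...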