Example #1
def readingFunctionNQ(input):
    finp = FileWrapper(input)

    seenIds = set()

    for line in finp:
        root = json.loads(line)
        doc = root['document_html'].encode(DEFAULT_ENCODING)
        questionText = root["question_text"]
        answerList = []
        qid = root['example_id']
        if qid in seenIds:
            raise Exception(
                f'Data inconsistency, repeating example/question ID: {qid}')

        seenIds.add(qid)
        for oneAnnot in root['annotations']:
            for shortAnsw in oneAnnot['short_answers']:
                oneAnsw = doc[shortAnsw['start_byte']:
                              shortAnsw['end_byte']].decode(DEFAULT_ENCODING)
                if len(oneAnsw.split()) <= MAX_ANSWER_TOK_QTY:
                    answerList.append(oneAnsw)

        if answerList:
            yield qid, questionText, answerList
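
A minimal usage sketch, not part of the original snippet: the generator above yields (question ID, question text, answer list) tuples, so it can be consumed directly. The input path below is a placeholder, and FileWrapper plus the constants are assumed to be imported from the surrounding toolkit.

# Hypothetical usage of the generator above; the path is illustrative only.
for qid, quest_text, answ_list in readingFunctionNQ('nq-sample.jsonl.gz'):
    print(qid, quest_text, answ_list[:3])
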
Example #2
def write_qrels_files(qrels, query_id_to_partition, dst_dir, partitions_names):
    files = [FileWrapper(os.path.join(dst_dir, name, QREL_FILE), "w") for name in partitions_names]

    for qrel in qrels:
        query_id = int(qrel.queryId)
        partition_id = query_id_to_partition[query_id]
        files[partition_id].write(qrelEntry2Str(qrel))
        files[partition_id].write('\n')

    for file in files:
        file.close()
Example #3
def write_queries_files(queries, query_id_to_partition, dst_dir, partitions_names):
    files = [FileWrapper(os.path.join(dst_dir, name, QUESTION_FILE_JSON), "w") for name in partitions_names]

    for query in queries:
        query_id = query[DOCID_FIELD]
        partition_id = query_id_to_partition[query_id]
        files[partition_id].write(json.dumps(query))
        files[partition_id].write('\n')

    for file in files:
        file.close()
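
The two writers above assume that query_id_to_partition maps every query ID to an index into partitions_names; the real mapping is produced by build_query_id_to_partition in Example #22. A toy, round-robin stand-in is sketched below for illustration only.

# Illustrative stand-in for build_query_id_to_partition: round-robin assignment.
def toy_query_id_to_partition(query_ids, num_partitions):
    return {qid: i % num_partitions for i, qid in enumerate(query_ids)}
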
Example #4
def read_run_dict(file_name):
    """Read a run file in the form of a dictionary where keys are query IDs.

    :param file_name: run file name
    :return:
    """
    result = {}
    with FileWrapper(file_name) as f:
        for ln, line in enumerate(tqdm(f, desc='loading run (by line)', leave=False)):
            line = line.strip()
            if not line:
                continue
            fld = line.split()
            if len(fld) != 6:
                ln += 1
                raise Exception(
                    f'Invalid line {ln} in run file {file_name}: expected 6 white-space separated fields, but got: {line}')

            qid, _, docid, rank, score, _ = fld
            result.setdefault(qid, {})[docid] = float(score)

    return result
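
A hypothetical usage sketch for read_run_dict: the returned dictionary maps a query ID to a dictionary from document IDs to scores. The run-file path below is a placeholder.

sample_run = read_run_dict('sample_run.txt')  # placeholder path
for qid, doc2score in sample_run.items():
    best_doc = max(doc2score, key=doc2score.get)
    print(qid, best_doc, doc2score[best_doc])
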
def read_cranfield_data(file):
    res = []
    curr_entry = None
    curr_text = None
    all_text = None
    prev_field = None
    with FileWrapper(file) as f:
        for line in f:
            if line.startswith('.I '):
                if curr_entry:
                    assert curr_text is not None
                    curr_entry[FIELD_MAP[prev_field]] = curr_text.strip()
                    assert all_text is not None
                    curr_entry[TEXT_RAW_FIELD_NAME] = all_text
                    res.append(curr_entry)
                curr_entry = {DOCID_FIELD: line[3:].strip()}
                curr_text = ''
                all_text = ''
                prev_field = None
            else:
                all_text += line
                line_stripped = line.strip()
                if line_stripped in FIELD_MAP:
                    if prev_field is not None:
                        assert curr_text is not None
                        curr_entry[FIELD_MAP[prev_field]] = curr_text.strip()
                    prev_field = line_stripped
                    curr_text = ''
                else:
                    curr_text += line

    if curr_entry:
        assert curr_text is not None
        curr_entry[FIELD_MAP[prev_field]] = curr_text.strip()
        assert all_text is not None
        curr_entry[TEXT_RAW_FIELD_NAME] = all_text
        res.append(curr_entry)

    return res
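
A hypothetical usage sketch for read_cranfield_data: each parsed entry is a dictionary containing DOCID_FIELD, TEXT_RAW_FIELD_NAME with the full raw record, and one key per Cranfield field mapped through FIELD_MAP. The collection file path is a placeholder.

cran_docs = read_cranfield_data('cran.all.1400')  # placeholder path
if cran_docs:
    print(len(cran_docs), cran_docs[0][DOCID_FIELD])
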
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)
    bitext_fields.append(TEXT_BERT_TOKENIZED_NAME)

if not os.path.exists(outMainDir):
    os.makedirs(outMainDir)

biQuestFiles = {}
biAnswFiles = {}

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True, enablePOS=False)

dataQuestFile = open(os.path.join(outMainDir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
dataAnswFile = FileWrapper(os.path.join(outMainDir, ANSWER_FILE_JSON), flags='w')
qrelFile = open(os.path.join(outMainDir, QREL_FILE), 'w')

if outBitextDir:
    if not os.path.exists(outBitextDir):
        os.makedirs(outBitextDir)

    for fn in bitext_fields:
        biQuestFiles[fn] = open(os.path.join(outBitextDir, BITEXT_QUESTION_PREFIX + fn), 'w')
        biAnswFiles[fn] = open(os.path.join(outBitextDir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
for recStr in SimpleXmlRecIterator(inpFileName, 'document'):
    ln += 1
    try:
        rec = procYahooAnswersRecord(recStr)
                        metavar=OUT_BITEXT_PATH_OPT_META,
                        help=OUT_BITEXT_PATH_OPT_HELP,
                        type=str,
                        default=None)
    parser.add_argument('--' + BERT_TOK_OPT,
                        action='store_true',
                        help=BERT_TOK_OPT_HELP)

    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)
inp_file = FileWrapper(args.input)
out_queries = FileWrapper(args.output_queries, 'w')
min_query_tok_qty = args.min_query_token_qty
use_precomputed_negatives = args.use_precomputed_negatives
stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
out_bitext_dir = arg_vars[OUT_BITEXT_PATH_OPT]
nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
                      lower_case=True)
sent_split = Sentencizer(SPACY_MODEL)

bitext_fields = [
    TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME
]
# Default: the number of cores minus one (one core is left for the spawning process)
parser.add_argument('--proc_qty',
                    metavar='# of processes',
                    help='# of NLP processes to spawn',
                    type=int,
                    default=multiprocessing.cpu_count() - 1)
parser.add_argument('--' + BERT_TOK_OPT,
                    action='store_true',
                    help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inp_file = FileWrapper(args.input)
out_file = FileWrapper(args.output, 'w')
max_doc_size = args.max_doc_size

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
Example #9
                    type=str,
                    required=True)
parser.add_argument('--min_query_token_qty',
                    type=int,
                    default=0,
                    metavar='min # of query tokens',
                    help='ignore queries that have fewer tokens than this')
parser.add_argument('--' + BERT_TOK_OPT,
                    action='store_true',
                    help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)
outFile = FileWrapper(args.output, 'w')
minQueryTokQty = args.min_query_token_qty

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)
nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)
                    type=str, required=True)
parser.add_argument('--filter_query_dir', metavar='filtering query dir',
                    default=[],
                    help=f'all queries found in {QUESTION_FILE_JSON} files from these directories are ignored',
                    nargs='*')
parser.add_argument('--out_dir', metavar='output directory', help='output directory',
                    type=str, required=True)
parser.add_argument('--min_query_token_qty', type=int, default=0,
                    metavar='min # of query tokens', help='ignore queries that have fewer tokens than this')
parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)

ignoreQueries = set()

for qfile_dir in args.filter_query_dir:
    qfile_name = os.path.join(qfile_dir, QUESTION_FILE_JSON)
    for e in readQueries(qfile_name):
        ignoreQueries.add(e[TEXT_FIELD_NAME])
    print('Read queries from: ' + qfile_name)

print('A list of queries to ignore has %d entries' % (len(ignoreQueries)))


if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)
Example #11
ignore_queries = set()

for qfile_dir in args.filter_query_dir:
    qfile_name = os.path.join(qfile_dir, QUESTION_FILE_JSON)
    for e in read_queries(qfile_name):
        if TEXT_FIELD_NAME not in e:
            continue
        ignore_queries.add(e[TEXT_FIELD_NAME])
    print('Read queries from: ' + qfile_name)

print('A list of queries to ignore has %d entries' % (len(ignore_queries)))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

out_file_queries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON),
                               'w')

read_qty = 0
wrote_qty = 0

for e in read_queries(os.path.join(args.input_dir, QUESTION_FILE_JSON)):
    read_qty += 1
    if TEXT_FIELD_NAME not in e:
        continue

    text = e[TEXT_FIELD_NAME]
    if text in ignore_queries:
        print(
            f'Ignoring query found in the specified query files: {text}'
        )
        continue
    os.makedirs(out_main_dir)

bi_quest_files = {}
bi_answ_files = {}

stop_words = read_stop_words(STOPWORD_FILE, lower_case=True)
print(stop_words)
nlp = SpacyTextParser(SPACY_MODEL,
                      stop_words,
                      keep_only_alpha_num=True,
                      lower_case=True,
                      enable_pos=False)

data_quest_file = open(os.path.join(out_main_dir, QUESTION_FILE_JSON), 'w')
# File wrapper can handle output gz files
data_answ_file = FileWrapper(os.path.join(out_main_dir, ANSWER_FILE_JSON),
                             flags='w')
qrel_file = open(os.path.join(out_main_dir, QREL_FILE), 'w')

if out_bitext_dir:
    if not os.path.exists(out_bitext_dir):
        os.makedirs(out_bitext_dir)

    for fn in bitext_fields:
        bi_quest_files[fn] = open(
            os.path.join(out_bitext_dir, BITEXT_QUESTION_PREFIX + fn), 'w')
        bi_answ_files[fn] = open(
            os.path.join(out_bitext_dir, BITEXT_ANSWER_PREFIX + fn), 'w')

ln = 0
for rec_str in SimpleXmlRecIterator(inp_file_name, 'document'):
    ln += 1
                    type=str, required=True)
parser.add_argument('--max_set_size', metavar='max # of documents in a set',
                    default=1_000_000,
                    help='the maximum size of a set (in documents)',
                    type=int)
parser.add_argument('--lower_case', help='lowercase text',
                    action='store_true', default=False)

args = parser.parse_args()
print(args)

doc_qty = 0
set_qty = 0
set_id = 0

inp_file = FileWrapper(args.input)

nlp = SpacyTextParser(SPACY_MODEL, [], sent_split=True)


def out_file_name(pref, num):
    return pref + str(num) + '.txt'


print('Starting set 0')
out_file = FileWrapper(out_file_name(args.output_pref, set_id), 'w')

for line in inp_file:
    doc = json.loads(line)
    text_raw = doc[TEXT_RAW_FIELD_NAME]
Example #14
args = parser.parse_args()
print(args)

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
print(stopWords)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

doc_id_prev = None
predicted_queries = []

for doc_id, predicted_queries_partial in tqdm(zip(
        FileWrapper(args.doc_ids_path), FileWrapper(args.predictions_path)),
                                              desc='reading predictions'):
    doc_id = doc_id.strip()
    if doc_id_prev is not None and doc_id_prev != doc_id:
        if predicted_queries and doc_id_prev is not None:
            docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()
        predicted_queries = []

    doc_id_prev = doc_id
    predicted_queries.append(predicted_queries_partial)

# Don't forget the last batch
if predicted_queries and doc_id_prev is not None:
    docid_to_preds[doc_id_prev] = ' '.join(predicted_queries)

with FileWrapper(args.output, 'w') as outf:
Example #15
                return ''

        text_lemmas, text_unlemm = text_processor.proc_text(raw_text)
        title_lemmas, title_unlemm = text_processor.proc_text(title)

        doc = {DOCID_FIELD: pass_id,
               TEXT_FIELD_NAME: title_lemmas + ' ' + text_lemmas,
               TITLE_UNLEMM_FIELD_NAME: title_unlemm,
               TEXT_UNLEMM_FIELD_NAME: text_unlemm,
               TEXT_RAW_FIELD_NAME: title_unlemm + ' ' + raw_text}

        add_retokenized_field(doc, TEXT_RAW_FIELD_NAME, TEXT_BERT_TOKENIZED_NAME, bert_tokenizer)
        return json.dumps(doc)


inp_file = FileWrapper(args.input_file)
out_file = FileWrapper(args.out_file, 'w')

proc_qty = args.proc_qty
print(f'Spawning {proc_qty} processes')
pool = multiprocessing.Pool(processes=proc_qty)
ln = 0
ln_ign = 0
for doc_str in pool.imap(PassParseWorker(), inp_file, IMAP_PROC_CHUNK_QTY):
    ln = ln + 1

    if doc_str is not None:
        if doc_str:
            out_file.write(doc_str + '\n')
        else:
            ln_ign += 1
Example #16
                    type=float,
                    help='the probability of sampling a non-relevant document entry',
                    required=True)


args = parser.parse_args()

sample_prob = args.nonrel_sample_prob

if sample_prob < 0 or sample_prob >= 1:
    print('Sampling probability must be >=0 and < 1')
    sys.exit(1)


qrelDict = readQrelsDict(os.path.join(args.qrel_dir, QREL_FILE))

allRelDocs = set()

for qid, qd in qrelDict.items():
    for did, rel in qd.items():
        if rel >= args.min_rel_grade:
            allRelDocs.add(did)


with FileWrapper(args.out_doc_file, 'w') as outFile:
    for docEntry in jsonlGen(args.inp_doc_file):
        did = docEntry[DOCID_FIELD]
        if did in allRelDocs or random.random() < sample_prob:
            outFile.write(json.dumps(docEntry) + '\n')

Example #17
    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)

sel_psg_ids = set()

np.random.seed(0)

for inp_file in args.input:
    print(f'Processing {inp_file}')
    for fields in tqdm.tqdm(dpr_json_reader(FileWrapper(inp_file))):
        pos_ids = set()
        neg_ids = set()

        for entry in fields["positive_ctxs"]:
            pos_ids.add(get_passage_id(entry))

        for entry in fields["negative_ctxs"]:
            neg_ids.add(get_passage_id(entry))

        for entry in fields["hard_negative_ctxs"]:
            neg_ids.add(get_passage_id(entry))

        for psg_id in pos_ids:
            sel_psg_ids.add(psg_id)
Example #18
stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
#print(stopWords)

bert_tokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bert_tokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(
        BERT_BASE_MODEL)

nlp = SpacyTextParser(SPACY_MODEL,
                      stopWords,
                      keepOnlyAlphaNum=True,
                      lowerCase=True)

with FileWrapper(args.output, 'w') as outf:
    for doc in tqdm(inp_data, desc='converting documents'):
        e = {
            DOCID_FIELD: doc[DOCID_FIELD],
            TEXT_RAW_FIELD_NAME: doc[TEXT_RAW_FIELD_NAME]
        }

        title_lemmas, _ = nlp.procText(doc[TITLE_FIELD_NAME])
        author_lemmas, _ = nlp.procText(doc[AUTHOR_FIELD_NAME])
        venue_lemmas, _ = nlp.procText(doc[VENUE_FIELD_NAME])
        body_lemmas, _ = nlp.procText(doc[BODY_FIED_NAME])

        e[TEXT_FIELD_NAME] = ' '.join(
            [title_lemmas, author_lemmas, venue_lemmas, body_lemmas])
        e[TITLE_FIELD_NAME] = title_lemmas
        e[AUTHOR_FIELD_NAME] = author_lemmas
Example #19
print('Reading document IDs from the index')
all_doc_ids = read_doc_ids_from_forward_file_header(args.fwd_index_file)
print('Reading queries')
queries = read_queries(args.query_file)

query_ids = []
query_doc_qtys = {}

for e in queries:
    qid = e[DOCID_FIELD]
    query_ids.append(qid)

# Some copy-paste from common_eval.read_run_dict, but ok for now
file_name = args.run_file
with FileWrapper(file_name) as f:
    prev_query_id = None

    # Check for repeating document IDs and improperly sorted entries
    for ln, line in enumerate(f):
        line = line.strip()
        if not line:
            continue
        fld = line.split()
        if len(fld) != 6:
            ln += 1
            raise Exception(
                f'Invalid line {ln} in run file {file_name}: expected 6 white-space separated fields, but got: {line}'
            )

        qid, _, docid, rank, score_str, run_id = fld
Example #20
print('Reading document IDs from the index')
allDocIds = readDocIdsFromForwardFileHeader(args.fwd_index_file)
print('Reading queries')
queries = readQueries(args.query_file)

query_ids = []
query_doc_qtys = {}

for e in queries:
    qid = e[DOCID_FIELD]
    query_ids.append(qid)

# Some copy-paste from common_eval.readRunDict, but ok for now
fileName = args.run_file
with FileWrapper(fileName) as f:
    prevQueryId = None

    # Check for repeating document IDs and improperly sorted entries
    for ln, line in enumerate(f):
        line = line.strip()
        if not line:
            continue
        fld = line.split()
        if len(fld) != 6:
            ln += 1
            raise Exception(
                f'Invalid line {ln} in run file {fileName}: expected 6 white-space separated fields, but got: {line}')

        qid, _, docid, rank, scoreStr, runId = fld
        if prevQueryId is None or qid != prevQueryId:
                    type=str, required=True)
parser.add_argument('--max_set_size', metavar='max # of documents in a set',
                    default=1_000_000,
                    help='the maximum size of a set (in documents)',
                    type=int)
parser.add_argument('--lower_case', help='lowercase text',
                    action='store_true', default=False)

args = parser.parse_args()
print(args)

docQty = 0
setQty = 0
setId = 0

inpFile = FileWrapper(args.input)

nlp = SpacyTextParser(SPACY_MODEL, [], sentSplit=True)


def outFileName(pref, num):
    return pref + str(num) + '.txt'


print('Starting set 0')
outFile = FileWrapper(outFileName(args.output_pref, setId), 'w')

for line in inpFile:
    doc = json.loads(line)
    textRaw = doc[TEXT_RAW_FIELD_NAME]
Example #22
def main():
    parser = argparse.ArgumentParser(description='Split raw DPR queries.')
    add_basic_query_split_args(parser)

    parser.add_argument('--src_file',
                        metavar='input file name',
                        help='input file name',
                        type=str, required=True)
    parser.add_argument('--dst_file_pref',
                        metavar='output file prefix',
                        help='output file prefix',
                        type=str, required=True)

    args = QuerySplitArguments(parser.parse_args())

    print(args.raw_args)

    print("Reading input files...")
    src_file = args.src_file

    query_ids = []

    # First pass over the input file: count the number of queries
    with FileWrapper(src_file) as inp_file:
        for query_idx, _ in tqdm.tqdm(enumerate(dpr_json_reader(inp_file))):
            query_ids.append(query_idx)

    random.seed(args.seed)
    random.shuffle(query_ids)

    print(f"Shuffled query IDs using sid {args.seed}")

    sizes = args.partitions_sizes(len(query_ids))
    assert len(sizes) == len(args.partitions_names)
    print("Final partitions sizes:", list(zip(args.partitions_names, sizes)))

    query_id_to_partition = build_query_id_to_partition(query_ids, sizes)

    out_file_list = [None] * len(args.partitions_names)
    max_query_idx = [-1] * len(args.partitions_names)

    for part_id, part_name in enumerate(args.partitions_names):
        out_file_name = args.dst_file_pref + '_' + part_name + '.json.gz'
        out_file_list[part_id] = FileWrapper(out_file_name, 'w')
        out_file_list[part_id].write('[\n')

    # Due to the specifics of the DPR file format, we need to put a comma
    # right after the } that "finalizes" a question.
    # However, the last } in a partition file should not be followed by a comma.
    # To implement this, we need to know the maximum query index in each partition.
    for query_id, part_id in query_id_to_partition.items():
        max_query_idx[part_id] = max(max_query_idx[part_id], query_id)

    print('Actually splitting data')

    # Second pass over the input file: actually split the data
    with FileWrapper(src_file) as inp_file:
        for query_idx, json_str in tqdm.tqdm(enumerate(dpr_json_reader(inp_file))):
            part_id = query_id_to_partition[query_idx]
            out_file = out_file_list[part_id]
            if query_idx < max_query_idx[part_id]:
                out_file.write(json_str + ',\n')
            else:
                # Final entry shouldn't be followed by a comma
                out_file.write(json_str + '\n')


    for out_file in out_file_list:
        out_file.write(']\n')
        out_file.close()
Example #23
    parser.add_argument('--use_precomputed_negatives', action='store_true',
                        help='Use the negative_ctxs field as a source of negative examples')
    parser.add_argument('--min_query_token_qty', type=int, default=0,
                        metavar='min # of query tokens', help='ignore queries that have fewer tokens than this')
    parser.add_argument('--' + OUT_BITEXT_PATH_OPT, metavar=OUT_BITEXT_PATH_OPT_META,
                        help=OUT_BITEXT_PATH_OPT_HELP,
                        type=str, default=None)
    parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

    args = parser.parse_args()

    return args


args = parse_args()
arg_vars = vars(args)
inpFile = FileWrapper(args.input)
outQueries = FileWrapper(args.output_queries, 'w')
outQrels = FileWrapper(args.output_qrels, 'w')
minQueryTokQty = args.min_query_token_qty
usePrecomputedNegatives = args.use_precomputed_negatives
stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
outBitextDir = arg_vars[OUT_BITEXT_PATH_OPT]
nlp = SpacyTextParser(SPACY_MODEL, stopWords, keepOnlyAlphaNum=True, lowerCase=True)
sentSplit = Sentencizer(SPACY_MODEL)

bitext_fields = [TEXT_FIELD_NAME, TEXT_UNLEMM_FIELD_NAME, TITLE_UNLEMM_FIELD_NAME]

bertTokenizer = None
if arg_vars[BERT_TOK_OPT]:
    print('BERT-tokenizing input into the field: ' + TEXT_BERT_TOKENIZED_NAME)
    bertTokenizer = pytorch_pretrained_bert.BertTokenizer.from_pretrained(BERT_BASE_MODEL)