Example #1
#!/usr/bin/env python
import sys
import argparse

sys.path.append('.')

from scripts.data_convert.convert_common import jsonlGen

parser = argparse.ArgumentParser(
    description='Count tokens and number of entries in JSONL')

parser.add_argument('--input', type=str, required=True)
parser.add_argument('--field', type=str, required=True)

args = parser.parse_args()

qty = 0
tok_qty = 0
field = args.field

for e in jsonlGen(args.input):
    qty += 1
    if field in e:
        tok_qty += len(e[field].split())

print(qty, tok_qty)
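
Every example below imports jsonlGen (and several of the later ones also FileWrapper) from scripts/data_convert/convert_common.py, which is not shown here. A minimal sketch of what these helpers are assumed to do, based only on how the examples use them (not the actual FlexNeuART implementation):

import bz2
import gzip
import json


def FileWrapper(fname, flags='r'):
    # Open a plain, .gz, or .bz2 file transparently in text mode.
    if fname.endswith('.gz'):
        return gzip.open(fname, flags + 't')
    if fname.endswith('.bz2'):
        return bz2.open(fname, flags + 't')
    return open(fname, flags)


def jsonlGen(fname):
    # Yield one parsed JSON object per non-empty line of a JSONL file.
    with FileWrapper(fname) as inp_file:
        for line in inp_file:
            line = line.strip()
            if line:
                yield json.loads(line)
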
Example #2
parser.add_argument('--field_name',
                    metavar='field name',
                    type=str,
                    required=True)
parser.add_argument('--input',
                    metavar='input files',
                    help='input JSONL files (possibly compressed)',
                    type=str,
                    nargs='+',
                    required=True)
parser.add_argument('--output',
                    metavar='output file',
                    help='output file',
                    type=str,
                    required=True)

args = parser.parse_args()
print(args)

vocab = VocabBuilder()
field = args.field_name

for fn in args.input:
    ln = 0
    for docEntry in tqdm(jsonlGen(fn), desc='Processing: ' + fn):
        ln += 1
        if field in docEntry:
            vocab.procDoc(docEntry[field])
        else:
            print(f'No field {field} found on line {ln} of file {fn}')
            sys.exit(1)

vocab.save(args.output)
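
Example #2 relies on a VocabBuilder class whose definition is not part of the snippet. A hypothetical minimal version with the same procDoc/save interface, assuming whitespace tokenization and a simple token<TAB>count output format (the real class and its on-disk format may differ):

from collections import Counter


class VocabBuilder:
    def __init__(self):
        self.term_counts = Counter()

    def procDoc(self, text):
        # Count whitespace-separated tokens of a single document (assumed behavior).
        self.term_counts.update(text.split())

    def save(self, fname):
        # Write "token<TAB>frequency", most frequent tokens first (assumed format).
        with open(fname, 'w') as out_file:
            for tok, cnt in self.term_counts.most_common():
                out_file.write(f'{tok}\t{cnt}\n')
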
Example #3
parser.add_argument('--nonrel_sample_prob',
                    type=float,
                    help='a probability to sample non-relevant document entries',
                    required=True)


args = parser.parse_args()

sample_prob = args.nonrel_sample_prob

if sample_prob < 0 or sample_prob >= 1:
    print('Sampling probability must be >=0 and < 1')
    sys.exit(1)


qrelDict = readQrelsDict(os.path.join(args.qrel_dir, QREL_FILE))

allRelDocs = set()

for qid, qd in qrelDict.items():
    for did, rel in qd.items():
        if rel >= args.min_rel_grade:
            allRelDocs.add(did)


with FileWrapper(args.out_doc_file, 'w') as outFile:
    for docEntry in jsonlGen(args.inp_doc_file):
        did = docEntry[DOCID_FIELD]
        if did in allRelDocs or random.random() < sample_prob:
            outFile.write(json.dumps(docEntry) + '\n')
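
Example #3 (and #4) read relevance judgments with readQrelsDict, which is also not shown. A sketch under the assumption that QREL_FILE is a standard TREC-style qrels file with lines of the form "<query_id> <unused> <doc_id> <grade>":

def readQrelsDict(fname):
    # Returns {query_id: {doc_id: int_grade}} parsed from a TREC-style qrels file.
    result = {}
    with open(fname) as inp_file:
        for line in inp_file:
            line = line.strip()
            if not line:
                continue
            qid, _, did, grade = line.split()
            result.setdefault(qid, {})[did] = int(grade)
    return result
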

Example #4
apath1 = os.path.join(dataDir, args.input_subdir1, ANSWER_FILE_JSON)
apath2 = os.path.join(dataDir, args.input_subdir2, ANSWER_FILE_JSON)

rpath1 = os.path.join(dataDir, args.input_subdir1, QREL_FILE)
qrelDict1 = readQrelsDict(rpath1)
print('Read %d qrel sets from %s' % (len(qrelDict1), rpath1))
rpath2 = os.path.join(dataDir, args.input_subdir2, QREL_FILE)
qrelDict2 = readQrelsDict(rpath2)
print('Read %d qrel sets from %s' % (len(qrelDict2), rpath2))

answDictText = {}

for fn in [apath1, apath2]:
    qty = 0

    for e in tqdm(jsonlGen(fn), desc='loading answers'):
        qty += 1

        answId = e[DOCID_FIELD]
        answText = e[TEXT_RAW_FIELD_NAME]

        answDictText[answId] = answText

    print('Read %d answers from %s' % (qty, fn))

if args.use_hnsw:
    methodName = 'hnsw'
    M = 30
    efC = 200

    indexTimeParams = {'M': M, 'efConstruction': efC}
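
Example #4 stops right after choosing HNSW and its index-time parameters. For context, M and efConstruction are the usual HNSW construction parameters in the nmslib Python bindings; below is a self-contained toy illustration of how such parameters are typically passed, not the continuation of the original script:

import numpy as np
import nmslib

M = 30
efC = 200

# Toy data: 1000 random 64-dimensional vectors (purely illustrative).
vectors = np.random.rand(1000, 64).astype(np.float32)

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(vectors)
index.createIndex({'M': M, 'efConstruction': efC}, print_progress=True)
index.setQueryTimeParams({'efSearch': 200})

# Retrieve the 10 nearest neighbors of the first vector.
ids, dists = index.knnQuery(vectors[0], k=10)
print(ids, dists)
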
Example #5
parser.add_argument('--input',
                    type=str,
                    required=True,
                    metavar='input file',
                    help='input JSONL file (can be gz or bz2 compressed)')
parser.add_argument('--output',
                    type=str,
                    required=True,
                    metavar='output file',
                    help='output JSONL file (can be gz or bz2 compressed)')
parser.add_argument('--keep_fields',
                    nargs='+',
                    metavar='included fields',
                    required=True,
                    help=f'A list of fields to include, note that {DOCID_FIELD} is not filtered out.')

args = parser.parse_args()
print(args)

incl_field_set = set(args.keep_fields + [DOCID_FIELD])

with FileWrapper(args.output, 'w') as fout:
    for ln, old_rec in enumerate(jsonlGen(args.input)):
        if DOCID_FIELD not in old_rec:
            raise Exception(
                f'Entry {ln+1} in {args.input} lacks the field {DOCID_FIELD}')
        new_rec = {
            k: old_rec[k]
            for k in set(old_rec.keys()).intersection(incl_field_set)
        }
        fout.write(json.dumps(new_rec) + '\n')
Example #6
docid_to_preds = {}
doc_id_prev = None
predicted_queries = []

for doc_id, predicted_queries_partial in tqdm(zip(
        FileWrapper(args.doc_ids_path), FileWrapper(args.predictions_path)),
                                              desc='reading predictions'):
    doc_id = doc_id.strip()
    if doc_id_prev is not None and doc_id_prev != doc_id:
        if predicted_queries and doc_id_prev is not None:
            docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip()
        predicted_queries = []

    doc_id_prev = doc_id
    predicted_queries.append(predicted_queries_partial)

# Not forgetting about the last batch
if predicted_queries and doc_id_prev is not None:
    docid_to_preds[doc_id_prev] = ' '.join(predicted_queries)

with FileWrapper(args.output, 'w') as outf:
    for doce in tqdm(jsonlGen(args.input), desc='adding doc2query fields'):
        doc_id = doce[DOCID_FIELD]
        if doc_id in docid_to_preds:
            text, text_unlemm = nlp.procText(docid_to_preds[doc_id])
            doce[TEXT_FIELD_NAME] = doce[TEXT_FIELD_NAME] + ' ' + text
            doce[DOC2QUERY_FIELD_TEXT] = text
            doce[DOC2QUERY_FIELD_TEXT_UNLEMM] = text_unlemm
        else:
            print(f'WARNING: no predictions for {doc_id}')

        outf.write(json.dumps(doce) + '\n')
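
The trickiest part of Example #6 is grouping consecutive lines of the predictions file by document id, including flushing the final group after the loop. A self-contained toy run of the same logic (hypothetical data):

# Consecutive lines with the same doc id are merged into one prediction string.
doc_ids = ['d1', 'd1', 'd2', 'd3', 'd3', 'd3']
preds = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6']

docid_to_preds = {}
doc_id_prev = None
predicted_queries = []

for doc_id, pred in zip(doc_ids, preds):
    if doc_id_prev is not None and doc_id_prev != doc_id:
        # The doc id changed: flush the accumulated predictions.
        docid_to_preds[doc_id_prev] = ' '.join(predicted_queries)
        predicted_queries = []
    doc_id_prev = doc_id
    predicted_queries.append(pred)

# Not forgetting about the last group.
if predicted_queries and doc_id_prev is not None:
    docid_to_preds[doc_id_prev] = ' '.join(predicted_queries)

print(docid_to_preds)  # {'d1': 'q1 q2', 'd2': 'q3', 'd3': 'q4 q5 q6'}
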