#!/usr/bin/env python
"""Count the number of entries and whitespace-separated tokens in a JSONL file.

Reads --input (a JSONL file, possibly compressed) and, for each entry,
adds the token count of the string stored under --field. Entries that
lack the field still count toward the entry total but contribute zero
tokens. Prints: <entry count> <token count>.
"""
import sys
import argparse

sys.path.append('.')

from scripts.data_convert.convert_common import jsonlGen

parser = argparse.ArgumentParser(
    description='Count tokens and number of entries in JSONL')
parser.add_argument('--input', type=str, required=True)
parser.add_argument('--field', type=str, required=True)

args = parser.parse_args()

entry_qty = 0
token_qty = 0
field_name = args.field

for entry in jsonlGen(args.input):
    entry_qty += 1
    # Tokens are defined by simple whitespace splitting; entries without
    # the requested field are silently skipped for the token count.
    if field_name in entry:
        token_qty += len(entry[field_name].split())

print(entry_qty, token_qty)
# NOTE(review): truncated chunk of a vocabulary-building script, collapsed onto one
# line; it begins mid-way through a parser.add_argument(...) call, so the parser
# construction, imports, and the VocabBuilder/tqdm/jsonlGen definitions live outside
# this view. Behavior visible here: parse --input (one or more JSONL files) and
# --output, feed the text under args.field_name from every entry of every input file
# into a VocabBuilder, and abort with exit code 1 on the first entry missing that
# field; finally persist the vocabulary to the output path.
required=True) parser.add_argument('--input', metavar='input files', help='input JSONL files (possibly compressed)', type=str, nargs='+', required=True) parser.add_argument('--output', metavar='output file', help='output file', type=str, required=True) args = parser.parse_args() print(args) vocab = VocabBuilder() field = args.field_name for fn in args.input: ln = 0 for docEntry in tqdm(jsonlGen(fn), desc='Processing: ' + fn): ln += 1 if field in docEntry: vocab.procDoc(docEntry[field]) else: print(f'No field {field} is found in line {ln} file {fn}') sys.exit(1) vocab.save(args.output)
# NOTE(review): truncated chunk of a document-sampling script, collapsed onto one
# line; it begins mid-way through a parser.add_argument(...) call (the argument name,
# presumably --nonrel_sample_prob, is outside this view). Behavior visible here:
# validate the sampling probability is in [0, 1); collect the set of document IDs
# whose qrel relevance grade is >= --min_rel_grade; then stream the input JSONL
# document file, keeping every relevant document and each non-relevant document with
# probability sample_prob. Output is written through FileWrapper (defined elsewhere,
# presumably handles compression — TODO confirm).
type=float, help=f'a probability to sample non-relevant document entries', required=True) args = parser.parse_args() sample_prob = args.nonrel_sample_prob if sample_prob < 0 or sample_prob >= 1: print('Sampling probability must be >=0 and < 1') sys.exit(1) qrelDict = readQrelsDict(os.path.join(args.qrel_dir, QREL_FILE)) allRelDocs = set() for qid, qd in qrelDict.items(): for did, rel in qd.items(): if rel >= args.min_rel_grade: allRelDocs.add(did) with FileWrapper(args.out_doc_file, 'w') as outFile: for docEntry in jsonlGen(args.inp_doc_file): did = docEntry[DOCID_FIELD] if did in allRelDocs or random.random() < sample_prob: outFile.write(json.dumps(docEntry) + '\n')
# NOTE(review): truncated chunk, collapsed onto one line; both the script head
# (imports, argument parsing, dataDir) and the tail are outside this view — the
# chunk ends mid-way through the indexTimeParams dict literal. Behavior visible
# here: build paths to the answer JSON and qrel files of two input subdirectories,
# load both qrel dictionaries (reporting their sizes), then load all answers from
# both answer files into a single id -> raw-text dict (entries from the second file
# overwrite duplicates from the first). The trailing branch starts configuring an
# HNSW index (M=30, efC=200) when --use_hnsw is set; its continuation is not shown.
apath1 = os.path.join(dataDir, args.input_subdir1, ANSWER_FILE_JSON) apath2 = os.path.join(dataDir, args.input_subdir2, ANSWER_FILE_JSON) rpath1 = os.path.join(dataDir, args.input_subdir1, QREL_FILE) qrelDict1 = readQrelsDict(rpath1) print('Read %d qrel sets from %s' % (len(qrelDict1), rpath1)) rpath2 = os.path.join(dataDir, args.input_subdir2, QREL_FILE) qrelDict2 = readQrelsDict(rpath2) print('Read %d qrel sets from %s' % (len(qrelDict2), rpath2)) answDictText = {} for fn in [apath1, apath2]: qty = 0 for e in tqdm(jsonlGen(fn), desc='loading answers'): qty += 1 answId = e[DOCID_FIELD] answText = e[TEXT_RAW_FIELD_NAME] answDictText[answId] = answText print('Read %d answers from %s' % (qty, fn)) if args.use_hnsw: methodName = 'hnsw' M = 30 efC = 200 indexTimeParams = {
# NOTE(review): truncated chunk of a JSONL field-filtering script, collapsed onto one
# line; it begins mid-way through the --input add_argument(...) call. Behavior visible
# here: copy each input record to the output keeping only the fields listed in
# --keep_fields plus DOCID_FIELD, raising if a record lacks DOCID_FIELD.
# NOTE(review): the exception message interpolates the literal text "args.input"
# rather than its value (it is inside the f-string but not inside braces) —
# possibly intentional shorthand, but worth confirming.
metavar='input file', help='input JSONL file (can be gz or bz2 compressed)') parser.add_argument('--output', type=str, required=True, metavar='output file', help='output JSONL file (can be gz or bz2 compressed)') parser.add_argument( '--keep_fields', nargs='+', metavar='included fields', required=True, help= f'A list of fields to include, note that {DOCID_FIELD} is not filtered out.' ) args = parser.parse_args() print(args) incl_field_set = set(args.keep_fields + [DOCID_FIELD]) with FileWrapper(args.output, 'w') as fout: for ln, old_rec in enumerate(jsonlGen(args.input)): if DOCID_FIELD not in old_rec: raise Exception( f'Entry {ln+1} in args.input lacks the field {DOCID_FIELD}') new_rec = { k: old_rec[k] for k in set(old_rec.keys()).intersection(incl_field_set) } fout.write(json.dumps(new_rec) + '\n')
# NOTE(review): truncated chunk of a doc2query-augmentation script, collapsed onto
# one line; doc_id_prev, docid_to_preds, nlp, and the argument parsing are defined
# outside this view. Behavior visible here: zip a file of document IDs with a file
# of predicted queries (one prediction line per doc-id line), concatenate all
# consecutive predictions for the same doc id into docid_to_preds, then stream the
# input JSONL and, for each document with predictions, lemmatize the predicted text
# via nlp.procText and append it to TEXT_FIELD_NAME while also storing it in the
# two DOC2QUERY_* fields; documents without predictions pass through with a warning.
# NOTE(review): inside the flush branch, the second `doc_id_prev is not None` test is
# redundant (already guaranteed by the outer condition); also the final-batch join
# omits the .strip() applied in the loop — likely harmless, but worth confirming.
predicted_queries = [] for doc_id, predicted_queries_partial in tqdm(zip( FileWrapper(args.doc_ids_path), FileWrapper(args.predictions_path)), desc='reading predictions'): doc_id = doc_id.strip() if doc_id_prev is not None and doc_id_prev != doc_id: if predicted_queries and doc_id_prev is not None: docid_to_preds[doc_id_prev] = ' '.join(predicted_queries).strip() predicted_queries = [] doc_id_prev = doc_id predicted_queries.append(predicted_queries_partial) # Not forgetting about the last batch if predicted_queries and doc_id_prev is not None: docid_to_preds[doc_id_prev] = ' '.join(predicted_queries) with FileWrapper(args.output, 'w') as outf: for doce in tqdm(jsonlGen(args.input), desc='adding doc2query fields'): doc_id = doce[DOCID_FIELD] if doc_id in docid_to_preds: text, text_unlemm = nlp.procText(docid_to_preds[doc_id]) doce[TEXT_FIELD_NAME] = doce[TEXT_FIELD_NAME] + ' ' + text doce[DOC2QUERY_FIELD_TEXT] = text doce[DOC2QUERY_FIELD_TEXT_UNLEMM] = text_unlemm else: print(f'WARNING: no predictions for {doc_id}')  outf.write(json.dumps(doce) + '\n')