def readSampleQueries(dataDir, input_subdir1, sample_prob1, input_subdir2, sample_prob2):
    """Read queries from two sub-directories and sample each list independently.

    :param dataDir:        root data directory
    :param input_subdir1:  1st input sub-directory (relative to dataDir)
    :param sample_prob1:   fraction of queries to keep from the 1st list
    :param input_subdir2:  2d input sub-directory (relative to dataDir)
    :param sample_prob2:   fraction of queries to keep from the 2d list

    :return: a tuple (sampled queries from the 1st dir, sampled queries from the 2d dir)
    """
    def _read_and_sample(input_subdir, sample_prob):
        # The original code repeated this stanza twice verbatim; factored out here.
        qpath = os.path.join(dataDir, input_subdir, QUESTION_FILE_JSON)
        fullQueryList = readQueries(qpath)
        # Sampling without replacement: each query appears at most once;
        # the sample size is truncated (int), so sample_prob=1.0 keeps everything.
        sampleQueryList = np.random.choice(fullQueryList,
                                           size=int(len(fullQueryList) * sample_prob),
                                           replace=False)
        print('Read %d queries from %s sampled %d' %
              (len(fullQueryList), qpath, len(sampleQueryList)))
        return sampleQueryList

    return _read_and_sample(input_subdir1, sample_prob1), \
           _read_and_sample(input_subdir2, sample_prob2)
def main():
    """Split queries/qrels into disjoint partitions.

    Reads queries and QRELs from args.src_dir, shuffles the query IDs with a
    fixed seed (reproducible split), partitions them according to the requested
    partition sizes, and writes one query file plus one QREL file per partition
    into args.dst_dir.

    :raises ValueError: if query IDs are not unique, or if the number of
                        partition sizes does not match the number of names.
    """
    args = parse_args()
    print(args.raw_args)

    print("Start reading input files...")
    src_dir = args.src_dir
    queries = readQueries(os.path.join(src_dir, QUESTION_FILE_JSON))
    queries_ids = [data[DOCID_FIELD] for data in queries]
    # Explicit check instead of assert: asserts are stripped under "python -O",
    # which would silently disable this input validation.
    if len(queries_ids) != len(set(queries_ids)):
        raise ValueError("Non-unique queries ids are forbidden!")
    qrels = readQrels(os.path.join(src_dir, QREL_FILE))
    print("Done reading input files.")

    # Fixed seed makes the shuffle — and therefore the split — reproducible.
    random.seed(args.seed)
    random.shuffle(queries_ids)

    sizes = args.partitions_sizes(len(queries_ids))
    if len(sizes) != len(args.partitions_names):
        raise ValueError("Partition sizes quantity must match the number of partition names!")
    print("Final partitions sizes:", list(zip(args.partitions_names, sizes)))

    query_id_to_partition = build_query_id_to_partition(queries_ids, sizes)

    write_queries_files(queries, query_id_to_partition, args.dst_dir, args.partitions_names)
    write_qrels_files(qrels, query_id_to_partition, args.dst_dir, args.partitions_names)
# NOTE(review): this fragment begins inside a parser.add_argument(...) call
# (presumably '--out_subdir1') whose opening line is outside this chunk.
                    metavar='1st output data subirectory',
                    help='1st output data subdirectory',
                    type=str, required=True)
parser.add_argument('--out_subdir2',
                    metavar='2d output data subirectory',
                    help='2d output data subdirectory',
                    type=str, required=True)

args = parser.parse_args()
print(args)

dataDir = args.data_dir

# Read all queries from the main input sub-directory and collect their IDs.
fullQueryList = readQueries(os.path.join(dataDir, args.input_subdir, QUESTION_FILE_JSON))
fullQueryIdSet = set([data[DOCID_FIELD] for data in fullQueryList])
print('Read all the queries from the main dir')

qrelList = readQrels(os.path.join(dataDir, args.input_subdir, QREL_FILE))
print('Read all the QRELs from the main dir')

queryIdSet = set()

# Visit the two output partitions in order.
partSubDirs = [args.out_subdir1, args.out_subdir2]
for part in range(0, 2):
    outDir = os.path.join(dataDir, partSubDirs[part])
    # NOTE(review): this rebinds qrelList, discarding the main-dir QRELs read
    # above — confirm the main-dir qrelList is not needed later in this script.
    qrelList = readQrels(os.path.join(outDir, QREL_FILE))
    # (loop body continues beyond this chunk)
# NOTE(review): this fragment begins inside a parser.add_argument(...) call
# whose opening line is outside this chunk.
                    default=None, type=str)
parser.add_argument('--fwd_index_file',
                    metavar='forward index catalog file',
                    help='the "catalog" file of the forward index',
                    type=str, required=True)
parser.add_argument('--min_exp_doc_qty',
                    metavar='min # of docs per query to expect',
                    help='min # of docs per query to expect',
                    type=int, required=True)

args = parser.parse_args()

print('Reading document IDs from the index')
allDocIds = readDocIdsFromForwardFileHeader(args.fwd_index_file)

print('Reading queries')
queries = readQueries(args.query_file)
# Collect query IDs in file order; query_doc_qtys is filled later (outside this chunk).
query_ids = []
query_doc_qtys = {}
for e in queries:
    qid = e[DOCID_FIELD]
    query_ids.append(qid)

# Some copy-paste from common_eval.readRunDict, but ok for now
fileName = args.run_file
with FileWrapper(fileName) as f:
    prevQueryId = None

    # Check for repeating document IDs and improperly sorted entries
    for ln, line in enumerate(f):
        # (loop body continues beyond this chunk)
# NOTE(review): this fragment begins inside a parser.add_argument(...) call
# whose opening line is outside this chunk.
                    type=str, required=True)
parser.add_argument('--min_query_token_qty', type=int, default=0,
                    metavar='min # of query tokens',
                    help='ignore queries that have smaller # of tokens')
parser.add_argument('--' + BERT_TOK_OPT, action='store_true', help=BERT_TOK_OPT_HELP)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

inpFile = FileWrapper(args.input)

# Collect the text of every query found in the filter directories;
# these queries are excluded from the output.
ignoreQueries = set()
for qfile_dir in args.filter_query_dir:
    qfile_name = os.path.join(qfile_dir, QUESTION_FILE_JSON)
    for e in readQueries(qfile_name):
        # NOTE(review): unlike a sibling script in this file, there is no
        # "TEXT_FIELD_NAME in e" guard here — a query lacking the text field
        # would raise KeyError; confirm whether that is intended.
        ignoreQueries.add(e[TEXT_FIELD_NAME])
    print('Read queries from: ' + qfile_name)
print('A list of queries to ignore has %d entries' % (len(ignoreQueries)))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')
outFileQrelsName = os.path.join(args.out_dir, QREL_FILE)

minQueryTokQty = args.min_query_token_qty

stopWords = readStopWords(STOPWORD_FILE, lowerCase=True)
# NOTE(review): this fragment begins inside a parser.add_argument(...) call
# (presumably a sampling-probability option) whose opening line is outside this chunk.
                    type=float, default=1.0)
parser.add_argument('-k', metavar='k-NN k',
                    type=int, default=1)
parser.add_argument('--min_jacc', metavar='min jaccard to compare answers',
                    type=float, default=1.0)
parser.add_argument("--use_hnsw", action="store_true")

args = parser.parse_args()
print(args)

dataDir = args.data_dir

# Read queries from the 1st input sub-directory and sample them without
# replacement; the sample size is a (truncated) fraction of the full list.
qpath1 = os.path.join(dataDir, args.input_subdir1, QUESTION_FILE_JSON)
fullQueryList1 = readQueries(qpath1)
sampleQueryList1 = np.random.choice(fullQueryList1,
                                    size=int(len(fullQueryList1) * args.sample_prob1),
                                    replace=False)
print('Read %d queries from %s sampled %d' %
      (len(fullQueryList1), qpath1, len(sampleQueryList1)))

# Same procedure for the 2d input sub-directory.
qpath2 = os.path.join(dataDir, args.input_subdir2, QUESTION_FILE_JSON)
fullQueryList2 = readQueries(qpath2)
sampleQueryList2 = np.random.choice(fullQueryList2,
                                    size=int(len(fullQueryList2) * args.sample_prob2),
                                    replace=False)
print('Read %d queries from %s sampled %d' %
      (len(fullQueryList2), qpath2, len(sampleQueryList2)))
parser.add_argument('--data_dir', metavar='data directory', help='data directory',
                    type=str, required=True)
parser.add_argument('--out_dir', metavar='output directory', help='output directory',
                    type=str, required=True)

args = parser.parse_args()
print(args)

out_dir = args.out_dir
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# For every sub-directory that contains a query file, dump the IDs of its
# queries into a <sub-directory name>.npy file in the output directory.
for sub_dir in os.listdir(args.data_dir):
    query_file = os.path.join(args.data_dir, sub_dir, QUESTION_FILE_JSON)
    if not os.path.exists(query_file):
        continue
    print('Reading:', query_file)
    query_ids = [e[DOCID_FIELD] for e in readQueries(query_file)]
    print('Read', len(query_ids), 'queries')
    np.save(os.path.join(out_dir, sub_dir + '.npy'), np.array(query_ids))
parser.add_argument('--filter_query_dir', metavar='filtering query dir', default=[],
                    help=f'all queries found in {QUESTION_FILE_JSON} files from these directories are ignored',
                    nargs='*')
parser.add_argument('--out_dir', metavar='output directory', help='output directory',
                    type=str, required=True)

args = parser.parse_args()
print(args)
arg_vars = vars(args)

# Collect the text of every query found in the filter directories;
# queries whose text appears in this set are excluded from the output.
ignoreQueries = set()
for filter_dir in args.filter_query_dir:
    filter_file = os.path.join(filter_dir, QUESTION_FILE_JSON)
    for query in readQueries(filter_file):
        # Skip malformed entries that lack the text field.
        if TEXT_FIELD_NAME in query:
            ignoreQueries.add(query[TEXT_FIELD_NAME])
    print('Read queries from: ' + filter_file)
print('A list of queries to ignore has %d entries' % (len(ignoreQueries)))

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

outFileQueries = FileWrapper(os.path.join(args.out_dir, QUESTION_FILE_JSON), 'w')

# Counters for read/written queries (updated later in this script).
readQty = 0
wroteQty = 0