import os
import re
import sys
import multiprocessing

from bs4 import BeautifulSoup

# Project-level names used below (Stats, MathExtractor, MathDocument,
# Version03Index, read_file, get_query, cntl, ...) are assumed to be
# provided by the surrounding package.


def process_query_batch(args):
    """
    Given a batch of queries, generate query tuples for the math index.
    :param args: (query_list, topk, math_index)
    :return: (fileid, stats)
    """
    stats = Stats()
    fileid = os.getpid()
    query_list, topk, math_index = args

    math_index.openDB(fileid, topk)
    stats.num_documents = len(query_list)

    for (query_num, query_string) in query_list:
        # parse the formula trees out of the topic's MathML
        trees = MathExtractor.parse_from_xml(query_string, query_num,
                                             stats.missing_tags, stats.problem_files)
        stats.num_expressions += len(trees)

        # also handle keyword queries if present
        terms = re.findall(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>", query_string)
        stats.num_keywords += len(terms)

        math_index.search(fileid, query_num, trees, terms, topk)

    math_index.closeDB(fileid)
    return (fileid, stats)
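# Illustration (not from the original source) of what the keyword regex above
# extracts: element attributes are ignored and surrounding whitespace is
# trimmed from the captured text. The sample topic string is hypothetical.
#
#   >>> sample = '<keyword id="k1"> binomial coefficient </keyword>'
#   >>> re.findall(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>", sample)
#   ['binomial coefficient']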
def math_indexer_task(pargs) -> (str, list):
    """
    Creates index tuples for the expressions in this subcollection.
    :param pargs: (math_index, cntl, chunkid)
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs

    combined_stats = Stats()
    docs = MathDocument(cntl)

    (chunk_size, mappings) = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    seen_docs = []  # just dump them as they come

    for (doc_id, filename) in enumerate(mappings, start=chunkid * chunk_size):
        ## print('parsing %s, id:%s ' % (filename, doc_id), flush=True)
        try:
            # get all the symbol trees found in the file
            for tree in read_file(filename, doc_id,
                                  missing_tags=combined_stats.missing_tags,
                                  problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                # pairs = tree.get_pairs(window)  # do not store pairs -- will be created in C++ module
                seen_docs.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason, file=sys.stderr)
            combined_stats.problem_files[reason] = combined_stats.problem_files.get(reason, set())
            combined_stats.problem_files[reason].add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats
def math_indexer_task(pargs) -> (str, list):
    """
    Creates index tuples for the expressions in this subcollection.
    :param pargs: (math_index, cntl, chunkid)
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs

    combined_stats = Stats()
    docs = MathDocument(cntl)

    (chunk_size, mappings) = docs.read_mapping_file(chunkid)
    combined_stats.num_documents += len(mappings)

    seen_docs = []  # just dump them as they come

    for (doc_id, filename) in enumerate(mappings, start=chunkid * chunk_size):
        ## print('parsing %s, id:%s ' % (filename, doc_id), flush=True)
        try:
            # get all the symbol trees found in the file
            for tree in read_file(filename, doc_id,
                                  missing_tags=combined_stats.missing_tags,
                                  problem_files=combined_stats.problem_files):
                combined_stats.num_expressions += 1
                combined_stats.global_expressions += len(tree.position)
                # pairs = tree.get_pairs(window)  # do not store pairs -- will be created in C++ module
                seen_docs.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason, file=sys.stderr)
            combined_stats.problem_files[reason] = combined_stats.problem_files.get(reason, set())
            combined_stats.problem_files[reason].add(doc_id)

    fileid = math_index.add(seen_docs)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, combined_stats
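# The functions above accumulate counters on a Stats object. The project's
# actual class is not shown in this excerpt; the minimal sketch below is an
# assumption, inferred only from the attributes and methods used here.

class Stats:
    """Hypothetical stand-in for the project's Stats accumulator."""

    def __init__(self):
        self.num_documents = 0        # documents (or queries) handled
        self.num_expressions = 0      # distinct symbol trees parsed
        self.global_expressions = 0   # total expression occurrences (see tree.position)
        self.num_keywords = 0         # keyword query terms found
        self.missing_tags = {}        # tag name -> set of doc ids missing it (assumed shape)
        self.problem_files = {}       # failure reason -> set of doc ids

    def add(self, other):
        # merge counts and problem maps from another batch
        self.num_documents += other.num_documents
        self.num_expressions += other.num_expressions
        self.global_expressions += other.global_expressions
        self.num_keywords += other.num_keywords
        for tag, ids in other.missing_tags.items():
            self.missing_tags.setdefault(tag, set()).update(ids)
        for reason, ids in other.problem_files.items():
            self.problem_files.setdefault(reason, set()).update(ids)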
def process_query_batch(args):
    """
    Given a batch of queries, generate query tuples for the math index.
    :param args: (system, db, run_tag, query_list, topk, math_index, strategy)
    :return: (fileid, stats)
    """
    stats = Stats()
    fileid = os.getpid()
    system, db, run_tag, query_list, topk, math_index, strategy = args

    math_index.openDB(fileid, topk)
    stats.num_documents = len(query_list)

    for (query_num, query_string) in query_list:
        trees = MathExtractor.parse_from_xml(query_string, query_num,
                                             stats.missing_tags, stats.problem_files)
        stats.num_expressions += len(trees)

        math_index.search(fileid, query_num, trees)  # also need to handle keyword queries if present

    math_index.closeDB(fileid)
    return (fileid, stats)
math_index = Version03Index(cntl, window=window)

## if cntl.read("results"):
##     # try ingesting and processing results (temporary setting)
##     tuples = math_index.get(query_file)
##     for qid, hit in tuples.items():
##         print(qid, hit)
## else:

with open(query_file, encoding='utf-8') as file:
    parsed = BeautifulSoup(file, "html.parser")

query_list = parsed.find_all("topic")
print("There are %s queries." % len(query_list))

combined_stats = Stats()
fileids = set()

## try:
query_list_m = list(map(get_query, query_list))  # whole batch for now
args = [(query_list_m, topk, math_index)]

for p in args:  # single-process execution
    (fileid, stats) = process_query_batch(p)
    fileids.add(fileid)
    combined_stats.add(stats)
## except Exception as err:
##     reason = str(err)
##     print("Failed to process queries: " + reason, file=sys.stderr)

cntl.store("query_fileids", str(fileids))
    # try ingesting and processing results (temporary setting)
    tuples = math_index.get(query_file)
    for qid, hit in tuples.items():
        print(qid, hit)
else:
    topk = ntcir_wiki_count if system == 'Wikipedia' else ntcir_main_count

    with open(query_file, encoding='utf-8') as file:
        parsed = BeautifulSoup(file, "lxml")

    query_list = parsed.find_all("topic")
    print("There are %s queries." % (len(query_list)), flush=True)

    combined_stats = Stats()
    fileids = set()

    try:
        query_list_m = list(map(get_query, query_list))  # whole batch for now
        args = [(system, db, run_tag, query_list_m, topk, math_index, weighting_strategy)]

        for p in args:  # single-process execution
            (fileid, stats) = process_query_batch(p)
            fileids.add(fileid)
            combined_stats.add(stats)
    except Exception as err:
        reason = str(err)
        print("Failed to process queries: " + reason, file=sys.stderr)
        combined_stats.problem_files[reason] = combined_stats.problem_files.get(reason, set())
        combined_stats.problem_files[reason].add(query_file)
if cntl.read("results"): # try ingesting and processing results (temporary setting) tuples = math_index.get(query_file) for qid, hit in tuples.items(): print(qid, hit) else: topk = ntcir_wiki_count if system == 'Wikipedia' else ntcir_main_count with open(query_file, encoding='utf-8') as file: parsed = BeautifulSoup(file) query_list = parsed.find_all("topic") print("There are %s queries." % (len(query_list)), flush=True) combined_stats = Stats() fileids = set() try: query_list_m = list(map(get_query, query_list)) # whole batch for now args = [(system, db, run_tag, query_list_m, topk, math_index, weighting_strategy)] for p in args: # single-process execution (fileid, stats) = process_query_batch(p) fileids.add(fileid) combined_stats.add(stats) except Exception as err: reason = str(err) print("Failed to process document " + filename + ": " + reason,
row = "-" with open(doc_id_mapping_path, newline='', encoding='utf-8') as mapping_file: while True: if num_docs % chunk_size == 0: filepos.append(mapping_file.tell()) num_docs += 1 row = mapping_file.readline() if row == "": num_docs -= 1 if num_docs % chunk_size == 0: del filepos[-1] break cntl.store("file_skips",str(filepos)) print("There are " + str(num_docs) + " documents to index", flush=True) combined_stats = Stats() if num_docs > 0: math_index = Version03Index(db=database_name, window=window) max_jobs = min(10,num_docs) manager = multiprocessing.Manager() lock = manager.Lock() #identify chunks to be indexed by each process args = [(math_index, cntl, chunkid) for chunkid in list(range(len(filepos)))] fileids = set() ## for p in args: # single-process execution, for debugging ## fileid, stats = math_indexer_task(p)