def process_query_batch(args):
    """
    Run every query of a batch against the math index.

    The current process id is used as the identifier handed to
    ``openDB``, ``search`` and ``closeDB`` on the index.

    :type  args: tuple
    :param args: ``(query_list, topk, math_index)`` where ``query_list``
                 is a sized sequence of ``(query_num, query_string)`` pairs,
                 ``topk`` is forwarded to ``openDB`` and ``search``, and
                 ``math_index`` is the index object being searched

    :rtype: tuple
    :return: ``(fileid, stats)`` - the process id used for this batch and
             the ``Stats`` object filled in while processing it
    """
    stats = Stats()
    fileid = os.getpid()
    query_list, topk, math_index = args

    # Loop-invariant: compile the keyword pattern once for the whole batch.
    keyword_pattern = re.compile(r"<keyword[^>]*>\s*([^<]*\S)\s*</keyword>")

    math_index.openDB(fileid, topk)
    try:
        stats.num_documents = len(query_list)
        for query_num, query_string in query_list:
            trees = MathExtractor.parse_from_xml(
                query_string, query_num,
                stats.missing_tags, stats.problem_files)
            stats.num_expressions += len(trees)

            # Keyword queries: the stripped text of every <keyword> element.
            terms = keyword_pattern.findall(query_string)
            stats.num_keywords += len(terms)

            math_index.search(fileid, query_num, trees, terms, topk)
    finally:
        # Close the DB even when parsing or searching raises.
        math_index.closeDB(fileid)
    return (fileid, stats)
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: identifier forwarded to the MathExtractor parser
    :param missing_tags: forwarded unchanged to
                         ``MathExtractor.parse_from_xml``
    :type  problem_files: dict or None
    :param problem_files: forwarded to ``MathExtractor.parse_from_xml``;
                          when given, files with an unrecognised extension
                          are added to the set stored under the
                          ``"unknown_filetype"`` key

    :rtype: list(SymbolTree)
    :return: list of Symbol trees found in the file; a one-element list
             for ``.tex`` input and an empty list for an unknown filetype
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        return [MathExtractor.parse_from_tex(content, file_id)]

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content, file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)

    # Unknown extension. problem_files defaults to None, so only record the
    # file when the caller actually supplied a mapping to record it in.
    if problem_files is not None:
        problem_files.setdefault("unknown_filetype", set()).add(filename)
    print('Unknown filetype %s for %s' % (ext, filename))
    return []
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read a document file and parse the math it contains.

    ``.tex`` content goes through ``MathExtractor.parse_from_tex``;
    ``.xhtml``, ``.mathml``, ``.mml`` and ``.html`` content goes through
    ``MathExtractor.parse_from_xml``. Any other extension is reported on
    stdout and yields no trees.

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: identifier passed on to the parser
    :param missing_tags: passed on as-is to ``parse_from_xml``
    :type  problem_files: dict or None
    :param problem_files: passed on to ``parse_from_xml``; if not None, an
                          unrecognised file is also added to its
                          ``"unknown_filetype"`` set

    :rtype: list(SymbolTree)
    :return: list of Symbol trees found in the file (empty for an unknown
             filetype)
    """
    xml_extensions = {'.xhtml', '.mathml', '.mml', '.html'}
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        tree = MathExtractor.parse_from_tex(content, file_id)
        return [tree]
    elif ext in xml_extensions:
        trees = MathExtractor.parse_from_xml(content, file_id,
                                             missing_tags=missing_tags,
                                             problem_files=problem_files)
        return trees
    else:
        # The default problem_files=None cannot be indexed, so skip the
        # bookkeeping instead of raising when no mapping was supplied.
        if problem_files is not None:
            unknown = problem_files.setdefault("unknown_filetype", set())
            unknown.add(filename)
        print('Unknown filetype %s for %s' % (ext, filename))
        return []
def process_query_batch(args):
    """
    Run every query of a batch against the math index.

    The current process id is used as the identifier handed to
    ``openDB``, ``search`` and ``closeDB`` on the index.

    :type  args: tuple
    :param args: ``(system, db, run_tag, query_list, topk, math_index,
                 strategy)``; only ``query_list`` (a sized sequence of
                 ``(query_num, query_string)`` pairs), ``topk`` (forwarded
                 to ``openDB``) and ``math_index`` are used by this
                 function, the remaining entries are ignored

    :rtype: tuple
    :return: ``(fileid, stats)`` - the process id used for this batch and
             the ``Stats`` object filled in while processing it
    """
    stats = Stats()
    fileid = os.getpid()
    # The leading-underscore names are part of the tuple layout but unused.
    _system, _db, _run_tag, query_list, topk, math_index, _strategy = args

    math_index.openDB(fileid, topk)
    try:
        stats.num_documents = len(query_list)
        for query_num, query_string in query_list:
            trees = MathExtractor.parse_from_xml(
                query_string, query_num,
                stats.missing_tags, stats.problem_files)
            stats.num_expressions += len(trees)
            math_index.search(fileid, query_num, trees)
            # TODO: also need to handle keyword queries if present
    finally:
        # Close the DB even when parsing or searching raises.
        math_index.closeDB(fileid)
    return (fileid, stats)