def find_formula_ids(tsv_results, control_filename):
    """Collect the formula id of every retrieved (doc, loc) pair.

    For each query key in ``tsv_results`` the MathML of every entry of its
    ``"results"`` list is fetched with ``MathDocument.find_mathml``, parsed as
    XML, and the ``id`` attribute of the root element is appended to the
    query's ``"math_ids"`` list. When the root carries no ``id`` an error is
    printed and ``"math.error"`` is stored instead.

    :param tsv_results: mapping of query offset -> dict holding ``"results"``
        (sequence of ``(doc, loc)`` pairs) and ``"math_ids"`` (list, mutated)
    :param control_filename: path handed to ``Control``
    """
    finder = MathDocument(Control(control_filename))

    for query_offset in tsv_results:
        print("Processing Query: " + str(query_offset))

        entry = tsv_results[query_offset]
        hits = entry["results"]
        total_locs = len(hits)

        for position, (doc, loc) in enumerate(hits, start=1):
            mathml = finder.find_mathml(doc, loc)

            # ElementTree.parse expects a file-like object, so wrap the string
            tree = xml.etree.ElementTree.parse(io.StringIO(mathml))
            root = tree.getroot()

            if "id" not in root.attrib:
                print("ERROR: No formula id found for Query " +
                      str(query_offset) + ", doc = " + str(doc) +
                      ", loc = " + str(loc))
                math_id = "math.error"
            else:
                math_id = root.attrib["id"]

            entry["math_ids"].append(math_id)

            # progress report after every 25th result
            if position % 25 == 0:
                print("... done " + str(position) + " of " + str(total_locs))
def get(self, doc_id, location, expression):
    """Return the MathML stored for ``expression`` at ``(doc_id, location)``.

    Lookup order:
      1. the per-location cache ``self.cached_locations[doc_id][location]``;
      2. the per-expression cache, which points at the first location where
         the same expression string was retrieved;
      3. otherwise the MathML is read through ``MathDocument.find_mathml``,
         passed through ``MathExtractor.isolate_pmml``, decoded to ``str``
         if it came back as bytes, and stored in both caches.

    An (initially empty) per-document dict is always registered for
    ``doc_id``, even when the answer comes from the expression cache.
    """
    if doc_id not in self.cached_locations:
        self.cached_locations[doc_id] = {}

    known_locations = self.cached_locations[doc_id]
    if location in known_locations:
        return known_locations[location]

    # first time this location is requested; maybe the expression is known
    if expression in self.cached_expressions:
        first_doc_id, first_location = self.cached_expressions[expression]
        return self.cached_locations[first_doc_id][first_location]

    # never seen before: read it from the collection
    finder = MathDocument(Control(self.control_filename))
    pmml = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
    if isinstance(pmml, bytes):
        pmml = pmml.decode('UTF-8')

    # remember it under both keys
    self.cached_locations[doc_id][location] = pmml
    self.cached_expressions[expression] = (doc_id, location)

    return pmml
def get(self, doc_id, location, expression, force_update=False):
    """Return the MathML stored for ``expression`` at ``(doc_id, location)``.

    Unless ``force_update`` is true, the per-location cache is consulted
    first and then the per-expression cache (which points at the first
    location where the same expression string was retrieved). On a miss, or
    whenever ``force_update`` is true, the MathML is read through
    ``MathDocument.find_mathml``, passed through
    ``MathExtractor.isolate_pmml``, decoded to ``str`` if it is bytes, and
    written to both caches.

    A per-document dict is always registered for ``doc_id``.
    """
    if doc_id not in self.cached_locations:
        self.cached_locations[doc_id] = {}

    if not force_update:
        known_locations = self.cached_locations[doc_id]
        if location in known_locations:
            return known_locations[location]

        # location is new, but the expression may have been seen elsewhere
        if expression in self.cached_expressions:
            first_doc_id, first_location = self.cached_expressions[expression]
            return self.cached_locations[first_doc_id][first_location]

    # cache miss (or forced refresh): read it from the collection
    finder = MathDocument(Control(self.control_filename))
    pmml = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
    if isinstance(pmml, bytes):
        pmml = pmml.decode('UTF-8')

    # remember it under both keys
    self.cached_locations[doc_id][location] = pmml
    self.cached_expressions[expression] = (doc_id, location)

    return pmml
def math_indexer_task(pargs) -> (str, list):
    """
    Index one chunk of the collection: read every file listed in the chunk's
    mapping file and add all symbol trees found to the index in one call.

    Files that raise while being read are reported on stderr and recorded in
    ``combined_stats.problem_files`` under the exception text; they do not
    abort the chunk.

    :param pargs: tuple ``(math_index, cntl, chunkid)``
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs

    stats = Stats()
    (chunk_size, mappings) = MathDocument(cntl).read_mapping_file(chunkid)
    stats.num_documents += len(mappings)

    # document ids are numbered consecutively across chunks
    first_doc_id = chunkid * chunk_size

    collected = []  # trees are kept in the order they are produced
    for doc_id, filename in enumerate(mappings, start=first_doc_id):
        try:
            trees = read_file(filename, doc_id,
                              missing_tags=stats.missing_tags,
                              problem_files=stats.problem_files)
            for tree in trees:
                stats.num_expressions += 1
                # pairs are not stored here -- they are created in the C++ module
                collected.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason,
                  file=sys.stderr)
            failures = stats.problem_files
            if reason not in failures:
                failures[reason] = set()
            failures[reason].add(doc_id)

    fileid = math_index.add(collected)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, stats
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    The file is loaded with ``MathDocument.read_doc_file`` and dispatched on
    the extension it reports: ``.tex`` goes to
    ``MathExtractor.parse_from_tex`` (result wrapped in a one-element list),
    ``.xhtml``/``.mathml``/``.mml``/``.html`` go to
    ``MathExtractor.parse_from_xml``. Any other extension is reported on
    stdout and yields an empty list.

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: document id passed on to the parser
    :param missing_tags: passed through to ``parse_from_xml``
    :type  problem_files: dict or None
    :param problem_files: optional mapping reason -> set; unknown file types
                          are recorded under ``"unknown_filetype"``

    :rtype: list(SymbolTree)
    :return list of Symbol trees found in the file
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        return [MathExtractor.parse_from_tex(content, file_id)]

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content, file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)

    # problem_files defaults to None, so only record when a mapping was given
    # (previously this branch raised AttributeError with the default).
    if problem_files is not None:
        problem_files["unknown_filetype"] = problem_files.get(
            "unknown_filetype", set())
        problem_files["unknown_filetype"].add(filename)
    print('Unknown filetype %s for %s' % (ext, filename))
    return []
def read_file(filename, file_id, missing_tags=None, problem_files=None):
    """
    Read file for parsing

    The file is loaded with ``MathDocument.read_doc_file`` and dispatched on
    the extension it reports: ``.tex`` goes to
    ``MathExtractor.parse_from_tex`` (result wrapped in a one-element list),
    ``.xhtml``/``.mathml``/``.mml``/``.html`` go to
    ``MathExtractor.parse_from_xml``. Any other extension is reported on
    stdout and yields an empty list.

    :type  filename: string
    :param filename: file to be parsed
    :param file_id: document id passed on to the parser
    :param missing_tags: passed through to ``parse_from_xml``
    :type  problem_files: dict or None
    :param problem_files: optional mapping reason -> set; unknown file types
                          are recorded under ``"unknown_filetype"``

    :rtype: list(SymbolTree)
    :return list of Symbol trees found in the file
    """
    (ext, content) = MathDocument.read_doc_file(filename)

    if ext == '.tex':
        return [MathExtractor.parse_from_tex(content, file_id)]

    if ext in {'.xhtml', '.mathml', '.mml', '.html'}:
        return MathExtractor.parse_from_xml(content, file_id,
                                            missing_tags=missing_tags,
                                            problem_files=problem_files)

    # The default for problem_files is None; calling .get() on it used to
    # raise AttributeError, so the bookkeeping is skipped in that case.
    if problem_files is not None:
        problem_files["unknown_filetype"] = problem_files.get(
            "unknown_filetype", set())
        problem_files["unknown_filetype"].add(filename)
    print('Unknown filetype %s for %s' % (ext, filename))
    return []
def math_indexer_task(pargs) -> (str, list):
    """
    Index one chunk of the collection: read every file listed in the chunk's
    mapping file and add all symbol trees found to the index in one call.

    Per tree, ``num_expressions`` grows by one and ``global_expressions`` by
    the length of the tree's ``position`` attribute. Files that raise while
    being read are reported on stderr and recorded in
    ``combined_stats.problem_files`` under the exception text.

    :param pargs: tuple ``(math_index, cntl, chunkid)``
    :return: (fileid, combined_stats)
    """
    math_index, cntl, chunkid = pargs

    stats = Stats()
    (chunk_size, mappings) = MathDocument(cntl).read_mapping_file(chunkid)
    stats.num_documents += len(mappings)

    # document ids are numbered consecutively across chunks
    first_doc_id = chunkid * chunk_size

    collected = []  # trees are kept in the order they are produced
    for doc_id, filename in enumerate(mappings, start=first_doc_id):
        try:
            trees = read_file(filename, doc_id,
                              missing_tags=stats.missing_tags,
                              problem_files=stats.problem_files)
            for tree in trees:
                stats.num_expressions += 1
                stats.global_expressions += len(tree.position)
                # pairs are not stored here -- they are created in the C++ module
                collected.append(tree)
        except Exception as err:
            reason = str(err)
            print("Failed to process document " + filename + ": " + reason,
                  file=sys.stderr)
            failures = stats.problem_files
            if reason not in failures:
                failures[reason] = set()
            failures[reason].add(doc_id)

    fileid = math_index.add(collected)
    print("%s is done saving to database %s" % (chunkid, fileid), flush=True)
    return fileid, stats
def find_formula_ids(tsv_results, control_filename, mathids_cache):
    """Collect the formula id of every retrieved (doc, loc) pair.

    For each query key in ``tsv_results`` every ``(doc, loc)`` entry of its
    ``"results"`` list is resolved with ``mathids_cache.get_mathid`` and the
    returned id is appended to the query's ``"math_ids"`` list.

    :param tsv_results: mapping of query offset -> dict holding ``"results"``
        (sequence of ``(doc, loc)`` pairs) and ``"math_ids"`` (list, mutated)
    :param control_filename: path handed to ``Control``
    :param mathids_cache: object providing ``get_mathid(finder, doc, loc)``
    """
    finder = MathDocument(Control(control_filename))

    for query_offset in tsv_results:
        print("Processing Query: " + str(query_offset))

        entry = tsv_results[query_offset]
        hits = entry["results"]
        total_locs = len(hits)

        for position, (doc, loc) in enumerate(hits, start=1):
            math_id = mathids_cache.get_mathid(finder, doc, loc)
            entry["math_ids"].append(math_id)

            # progress report after every 25th result, rewritten in place
            if position % 25 == 0:
                print("... done " + str(position) + " of " + str(total_locs),
                      end="\r")
def main():
    """Re-rank retrieval results read from a TSV file and store the new ranking.

    Command line: ``control input_results metric output_results`` followed by
    optional ``-w window``, ``-h html_prefix``, ``-c condition``, ``-s stats``,
    ``-t times`` and ``-k max_results`` values (decoded by
    ``optional_parameters``). ``metric`` must be an integer in -1..11; -1
    keeps the original scores.

    Input rows are dispatched on the first character of the first field and
    on the field count: ``Q`` (query name), ``E`` (query expression), ``C``
    (constraints), ``I`` (per-query counters/timings) and ``R`` (result).

    Prints a message and returns ``None`` on invalid arguments.
    """
    if len(sys.argv) < 5:
        print("Usage")
        print("\tpython3 rerank_results.py control input_results metric output_results")
        print("")
        print("Where:")
        print("\tcontrol:\tPath to tangent control file")
        print("\tinput_results:\tPath to file with results to re-rank")
        # the accepted range is -1..11 (see the validation below)
        print("\tmetric:\t\tSimilarity metric to use [-1 to 11]")
        print("\toutput_results:\tPath to file where re-ranked results will be stored")
        print("")
        print("Optional:")
        print("\t-w\twindow\t\t: Window for pair generation")
        print("\t-h\thtml_prefix\t: Prefix for HTML output (requires dot)")
        print("\t-c\tcondition\t: Current test condition")
        print("\t-s\tstats\t\t: File to store stats")
        print("\t-t\ttimes\t\t: File to accumulate time stats")
        print("\t-k\tmax_results\t: K number of results to rerank as maximum")
        return

    control_filename = sys.argv[1]
    input_filename = sys.argv[2]

    try:
        metric = int(sys.argv[3])
    except ValueError:
        print("Invalid similarity metric function")
        return
    if metric < -1 or metric > 11:
        print("Invalid similarity metric function")
        return

    output_filename = sys.argv[4]
    optional_params = optional_parameters(sys.argv[5:])

    # load control file
    control = Control(control_filename)  # control file name (after indexing)
    math_doc = MathDocument(control)

    if "w" in optional_params:
        try:
            window = int(optional_params["w"])
        except (TypeError, ValueError):
            print("Invalid value for window")
            return
        if window <= 0:
            print("Invalid window")
            return
    else:
        window = int(control.read("window"))

    if "h" in optional_params:
        html_prefix = optional_params["h"]
        if not os.path.isdir(html_prefix):
            os.makedirs(html_prefix)
    else:
        html_prefix = None

    if "c" in optional_params:
        condition = optional_params["c"]
        print("testing condition: " + condition)
    else:
        condition = "undefined"

    stats_file = optional_params["s"] if "s" in optional_params else None

    if "k" in optional_params:
        try:
            max_k = int(optional_params["k"])
        except (TypeError, ValueError):
            print("Invalid max_results parameter")
            return
    else:
        max_k = 0

    times_file = optional_params["t"] if "t" in optional_params else None

    with open(input_filename, 'r', newline='', encoding='utf-8') as in_file:
        reader = csv.reader(in_file, delimiter='\t', lineterminator='\n',
                            quoting=csv.QUOTE_NONE, escapechar="\\")
        lines = list(reader)

    mathml_cache_file = control_filename + ".retrieval_2.cache"
    if not os.path.exists(mathml_cache_file):
        mathml_cache = MathMLCache(control_filename)
    else:
        # NOTE(security): unpickling runs arbitrary code; this file must only
        # ever be a cache written by this program.
        with open(mathml_cache_file, "rb") as cache_file:
            mathml_cache = pickle.load(cache_file)

    current_query = None
    current_name = None
    current_tuple_retrieval_time = 'undefined'
    all_queries = []

    # read all results to re-rank
    for idx, parts in enumerate(lines):
        field_count = len(parts)
        # first character of the record type; "" for empty rows/fields so a
        # malformed row is skipped instead of raising IndexError
        tag = parts[0][:1] if parts else ""

        if field_count == 2:
            if tag == "Q":
                current_name = parts[1]
                current_query = None
            elif tag == "E":
                if current_name is None:
                    print("invalid expression at " + str(idx) +
                          ": query name expected first")
                else:
                    query_expression = parts[1]

                    if html_prefix is not None:
                        # The numeric suffix of the query name is only needed
                        # as cache key, so it is parsed only for HTML output.
                        query_offset = int(current_name.split("-")[-1]) - 1
                        mathml = mathml_cache.get(-1, query_offset,
                                                  query_expression, True)

                        # create empty directories for this query ...
                        query_dir = html_prefix + "/" + current_name
                        if not os.path.isdir(query_dir):
                            os.makedirs(query_dir)
                        if not os.path.isdir(query_dir + "/images"):
                            os.makedirs(query_dir + "/images")
                    else:
                        mathml = None

                    current_query = Query(current_name, query_expression,
                                          mathml, current_tuple_retrieval_time,
                                          max_k)
                    current_name = None
                    all_queries.append(current_query)

                    print("Query: " + current_query.name + ": " +
                          current_query.expression)
            elif tag == "C":
                if current_query is None:
                    print("invalid constraint at " + str(idx) +
                          ": query expression expected first")
                else:
                    # create a constraint tree
                    current_query.set_constraints(parts[1])

        # RZ: Record tuple-based retrieval time and other metrics.
        elif field_count == 3:
            if tag == "I" and current_query is not None:
                if parts[1] == "qt":
                    current_query.initRetrievalTime = float(parts[2])
                elif parts[1] == "post":
                    current_query.postings = int(parts[2])
                elif parts[1] == "expr":
                    current_query.matchedFormulae = int(parts[2])
                elif parts[1] == "doc":
                    current_query.matchedDocs = int(parts[2])

        elif field_count == 5:
            if tag == "R":
                doc_id = int(parts[1])
                location = int(parts[2])
                doc_name = math_doc.find_doc_file(doc_id)
                expression = parts[3]
                score = float(parts[4])

                if html_prefix is not None:
                    mathml = mathml_cache.get(doc_id, location, expression)
                else:
                    mathml = None

                if current_query is None:
                    print("Error: result listed before a query, line " + str(idx))
                else:
                    current_query.add_result(doc_id, doc_name, location,
                                             expression, score, mathml)

    with open(mathml_cache_file, "wb") as cache_file:
        pickle.dump(mathml_cache, cache_file, pickle.HIGHEST_PROTOCOL)

    # now, re-rank...
    print("Results loaded, reranking ...")

    # compute similarity first...
    start_time = time.time()
    for query in all_queries:
        query_start_time = time.time() * 1000  # RZ: ms

        for exp_result in query.results:
            result = query.results[exp_result]

            scores = [0.0]
            if metric == -1:
                # bypass mode, generate HTML for original core ranking
                scores = [result.original_score]
                matched_c = {}
            elif metric == 0:
                # same as original based on f-measure of matched pairs..
                pairs_query = query.tree.root.get_pairs("", window)
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v00(
                    pairs_query, pairs_candidate)
            elif metric == 1:
                # based on testing of alignments....
                scores, matched_q, matched_c = similarity_v01(
                    query.tree, result.tree)
            elif metric == 2:
                # Same as 0 but limiting to matching total symbols first...
                pairs_query = query.tree.root.get_pairs("", window)
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v02(
                    pairs_query, pairs_candidate)
            elif metric == 3:
                # modified version of 2 which performs unification....
                # (the query pairs were never computed in this branch before,
                # which raised NameError)
                pairs_query = query.tree.root.get_pairs("", window)
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c, unified_c = similarity_v03(
                    pairs_query, pairs_candidate)
                result.set_unified_elements(unified_c)
            elif metric == 5:
                # modified version of 4 which allows multiple sub matches
                sim_res = similarity_v05(query.tree, result.tree,
                                         query.constraints)
                scores, matched_q, matched_c, unified_c, wildcard_c = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
            else:
                # Metrics 4 and 6-11 share one call signature and one
                # six-element result, so only the function differs.
                if metric == 4:
                    # modified version of 1 which performs unification ...
                    sim_res = similarity_v04(query.tree, result.tree,
                                             query.constraints)
                elif metric == 6:
                    # version of 4 with subtree matches for wildcards
                    # (partial support)
                    sim_res = similarity_v06(query.tree, result.tree,
                                             query.constraints)
                elif metric == 7:
                    sim_res = similarity_v07(query.tree, result.tree,
                                             query.constraints)
                elif metric == 8:
                    sim_res = similarity_v08(query.tree, result.tree,
                                             query.constraints)
                elif metric == 9:
                    sim_res = similarity_v09(query.tree, result.tree,
                                             query.constraints)
                elif metric == 10:
                    sim_res = similarity_v10(query.tree, result.tree,
                                             query.constraints)
                else:
                    # metric == 11: matching of metric 06 with scores from
                    # metric 04 (MSS)
                    sim_res = similarity_v11(query.tree, result.tree,
                                             query.constraints)

                (scores, matched_q, matched_c,
                 unified_c, wildcard_c, unified) = sim_res
                result.set_unified_elements(unified_c)
                result.set_wildcard_matches(wildcard_c)
                result.set_all_unified(unified)

            result.set_matched_elements(matched_c)
            result.new_scores = scores

        query_end_time = time.time() * 1000  # RZ: ms

        # re-rank based on new score(s)
        query.sort_results()
        query.sort_documents()
        query.elapsed_time = query_end_time - query_start_time

    end_time = time.time()
    elapsed = end_time - start_time
    print("Elapsed Time Ranking: " + str(elapsed) + "s")

    # now, store the re-ranked results...
    with open(output_filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.writer(out_file, delimiter='\t', lineterminator='\n',
                                quoting=csv.QUOTE_NONE, escapechar="\\")
        for query in all_queries:
            csv_writer.writerow([])
            query.output_query(csv_writer)
            query.output_sorted_results(csv_writer)

            if html_prefix is not None:
                print("Saving " + query.name + " to HTML file.....")
                query.save_html(html_prefix + "/" + query.name)

    # if stats file is requested ...
    if stats_file is not None:
        with open(stats_file, "w") as out_file:
            out_file.write(Query.stats_header("\t"))
            for query in all_queries:
                query.output_stats(out_file, "\t", condition)

    # if times file is requested ...
    if times_file is not None:
        # Sort on the name only: sorting (name, query) tuples would compare
        # Query objects whenever two names are equal.
        sorted_queries = sorted(all_queries, key=lambda q: q.name.strip())

        # the header row is only written when the file is first created
        write_header = not os.path.exists(times_file)
        with open(times_file, "a") as out_file:
            if write_header:
                header = "condition," + ",".join(
                    [query.name.strip() for query in sorted_queries])
                out_file.write(header + "\n")

            line = condition
            for query in sorted_queries:
                line += "," + str(query.elapsed_time)
            out_file.write(line + "\n")

    print("Finished successfully")
def output_query(self, out_file, cntl, topk, query_time_ms):
    """Write the query header and exactly ``topk`` result rows to ``out_file``.

    The header is a blank line followed by ``QUERY<TAB>name<TAB>time``. When
    the query has no sorted documents nothing else is written. Otherwise the
    sorted documents are cycled through until ``topk`` rows exist; repeated
    rows carry the final score of the last sorted document. Each row holds
    rank, score, document name, one ``*M*`` triple per formula position and,
    when a text query is present, one ``*W*`` triple per keyword.

    :param out_file: object with a ``write`` method
    :param cntl: control object handed to ``MathDocument``
    :param topk: number of rows to emit
    :param query_time_ms: value printed in the header
    """
    out_file.write("\n")
    out_file.write("QUERY\t" + self.name + "\t" + str(query_time_ms) + "\n")

    doc_count = len(self.sorted_docs)
    if doc_count == 0:
        # no results? nothing can be output
        return

    # (expression "E" rows and keyword "P" rows are intentionally not emitted)

    finder = MathDocument(cntl)
    min_score = self.sorted_docs[doc_count - 1].final_score

    if len(self.sorted_docs) < topk:
        print("Warning: Query produced less than " + str(topk) +
              " documents. Results will be repeated", flush=True)

    # force output topk results
    for idx in range(topk):
        doc = self.sorted_docs[idx % len(self.sorted_docs)]
        positions = self.get_math_pos_with_score(doc)

        try:
            exprids = [(finder.find_mathml_id(doc.doc_id, pos[0]), pos[1])
                       for pos in positions]
        except (IOError):
            # cannot read ids
            exprids = positions

        # repeated documents get the lowest score instead of their own
        doc_score = doc.final_score if idx < len(self.sorted_docs) else min_score
        if isinstance(doc_score, list):
            doc_score = doc_score[0]

        row_elements = [str(idx + 1), str(doc_score), doc.doc_name]

        # add formulas *M*
        for exprid, mscore in exprids:
            row_elements.extend(["*M*", str(exprid), str(mscore[0])])

        # add Keywords *W*
        if self.tquery:
            for keyword in self.tquery.keywords:
                row_elements.extend(["*W*", keyword, str(doc.tscore[1])])

        out_file.write("\t".join(row_elements) + "\n")
__author__ = 'FWTompa'


def parse_docid_sets(lines):
    """Flatten lines such as ``{23145, 31242, 125}`` into a list of ints.

    Braces and surrounding whitespace are stripped from each line and the
    remainder is split on commas. Blank lines, empty sets (``{}``) and empty
    tokens are skipped instead of producing an ``int('')`` failure, and the
    space after a comma is optional.

    :param lines: iterable of text lines
    :return: list of document ids in input order
    :raises ValueError: if a non-empty token is not an integer
    """
    doc_ids = []
    for raw in lines:
        for token in raw.strip("{} \t\r\n").split(","):
            token = token.strip()
            if token:
                doc_ids.append(int(token))
    return doc_ids


if __name__ == '__main__':

    if sys.stdout.encoding != 'utf8':
        sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer, 'strict')
    if sys.stderr.encoding != 'utf8':
        sys.stderr = codecs.getwriter('utf8')(sys.stderr.buffer, 'strict')

    if len(argv) != 4 or argv[1] == "help":
        print("Use: python docids2doclist.py <cntl> <doc#s> <filelist>")
        print("        where doc#s is a file in which each line is a set of docids")
        print("        such as {23145, 31242, 125}")
        sys.exit()

    cntl = Control(argv[1])  # control file name (after indexing)
    md = MathDocument(cntl)

    # read every set of document ids from the input file
    with open(argv[2], 'r', encoding='utf-8') as fin:
        doclist = parse_docid_sets(fin)

    # write one document file name per id
    with open(argv[3], 'w', encoding='utf-8') as fout:
        for doc_id in doclist:
            fout.write(md.find_doc_file(doc_id) + "\n")

    print("Created list of %d file names" % len(doclist))
def main():
    """Re-rank retrieval results read from a TSV file and store the new ranking.

    Command line: ``control input_results metric output_results`` followed by
    optional ``-w window``, ``-h html_prefix``, ``-c condition``, ``-s stats``
    and ``-t times`` values (decoded by ``optional_parameters``). ``metric``
    must be an integer in 0..5.

    Input lines are tab-separated and dispatched on the first character of
    the first field and on the field count: ``Q`` (query name), ``E`` (query
    expression), ``C`` (constraints), ``I`` (per-query counters/timings) and
    ``R`` (result).

    Prints a message and returns ``None`` on invalid arguments.
    """
    if len(sys.argv) < 5:
        print("Usage")
        print("\tpython3 rerank_results.py control input_results metric output_results")
        print("")
        print("Where:")
        print("\tcontrol:\tPath to tangent control file")
        print("\tinput_results:\tPath to file with results to re-rank")
        # the accepted range is 0..5 (see the validation below)
        print("\tmetric:\t\tSimilarity metric to use [0-5]")
        print("\toutput_results:\tPath to file where re-ranked results will be stored")
        print("")
        print("Optional:")
        print("\t-w\twindow\t\t: Window for pair generation")
        print("\t-h\thtml_prefix\t: Prefix for HTML output (requires dot)")
        print("\t-c\tcondition\t: Current test condition")
        print("\t-s\tstats\t\t: File to store stats")
        print("\t-t\ttimes\t\t: File to accumulate time stats")
        return

    control_filename = sys.argv[1]
    input_filename = sys.argv[2]

    try:
        metric = int(sys.argv[3])
    except ValueError:
        print("Invalid similarity metric function")
        return
    if metric < 0 or metric > 5:
        print("Invalid similarity metric function")
        return

    output_filename = sys.argv[4]
    optional_params = optional_parameters(sys.argv[5:])

    # load control file
    control = Control(control_filename)  # control file name (after indexing)
    math_doc = MathDocument(control)

    if "w" in optional_params:
        try:
            window = int(optional_params["w"])
        except (TypeError, ValueError):
            print("Invalid value for window")
            return
        if window <= 0:
            print("Invalid window")
            return
    else:
        window = int(control.read("window"))

    if "h" in optional_params:
        html_prefix = optional_params["h"]
        if not os.path.isdir(html_prefix):
            os.makedirs(html_prefix)
        if not os.path.isdir(html_prefix + "/images"):
            os.makedirs(html_prefix + "/images")
    else:
        html_prefix = None

    if "c" in optional_params:
        condition = optional_params["c"]
        print("testing condition: " + condition)
    else:
        condition = "undefined"

    stats_file = optional_params["s"] if "s" in optional_params else None
    times_file = optional_params["t"] if "t" in optional_params else None

    with open(input_filename, 'r', encoding="utf-8") as in_file:
        lines = in_file.readlines()

    mathml_cache_file = control_filename + ".retrieval_2.cache"
    if not os.path.exists(mathml_cache_file):
        mathml_cache = MathMLCache(control_filename)
    else:
        # NOTE(security): unpickling runs arbitrary code; this file must only
        # ever be a cache written by this program.
        with open(mathml_cache_file, "rb") as cache_file:
            mathml_cache = pickle.load(cache_file)

    current_query = None
    current_name = None
    current_tuple_retrieval_time = 'undefined'
    all_queries = []

    # read all results to re-rank
    for idx, line in enumerate(lines):
        parts = line.strip().split("\t")
        field_count = len(parts)
        tag = parts[0][:1]  # record type; "" for blank lines

        if field_count == 2:
            if tag == "Q":
                current_name = parts[1]
                current_query = None
            elif tag == "E":
                if current_name is None:
                    print("invalid expression at " + str(idx) +
                          ": query name expected first")
                else:
                    query_expression = parts[1]

                    if html_prefix is not None:
                        # queries are cached under doc id -1, keyed by the
                        # number of queries read so far
                        mathml = mathml_cache.get(-1, len(all_queries),
                                                  query_expression)
                    else:
                        mathml = None

                    current_query = Query(current_name, query_expression,
                                          mathml, current_tuple_retrieval_time)
                    current_name = None
                    all_queries.append(current_query)

                    print("Query: " + current_query.name + ": " +
                          current_query.expression, flush=True)
            elif tag == "C":
                if current_query is None:
                    print("invalid constraint at " + str(idx) +
                          ": query expression expected first")
                else:
                    # create a constraint tree
                    current_query.set_constraints(parts[1])

        # RZ: Record tuple-based retrieval time and other metrics.
        elif field_count == 3:
            if tag == "I" and current_query is not None:
                if parts[1] == "qt":
                    current_query.initRetrievalTime = float(parts[2])
                elif parts[1] == "post":
                    current_query.postings = int(parts[2])
                elif parts[1] == "expr":
                    current_query.matchedFormulae = int(parts[2])
                elif parts[1] == "doc":
                    current_query.matchedDocs = int(parts[2])

        elif field_count == 5:
            if tag == "R":
                doc_id = int(parts[1])
                location = int(parts[2])
                doc_name = math_doc.find_doc_file(doc_id)
                expression = parts[3]
                score = float(parts[4])

                if html_prefix is not None:
                    mathml = mathml_cache.get(doc_id, location, expression)
                else:
                    mathml = None

                if current_query is None:
                    print("Error: result listed before a query, line " + str(idx))
                else:
                    current_query.add_result(doc_id, doc_name, location,
                                             expression, score, mathml)

    with open(mathml_cache_file, "wb") as cache_file:
        pickle.dump(mathml_cache, cache_file, pickle.HIGHEST_PROTOCOL)

    # now, re-rank...
    # compute similarity first...
    start_time = time.time()
    for query in all_queries:
        pairs_query = query.tree.root.get_pairs("", window)

        query_start_time = time.time() * 1000  # RZ: ms
        for exp_result in query.results:
            result = query.results[exp_result]

            scores = [0.0]
            if metric == 0:
                # same as original based on f-measure of matched pairs...
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v00(
                    pairs_query, pairs_candidate)
            elif metric == 1:
                # based on testing of alignments....
                scores, matched_q, matched_c = similarity_v01(
                    query.tree, result.tree)
            elif metric == 2:
                # Same as 0 but limiting to matching total symbols first...
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c = similarity_v02(
                    pairs_query, pairs_candidate)
            elif metric == 3:
                # modified version of 2 which performs unification....
                pairs_candidate = result.tree.root.get_pairs("", window)
                scores, matched_q, matched_c, unified_c = similarity_v03(
                    pairs_query, pairs_candidate)
                result.set_unified_elements(unified_c)
            elif metric == 4:
                # modified version of 1 which performs unification ...
                scores, matched_q, matched_c, unified_c = similarity_v04(
                    query.tree, result.tree, query.constraints)
                result.set_unified_elements(unified_c)
            elif metric == 5:
                # modified version of 4 which allows multiple sub matches
                scores, matched_q, matched_c, unified_c = similarity_v05(
                    query.tree, result.tree, query.constraints)
                result.set_unified_elements(unified_c)

            result.set_matched_elements(matched_c)
            result.new_scores = scores

        query_end_time = time.time() * 1000  # RZ: ms

        # re-rank based on new score(s)
        query.sort_results()
        query.sort_documents()
        query.elapsed_time = query_end_time - query_start_time

    end_time = time.time()
    elapsed = end_time - start_time
    print("Elapsed Time Ranking: " + str(elapsed) + "s")

    # now, store the re-ranked results...
    # The input is decoded as UTF-8, so the output is encoded the same way
    # rather than with whatever the locale default happens to be.
    with open(output_filename, "w", encoding="utf-8") as out_file:
        for query in all_queries:
            out_file.write("\n")
            query.output_query(out_file)
            query.output_sorted_results(out_file)

            if html_prefix is not None:
                print("Saving " + query.name + " to HTML file.....")
                query.save_html(html_prefix)

    # if stats file is requested ...
    if stats_file is not None:
        with open(stats_file, "w") as out_file:
            out_file.write(Query.stats_header("\t"))
            for query in all_queries:
                query.output_stats(out_file, "\t", condition)

    # if times file is requested ...
    if times_file is not None:
        # Sort on the name only: sorting (name, query) tuples would compare
        # Query objects whenever two names are equal.
        sorted_queries = sorted(all_queries, key=lambda q: q.name.strip())

        # the header row is only written when the file is first created
        write_header = not os.path.exists(times_file)
        with open(times_file, "a") as out_file:
            if write_header:
                header = "condition," + ",".join(
                    [query.name.strip() for query in sorted_queries])
                out_file.write(header + "\n")

            line = condition
            for query in sorted_queries:
                line += "," + str(query.elapsed_time)
            out_file.write(line + "\n")

    print("Finished successfully")
from sys import argv

from tangent.math.mathdocument import MathDocument
from tangent.utility.control import Control

# Command-line helper: argv[1] is the document number and argv[2] the
# position number; the control file is always ./tangent.cntl.
cntl = Control('./tangent.cntl')
d = MathDocument(cntl)

# name of the file that holds the document
doc_file = d.find_doc_file(int(argv[1]))
print(doc_file)

# MathML found at that document / position
mathml = d.find_mathml(int(argv[1]), int(argv[2]))
print(mathml)
print(" text_results\\t<file with results from text search engine>") print(" combined_results\\t<file to store combined results>") print(" combine_math\\t{'rerank' | 'average'} (mechanism for combining math results)") print(" mweight\\t0..100 (percentage of weight on formula matches)") print("and may optionally include:") print(" run\\t<arbitrary name for query run>") print("as well as other pairs.") print("") print("Optional additional command line parameters:") print("\t-w\twindow\t\t: Window for pair generation") exit() #load control file control = Control(sys.argv[1]) # control file name (after indexing) math_doc = MathDocument(control) minput_filename = control.read("math_results") tinput_filename = control.read("text_results") combiner = control.read("combine_math") mweight = control.read("mweight",num=True,default=70) output_filename = control.read("combined_results") optional_params = optional_parameters(sys.argv[2:]) window = control.read("window",num=True,default=1) if "w" in optional_params: try: w = int(optional_params["w"]) if w <= 0:
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'

# Command-line tool: print the file name of a document and the MathML stored
# at one expression position of it.

if __name__ == '__main__':

    if sys.stdout.encoding != 'utf8':
        sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer, 'strict')
    if sys.stderr.encoding != 'utf8':
        sys.stderr = codecs.getwriter('utf8')(sys.stderr.buffer, 'strict')

    if len(argv) != 4 or argv[1] == "help":
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print("        where (doc# < 0) => use queryfile")
        sys.exit()

    cntl = Control(argv[1])  # control file name (after indexing)
    d = MathDocument(cntl)
    docno = int(argv[2])
    exprno = int(argv[3])

    # print document file name; str() keeps a missing name (non-string
    # result from find_doc_file) from raising TypeError before the MathML
    # is printed
    print("doc " + argv[2] + ": " + str(d.find_doc_file(docno)))
    print(d.find_mathml(docno, exprno))  # doc_num and pos_num
def get(self, fileid):
    """
    ingest result tuples for topk responses to queries

    When ``self.runmode`` is ``"now"`` the tuples come from ``self.reader``;
    otherwise they are read from ``<db>_r_<fileid>.tsv`` in
    ``self.directory``, and that file is closed before returning.

    :param fileid: process id used to distinguish files
    :type  fileid: string
    :return: query responses
    :rtype:  dict mapping query_name -> CompQuery()

    Q   queryID
    E   search-expr
    R   docID   position    expression  score
    R   docID   position    expression  score
    ...
    Q   queryID
    ...
    X
    """
    source = None
    if self.runmode == "now":
        reader = self.reader
    else:
        filename = "%s_r_%s.tsv" % (self.db, fileid)
        file_path = os.path.join(self.directory, filename)
        source = open(file_path, mode='r', encoding='utf-8', newline='')
        reader = csv.reader(source, delimiter='\t', lineterminator='\n',
                            quoting=csv.QUOTE_NONE, escapechar="\\")

    try:
        print("Reading from math engine")
        doc_list = MathDocument(self.cntl)
        all_queries = {}

        # Start with "nothing seen yet" so that an E/I/R tuple arriving
        # before the first Q tuple is reported instead of raising
        # UnboundLocalError.
        current_name = None
        current_query = None
        current_expr = None

        for line in reader:
            if not line:
                continue

            kind = line[0]
            if kind == "Q":
                current_name = line[1]
                # the same query name may appear several times; reuse it
                current_query = all_queries.get(current_name)
                if current_query is None:
                    current_query = CompQuery(current_name)
                    all_queries[current_name] = current_query
                current_expr = None
            elif kind == "E":
                if current_name is None:
                    print("Invalid expression: Q tuple with query name expected first: " +
                          str(line), flush=True)
                else:
                    query_expression = line[1]
                    current_expr = Query(current_name, query_expression)
                    current_query.add_expr(current_expr)
            elif kind == "C":
                # line is a list of fields, so it must be converted before
                # being concatenated to the message
                print("Constraint ignored: " + str(line))
            elif kind == "I":
                if current_name is None or current_expr is None:
                    print("Invalid information: Q tuple with query name and E tuple with expression expected first: " +
                          str(line))
                elif line[1] == "qt":
                    current_expr.initRetrievalTime = float(line[2])
                elif line[1] == "post":
                    current_expr.postings = int(line[2])
                elif line[1] == "expr":
                    current_expr.matchedFormulae = int(line[2])
                elif line[1] == "doc":
                    current_expr.matchedDocs = int(line[2])
            elif kind == "R":
                if current_name is None or current_expr is None:
                    print("Invalid result item: Q tuple with query name and E tuple with expression expected first: " +
                          str(line))
                else:
                    doc_id = int(line[1])
                    doc_name = doc_list.find_doc_file(doc_id)
                    if not doc_name:
                        doc_name = "NotADoc"
                    location = int(line[2])
                    expression = line[3]
                    score = float(line[4])
                    current_expr.add_result(doc_id, doc_name, location,
                                            expression, score)
            elif kind == "X":
                # end-of-responses marker
                break
            else:
                print("Ignoring invalid tuple: " + str(line))

        print("Read " + str(len(all_queries)) + " queries")
        return all_queries
    finally:
        # only the file opened here is closed; self.reader is left untouched
        if source is not None:
            source.close()
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'

# Command-line tool: print the MathML stored at one expression position of
# one document.

if __name__ == '__main__':

    # wrap both standard streams in a UTF-8 writer unless they already
    # report the encoding 'utf8'
    for stream_name in ("stdout", "stderr"):
        stream = getattr(sys, stream_name)
        if stream.encoding != 'utf8':
            setattr(sys, stream_name,
                    codecs.getwriter('utf8')(stream.buffer, 'strict'))

    wants_help = len(argv) != 4 or argv[1] == "help"
    if wants_help:
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print("        where (doc# < 0) => use queryfile")
        sys.exit()

    cntl = Control(argv[1])  # control file name (after indexing)
    d = MathDocument(cntl)

    # doc_num and pos_num
    mathml = d.find_mathml(int(argv[2]), int(argv[3]))
    print(mathml)