def get(self, doc_id, location, expression):
    """Return the MathML for ``expression`` found at ``(doc_id, location)``.

    Lookups are answered in this order:

    1. ``self.cached_locations[doc_id][location]`` when that exact location
       has been stored before;
    2. the entry that ``self.cached_expressions`` points to when the same
       expression was already fetched from some other location;
    3. the document collection, read through ``MathDocument.find_mathml``
       and passed through ``MathExtractor.isolate_pmml``.

    Only case 3 writes to the caches: the result is stored under
    ``(doc_id, location)`` and the expression is mapped to that location.

    :param doc_id: key of the document in ``self.cached_locations``.
    :param location: key of the formula within the document.
    :param expression: hashable key identifying the expression.
    :return: the cached or freshly retrieved MathML (``bytes`` results are
        decoded as UTF-8 before being stored and returned).
    """
    # The per-document bucket is created up front, so it exists even when
    # the answer ends up coming from the expression cache below.
    if doc_id not in self.cached_locations:
        self.cached_locations[doc_id] = {}
    by_location = self.cached_locations[doc_id]

    # Exact location already known.
    if location in by_location:
        return by_location[location]

    # Same expression already retrieved elsewhere: reuse that entry.
    if expression in self.cached_expressions:
        first_doc, first_loc = self.cached_expressions[expression]
        return self.cached_locations[first_doc][first_loc]

    # Nothing cached: read it from the collection described by the
    # control file (control file name, after indexing).
    finder = MathDocument(Control(self.control_filename))
    markup = MathExtractor.isolate_pmml(finder.find_mathml(doc_id, location))
    if isinstance(markup, bytes):
        markup = markup.decode('UTF-8')

    # Remember the result under both keys.
    by_location[location] = markup
    self.cached_expressions[expression] = (doc_id, location)
    return markup
def find_formula_ids(tsv_results, control_filename):
    """Append the formula id of every result to its query's ``"math_ids"`` list.

    For each query in ``tsv_results`` and each ``(doc, loc)`` pair in that
    query's ``"results"`` list, the MathML is fetched with
    ``MathDocument.find_mathml``, parsed as XML, and the ``id`` attribute of
    the root element is appended to the query's ``"math_ids"`` list. When the
    root has no ``id`` attribute an error line is printed and the placeholder
    ``"math.error"`` is appended instead.

    Progress is reported on stdout: one line per query and one line after
    every 25th result of a query.

    :param tsv_results: mapping from query offset to a dict holding a
        ``"results"`` list of ``(doc, loc)`` pairs and a ``"math_ids"`` list
        that is extended in place.
    :param control_filename: file name handed to ``Control``.
    """
    finder = MathDocument(Control(control_filename))

    for query_offset in tsv_results:
        print("Processing Query: " + str(query_offset))

        entry = tsv_results[query_offset]
        hits = entry["results"]
        total_locs = len(hits)

        # Count from 1 so the progress message can use the counter directly.
        for done, (doc, loc) in enumerate(hits, start=1):
            mathml = finder.find_mathml(doc, loc)

            # Wrap the string in a file-like object so it can be parsed.
            root = xml.etree.ElementTree.parse(io.StringIO(mathml)).getroot()

            if "id" in root.attrib:
                math_id = root.attrib["id"]
            else:
                print("ERROR: No formula id found for Query " + str(query_offset) +
                      ", doc = " + str(doc) + ", loc = " + str(loc))
                math_id = "math.error"

            entry["math_ids"].append(math_id)

            if done > 1 and done % 25 == 0:
                print("... done " + str(done) + " of " + str(total_locs))
def get(self, doc_id, location, expression, force_update=False):
    """Return the MathML for ``expression`` at ``(doc_id, location)``.

    Unless ``force_update`` is set, the answer comes from
    ``self.cached_locations[doc_id][location]`` when present, otherwise from
    the location that ``self.cached_expressions`` recorded for the same
    expression. If neither cache applies, or ``force_update`` is true, the
    MathML is read through ``MathDocument.find_mathml``, passed through
    ``MathExtractor.isolate_pmml``, and written to both caches, overwriting
    any previous entries.

    :param doc_id: key of the document in ``self.cached_locations``.
    :param location: key of the formula within the document.
    :param expression: hashable key identifying the expression.
    :param force_update: when true, skip both caches and fetch again.
    :return: the cached or freshly retrieved MathML (``bytes`` results are
        decoded as UTF-8 before being stored and returned).
    """
    locations = self.cached_locations

    # The per-document bucket always exists after this call, whichever
    # branch produces the result.
    if doc_id not in locations:
        locations[doc_id] = {}

    if not force_update:
        # Exact location seen before.
        if location in locations[doc_id]:
            return locations[doc_id][location]

        # Expression seen before, but at a different location.
        if expression in self.cached_expressions:
            seen_doc, seen_loc = self.cached_expressions[expression]
            return locations[seen_doc][seen_loc]

    # Cache miss or forced refresh: read from the collection described by
    # the control file (control file name, after indexing).
    control = Control(self.control_filename)
    source = MathDocument(control)
    fetched = MathExtractor.isolate_pmml(source.find_mathml(doc_id, location))
    if isinstance(fetched, bytes):
        fetched = fetched.decode('UTF-8')

    # Save on cache.
    locations[doc_id][location] = fetched
    self.cached_expressions[expression] = (doc_id, location)
    return fetched
"""Print the file name of an indexed document and the MathML of one expression in it.

Use: python get_math.py <cntl> <doc#> <expr#>
"""
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'


def _is_utf8(encoding):
    """Return True when ``encoding`` names UTF-8 under any spelling.

    A stream's ``encoding`` is usually reported as ``'utf-8'`` or ``'UTF-8'``,
    so comparing it to the literal ``'utf8'`` never matches. The name is
    therefore normalised through the codec registry first. A missing or
    unknown encoding counts as "not UTF-8".
    """
    if not encoding:
        return False
    try:
        return codecs.lookup(encoding).name == 'utf-8'
    except LookupError:
        return False


if __name__ == '__main__':
    # Re-wrap the standard streams only when they are not already UTF-8.
    if not _is_utf8(sys.stdout.encoding):
        sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer, 'strict')
    if not _is_utf8(sys.stderr.encoding):
        sys.stderr = codecs.getwriter('utf8')(sys.stderr.buffer, 'strict')

    # Exactly three arguments are required; "help" as the first one also
    # prints the usage text.
    if len(argv) != 4 or argv[1] == "help":
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print(" where (doc# < 0) => use queryfile")
        sys.exit()

    cntl = Control(argv[1])  # control file name (after indexing)
    d = MathDocument(cntl)
    docno = int(argv[2])
    exprno = int(argv[3])
    print("doc " + argv[2] + ": " + d.find_doc_file(docno))  # document file name
    print(d.find_mathml(docno, exprno))  # doc_num and pos_num
"""Print the MathML of one expression of an indexed document.

Use: python get_math.py <cntl> <doc#> <expr#>
"""
import codecs
import sys
from sys import argv

from tangent.utility.control import Control
from tangent.math.mathdocument import MathDocument

__author__ = 'FWTompa'


def _is_utf8(encoding):
    """Return True when ``encoding`` names UTF-8 under any spelling.

    A stream's ``encoding`` is usually reported as ``'utf-8'`` or ``'UTF-8'``,
    so comparing it to the literal ``'utf8'`` never matches. The name is
    therefore normalised through the codec registry first. A missing or
    unknown encoding counts as "not UTF-8".
    """
    if not encoding:
        return False
    try:
        return codecs.lookup(encoding).name == 'utf-8'
    except LookupError:
        return False


if __name__ == '__main__':
    # Re-wrap the standard streams only when they are not already UTF-8.
    if not _is_utf8(sys.stdout.encoding):
        sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer, 'strict')
    if not _is_utf8(sys.stderr.encoding):
        sys.stderr = codecs.getwriter('utf8')(sys.stderr.buffer, 'strict')

    # Exactly three arguments are required; "help" as the first one also
    # prints the usage text.
    if len(argv) != 4 or argv[1] == "help":
        print("Use: python get_math.py <cntl> <doc#> <expr#>")
        print(" where (doc# < 0) => use queryfile")
        sys.exit()

    cntl = Control(argv[1])  # control file name (after indexing)
    d = MathDocument(cntl)
    print(d.find_mathml(int(argv[2]), int(argv[3])))  # doc_num and pos_num