def _get_exported(fncs_list):
    """Return the names in *fncs_list* carrying the configured export prefix,
    with that prefix stripped.

    :param fncs_list: iterable of names (anything ``utils.uni`` accepts).
    :return: list of unicode names without the prefix.
    """
    prefix = env["datasets"]["export_prefix"]
    exported = []
    for m in fncs_list:
        # normalise to unicode once instead of twice per item
        name = utils.uni(m)
        if name.startswith(prefix):
            exported.append(name[len(prefix):])
    return exported
def _get_exported(fncs_list):
    """Return the names in *fncs_list* carrying the configured export prefix,
    with that prefix stripped.

    :param fncs_list: iterable of names (anything ``utils.uni`` accepts).
    :return: list of unicode names without the prefix.
    """
    prefix = env["datasets"]["export_prefix"]
    exported = []
    for m in fncs_list:
        # normalise to unicode once instead of twice per item
        name = utils.uni(m)
        if name.startswith(prefix):
            exported.append(name[len(prefix):])
    return exported
def addWord():
    """Add the request's ``word`` to the best-matching category.

    Reads ``categories`` (JSON list of word lists) and ``word`` from the
    request arguments, asks ``compareToCategory`` for the closest category
    (threshold 20), and either appends the word to that category or starts
    a new one.

    :return: JSON-encoded updated category list.
    """
    global matrix
    inputs = list(json.loads(uni(request.args.get("categories"))))
    if not inputs:
        # fresh session: reset the shared similarity matrix as well
        matrix = []
    w = uni(request.args.get("word"))
    i = compareToCategory(w, inputs, 20, matrix)
    if i == -1:
        # no category close enough - open a new one
        inputs.append([w])
    else:
        inputs[i].append(w)
    return json.dumps(inputs)
def query(self, qfield, qvalue, fields=None, pages_count=10, qf=None):
    """Search the backend for ``qfield:qvalue``.

    :param qfield: field name to query.
    :param qvalue: value; escaped for the backend query syntax before use.
    :param fields: optional list of fields to return (``fl``).
    :param pages_count: maximum number of rows to return.
    :param qf: optional query-fields parameter passed through as ``qf``.
    :return: backend search result, or None when the query fails
        (the failure is logged to both loggers).
    """
    try:
        kwargs = {}
        if fields is not None:
            kwargs["fl"] = ",".join(fields)
        qvalue = _backend._escape_for_string(qvalue)
        kwargs["q"] = u"%s:%s" % (qfield, qvalue)
        if qf is not None:
            kwargs["qf"] = qf
        kwargs["rows"] = pages_count
        return self._backend.search(**kwargs)
    except Exception as e:
        # format once, report to both the normal and the suspicious log
        msg = u"Could not query backend [%s]." % utils.uni(e)
        _logger.exception(msg)
        _logger_suspicious.exception(msg)
def load(self):
    """Load the MovieLens ratings, split them into train/test, and derive
    the latent factors used by the bandit experiment.

    Side effects: sets ``self.A`` (train interaction matrix), ``self.U`` /
    ``self.S`` / ``self.VT`` / ``self.V`` (SVD factors), ``self.ctrh``
    (held-out user histories), ``self.L``, ``self.arms``, ``self.users``
    and squares ``self.d``.
    """
    # train-split coordinates for the sparse interaction matrix
    cox = []
    coy = []
    # held-out (test) history: user -> list of movies
    self.ctrh = {}
    # movie -> rating count (popularity)
    movies = {}
    for user, movie, rate, timestamp in movielens_data():
        # ids in the raw data are 1-based; rate/timestamp are unused here
        user = int(user) - 1
        movie = int(movie) - 1
        if movie in movies:
            movies[movie] += 1
        else:
            movies[movie] = 1
        # Bernoulli split: train_portion goes to the matrix, rest to ctrh
        if np.random.uniform(0, 1) < self.train_portion:
            cox.append(user)
            coy.append(movie)
        else:
            if user in self.ctrh:
                self.ctrh[user].append(movie)
            else:
                self.ctrh[user] = [movie]
    # binary train matrix sized to cover every user/movie seen in either split
    self.A = sparse.coo_matrix((np.ones(len(cox)), (cox, coy)),
                               shape=(max(list(self.ctrh) + cox) + 1,
                                      max(list(movies) + coy) + 1),
                               dtype=np.float32)
    # truncated SVD to d latent dimensions
    self.U, self.S, self.VT = svds(self.A, self.d)
    self.A = self.A.astype(np.int32)
    # NOTE(review): `uni` here presumably normalises each factor row to unit
    # length (not the text helper of the same name elsewhere) - confirm.
    for i in range(self.U.shape[0]):
        self.U[i] = uni(self.U[i])
    self.V = self.VT.T
    for i in range(self.V.shape[0]):
        self.V[i] = uni(self.V[i])
    # L = arm-pool size: all movies unless capped by n_movies
    if self.n_movies is None:
        self.L = len(movies)
    else:
        self.L = self.n_movies
    # arms = the L most-rated movies
    self.arms = set([
        x[1]
        for x in sorted([(movies[movie], movie) for movie in movies])[-self.L:]
    ])
    # keep users whose held-out overlap with the arms lies inside the
    # baseline band, ordered by that overlap
    self.users = [
        x[1]
        for x in sorted([(overlap(self.ctrh[user], self.arms), user)
                         for user in self.ctrh
                         if self.L * self.baseline[0] < overlap(
                             self.ctrh[user], self.arms) < self.L * self.baseline[1]])
    ]
    logging.info('total {0} users involved'.format(len(self.users)))
    # downstream code works in the d*d interaction space
    self.d = self.d**2
def __unicode__(self):
    """Render the address as a comma-separated unicode string.

    Combines postal code and city into one component when a postal code
    is present; falls back to ``unformatted`` when nothing else is set.
    """
    if self.postal_code:
        locality = u'%s %s' % (uni(self.postal_code), uni(self.city))
    else:
        locality = self.city
    components = (self.street, locality, self.county, self.state,
                  self.country)
    # keep only truthy pieces, normalised to unicode
    addr = [uni(piece) for piece in components if piece]
    if addr:
        return u', '.join(addr)
    return self.unformatted or ''
def query(self, qfield, qvalue, fields=None, pages_count=10, qf=None):
    """Search the backend for ``qfield:qvalue``.

    :param qfield: field name to query.
    :param qvalue: value; escaped for the backend query syntax before use.
    :param fields: optional list of fields to return (``fl``).
    :param pages_count: maximum number of rows to return.
    :param qf: optional query-fields parameter passed through as ``qf``.
    :return: backend search result, or None when the query fails
        (the failure is logged to both loggers).
    """
    try:
        kwargs = {}
        if fields is not None:
            kwargs["fl"] = ",".join(fields)
        qvalue = _backend._escape_for_string(qvalue)
        kwargs["q"] = u"%s:%s" % (qfield, qvalue)
        if qf is not None:
            kwargs["qf"] = qf
        kwargs["rows"] = pages_count
        return self._backend.search(**kwargs)
    except Exception as e:
        # format once, report to both the normal and the suspicious log
        msg = u"Could not query backend [%s]." % utils.uni(e)
        _logger.exception(msg)
        _logger_suspicious.exception(msg)
def delete(self, docs_array):
    """Delete the given documents from the backend index.

    :param docs_array: documents to delete.
    :return: True on success, None on failure (the failure is logged).
    """
    try:
        self._backend.delete(docs_array)
        return True
    except Exception as e:
        _logger.warning(u"Could not delete from backend [%s]." % utils.uni(e))
def optimise(self):
    """Optimise (compact) the backend index.

    :return: True on success, None on failure (the failure is logged).
    """
    _logger.info("Optimising index.")
    try:
        self._backend.optimize()
        _logger.info("Optimised.")
        return True
    except Exception as e:
        _logger.warning(u"Could not optimise backend [%s]." % utils.uni(e))
def search(self, query_dict, fields=None, pages_count=10):
    """Run a raw backend search with *query_dict* as keyword arguments.

    :param query_dict: mapping expanded into ``backend.search(**...)``.
    :param fields: unused here; kept for interface compatibility.
    :param pages_count: unused here; kept for interface compatibility.
    :return: backend response, or None on failure (the failure is logged).
    """
    try:
        with adapter.solr_backend(self) as backend:
            resp = backend.search(**query_dict)
            return resp
    except Exception as e:
        _logger.exception(u"Could not query backend [%s]." % utils.uni(e))
def optimise(self):
    """Optimise (compact) the backend index.

    :return: True on success, None on failure (the failure is logged).
    """
    _logger.info("Optimising index.")
    try:
        self._backend.optimize()
        _logger.info("Optimised.")
        return True
    except Exception as e:
        _logger.warning(u"Could not optimise backend [%s]." % utils.uni(e))
def delete(self, docs_array):
    """Delete the given documents from the backend index.

    :param docs_array: documents to delete.
    :return: True on success, None on failure (the failure is logged).
    """
    try:
        self._backend.delete(docs_array)
        return True
    except Exception as e:
        _logger.warning(u"Could not delete from backend [%s]." % utils.uni(e))
def optimise(self, maxSegments=None):
    """Optimise the backend index, optionally down to *maxSegments* segments.

    :param maxSegments: passed through to the backend's optimize call.
    :return: True on success, None on failure (the failure is logged).
    """
    _logger.info("Optimising index.")
    try:
        with adapter.solr_backend(self) as backend:
            backend.optimize(waitSearcher=None, maxSegments=maxSegments)
            _logger.info("Optimised.")
            return True
    except Exception as e:
        _logger.warning(u"Could not optimise backend [%s]." % utils.uni(e))
def optimise(self, maxSegments=None):
    """Optimise the backend index, optionally down to *maxSegments* segments.

    :param maxSegments: passed through to the backend's optimize call.
    :return: True on success, None on failure (the failure is logged).
    """
    _logger.info("Optimising index.")
    try:
        with adapter.solr_backend(self) as backend:
            backend.optimize(waitSearcher=None, maxSegments=maxSegments)
            _logger.info("Optimised.")
            return True
    except Exception as e:
        _logger.warning(u"Could not optimise backend [%s]." % utils.uni(e))
def query_generic(self, fields=None, pages_count=10, **kwargs):
    """Run a generic backend query built from **kwargs.

    :param fields: optional field limit applied to returned documents.
    :param pages_count: number of rows per page.
    :param kwargs: query terms passed to ``backend.query``.
    :return: executed query result, or None on failure (logged).
    """
    try:
        with adapter.solr_backend(self) as backend:
            query = backend.query(**kwargs).paginate(rows=pages_count)
            if fields:
                query = query.field_limit(fields)
            return query.execute()
    except Exception as e:
        _logger.warning(u"Could not query backend [%s]." % utils.uni(e))
def query_generic(self, fields=None, pages_count=10, **kwargs):
    """Run a generic backend query built from **kwargs.

    :param fields: optional field limit applied to returned documents.
    :param pages_count: number of rows per page.
    :param kwargs: query terms passed to ``backend.query``.
    :return: executed query result, or None on failure (logged).
    """
    try:
        with adapter.solr_backend(self) as backend:
            query = backend.query(**kwargs).paginate(rows=pages_count)
            if fields:
                query = query.field_limit(fields)
            return query.execute()
    except Exception as e:
        _logger.warning(u"Could not query backend [%s]." % utils.uni(e))
def commit(self):
    """Commit pending changes to the index.

    :return: True on success, None on failure (the failure is logged).
    """
    # be sure we can write
    _logger.info(u"Trying to commit to index.")
    try:
        _logger.info("Committing to index.")
        self._backend.commit()
        _logger.info("Committed.")
        return True
    except Exception as e:
        _logger.warning(u"Could not commit in backend [%s]." % utils.uni(e))
def commit(self):
    """Commit pending changes to the index.

    :return: True on success, None on failure (the failure is logged).
    """
    # be sure we can write
    _logger.info(u"Trying to commit to index.")
    try:
        _logger.info("Committing to index.")
        self._backend.commit()
        _logger.info("Committed.")
        return True
    except Exception as e:
        _logger.warning(u"Could not commit in backend [%s]." % utils.uni(e))
def commit(self):
    """Commit pending changes to the index via a fresh backend handle.

    :return: True on success, None on failure (the failure is logged).
    """
    try:
        with adapter.solr_backend(self) as backend:
            backend.commit()
            return True
    except Exception as e:
        _logger.warning(u"Could not commit in backend [%s]." % utils.uni(e))
def delete(self, docs_array=None, queries=None):
    """Delete documents from the backend, by explicit list or by query.

    :param docs_array: documents to delete; used when *queries* is None.
    :param queries: delete-by-query strings; takes precedence when given.
    :return: True on success, None on failure (the failure is logged).
    """
    try:
        with adapter.solr_backend(self) as backend:
            if queries is not None:
                backend.delete(queries=queries)
            else:
                backend.delete(docs=docs_array)
            return True
    except Exception as e:
        _logger.warning(u"Could not delete from backend [%s]." % utils.uni(e))
def delete(self, docs_array=None, queries=None):
    """Delete documents from the backend, by explicit list or by query.

    :param docs_array: documents to delete; used when *queries* is None.
    :param queries: delete-by-query strings; takes precedence when given.
    :return: True on success, None on failure (the failure is logged).
    """
    try:
        with adapter.solr_backend(self) as backend:
            if queries is not None:
                backend.delete(queries=queries)
            else:
                backend.delete(docs=docs_array)
            return True
    except Exception as e:
        _logger.warning(u"Could not delete from backend [%s]." % utils.uni(e))
def commit(self):
    """Commit pending changes, first verifying the index is writeable
    in the current time window.

    :return: True on success, None on failure (the failure is logged).
    :raises: whatever ``adapter.check_writeable`` raises when the index
        is inside its read-only window.
    """
    # be sure we can write
    from_time, to_time = self.not_writable
    adapter.check_writeable(from_time, to_time)
    _logger.info("Trying to commit to index.")
    try:
        with adapter.solr_backend(self) as backend:
            _logger.info("Committing to index.")
            backend.commit()
            _logger.info("Committed.")
            return True
    except Exception as e:
        _logger.warning(u"Could not commit in backend [%s]." % utils.uni(e))
def add(self, document_s, boosts=None):
    """Add one document or an array of documents to the index.

    Each document gets an ``id`` derived from its ``latex`` field before
    it is handed to the backend; commits afterwards when ``auto_commit``
    is set.

    :param document_s: a single document dict or an array of them.
    :param boosts: optional boost values passed to the backend.
    :return: True on success, False on failure (the failure is logged).
    """
    docs = utils.to_array(document_s)
    try:
        for document in docs:
            document.update({"id": self.get_id(document["latex"])})
        self._backend.add(docs, boosts)
        if self.auto_commit:
            self.commit()
        return True
    except Exception as e:
        _logger.exception(u"Could not add document to index\n[%s].",
                          utils.uni(e))
        return False
def commit(self):
    """Commit pending changes, first verifying the index is writeable
    in the current time window.

    :return: True on success, None on failure (the failure is logged).
    :raises: whatever ``adapter.check_writeable`` raises when the index
        is inside its read-only window.
    """
    # be sure we can write
    from_time, to_time = self.not_writable
    adapter.check_writeable(from_time, to_time)
    _logger.info("Trying to commit to index.")
    try:
        with adapter.solr_backend(self) as backend:
            _logger.info("Committing to index.")
            backend.commit()
            _logger.info("Committed.")
            return True
    except Exception as e:
        _logger.warning(u"Could not commit in backend [%s]." % utils.uni(e))
def load(self):
    """Load the MovieLens ratings, split them into train/test, and derive
    the latent factors used by the bandit experiment.

    Side effects: sets ``self.A`` (train interaction matrix), ``self.U`` /
    ``self.S`` / ``self.VT`` / ``self.V`` (SVD factors), ``self.ctrh``
    (held-out user histories), ``self.L``, ``self.arms``, ``self.users``
    and squares ``self.d``.
    """
    # train-split coordinates for the sparse interaction matrix
    cox = []
    coy = []
    # held-out (test) history: user -> list of movies
    self.ctrh = {}
    # movie -> rating count (popularity)
    movies = {}
    for user, movie, rate, timestamp in movielens_data():
        # ids in the raw data are 1-based; rate/timestamp are unused here
        user = int(user) - 1
        movie = int(movie) - 1
        if movie in movies:
            movies[movie] += 1
        else:
            movies[movie] = 1
        # Bernoulli split: train_portion goes to the matrix, rest to ctrh
        if np.random.uniform(0, 1) < self.train_portion:
            cox.append(user)
            coy.append(movie)
        else:
            if user in self.ctrh:
                self.ctrh[user].append(movie)
            else:
                self.ctrh[user] = [movie]
    # binary train matrix sized to cover every user/movie seen in either split
    self.A = sparse.coo_matrix((np.ones(len(cox)), (cox, coy)),
                               shape=(max(list(self.ctrh) + cox) + 1,
                                      max(list(movies) + coy) + 1),
                               dtype=np.float32)
    # truncated SVD to d latent dimensions
    self.U, self.S, self.VT = svds(self.A, self.d)
    self.A = self.A.astype(np.int32)
    # NOTE(review): `uni` here presumably normalises each factor row to unit
    # length (not the text helper of the same name elsewhere) - confirm.
    for i in range(self.U.shape[0]):
        self.U[i] = uni(self.U[i])
    self.V = self.VT.T
    for i in range(self.V.shape[0]):
        self.V[i] = uni(self.V[i])
    # L = arm-pool size: all movies unless capped by n_movies
    if self.n_movies is None:
        self.L = len(movies)
    else:
        self.L = self.n_movies
    # arms = the L most-rated movies
    self.arms = set([
        x[1]
        for x in sorted([(movies[movie], movie) for movie in movies])[-self.L:]
    ])
    # keep users whose held-out overlap with the arms lies inside the
    # baseline band, ordered by that overlap
    self.users = [
        x[1]
        for x in sorted([(overlap(self.ctrh[user], self.arms), user)
                         for user in self.ctrh
                         if self.L * self.baseline[0] < overlap(
                             self.ctrh[user], self.arms) < self.L * self.baseline[1]])
    ]
    logging.info('total {0} users involved'.format(len(self.users)))
    # downstream code works in the d*d interaction space
    self.d = self.d ** 2
def _create_doc(self, latex, mathml, convert_js, docs=None, url=None,
                dataset=None, create_ego=False):
    """Build an index document dict from a latex/mathml pair.

    Copies the converter status keys from *convert_js* when present and
    optionally attaches an egomath representation.
    """
    document = {
        "mathml": utils.uni(mathml),
        "latex": latex,
        "latex_len": len(latex),
        "documents": docs,
        "url": url,
        "dataset": utils.to_array(dataset),
    }
    # carry over converter status/result fields that are present
    document.update((key, convert_js[key])
                    for key in ("result", "status", "status_code", "log")
                    if key in convert_js)
    if create_ego:
        document["ego_math"] = ego_convert(latex, mathml[-1])
    return document
def add(self, document_s, boosts=None):
    """Add one document or an array of documents to the index.

    Each document gets an ``id`` derived from its ``latex`` field before
    it is handed to the backend; commits afterwards when ``auto_commit``
    is set.

    :param document_s: a single document dict or an array of them.
    :param boosts: optional boost values passed to the backend.
    :return: True on success, False on failure (the failure is logged).
    """
    docs = utils.to_array(document_s)
    try:
        for document in docs:
            document.update({"id": self.get_id(document["latex"])})
        self._backend.add(docs, boosts)
        if self.auto_commit:
            self.commit()
        return True
    except Exception as e:
        _logger.exception(u"Could not add document to index\n[%s].",
                          utils.uni(e))
        return False
def add(self, document, **kwargs):
    """Add a single document to the index.

    :param document: document object; its raw form is sent to the backend.
    :param kwargs: extra arguments passed through to ``backend.add``.
    :return: True on success, False on failure (the failure is logged and
        the traceback echoed to the real stdout).
    """
    try:
        with adapter.solr_backend(self) as backend:
            backend.add([document], **kwargs)
            if self.auto_commit:
                self.commit()
            return True
    except Exception as e:
        import traceback
        traceback.print_exc(file=sys.__stdout__)
        self.errors += 1
        # NOTE(review): the counter is reset right after being incremented,
        # so this check is always true - looks like a leftover error
        # threshold; behaviour kept as-is.
        if self.errors > 0:
            self.errors = 0
            _logger.exception(u"Could not add document to index [%s]\n[%s].",
                              document.id_str, utils.uni(e))
        return False
def add(self, document, **kwargs):
    """Add a single document (via ``document.dict()``) to the index.

    :param document: document object exposing ``dict()`` and ``_values``.
    :param kwargs: extra arguments passed through to ``backend.add``.
    :return: True on success, False on failure (the failure is logged and
        the traceback echoed to the real stdout).
    """
    try:
        with adapter.solr_backend(self) as backend:
            backend.add([document.dict()], **kwargs)
            if self.auto_commit:
                self.commit()
            return True
    except Exception as e:
        import traceback
        traceback.print_exc(file=sys.__stdout__)
        self.errors += 1
        # NOTE(review): the counter is reset right after being incremented,
        # so this check is always true - looks like a leftover error
        # threshold; behaviour kept as-is.
        if self.errors > 0:
            self.errors = 0
            _logger.exception(
                u"Could not add document to index [%s]\n[%s].",
                document._values.get(self.id_str, "no id in document values"),
                utils.uni(e))
        return False
def _create_doc(self, latex, mathml, convert_js, docs=None, url=None,
                dataset=None, create_ego=False):
    """Build an index document dict from a latex/mathml pair.

    Copies the converter status keys from *convert_js* when present and
    optionally attaches an egomath representation.
    """
    document = {
        "mathml": utils.uni(mathml),
        "latex": latex,
        "latex_len": len(latex),
        "documents": docs,
        "url": url,
        "dataset": utils.to_array(dataset),
    }
    # carry over converter status/result fields that are present
    document.update((key, convert_js[key])
                    for key in ("result", "status", "status_code", "log")
                    if key in convert_js)
    if create_ego:
        document["ego_math"] = ego_convert(latex, mathml[-1])
    return document
def process( env_dict, ftor_to_call, final_ftor ):
    """
    Index function wrapper around analyse_one_page and commit_to_index.
    It either calls them in parallel or sequentially.

    :param env_dict: environment/config mapping driving input globbing,
        exclusion rules and parallelism settings.
    :param ftor_to_call: worker callable invoked per file with a
        ``(env_dict, position, file_path)`` tuple.
    :param final_ftor: optional callable run once at the end.
    """
    # global exit signaller shared with pool workers via init_pool;
    # a single int flag, written whole, hence lock=False
    exit_ = multiprocessing.Array(ctypes.c_int, 1, lock=False)
    exit_[0] = 0
    logger.info(u"Reading input from [%s]", env_dict["input"])
    # files already processed in a previous run ("continue" mode)
    done_set = set()
    if env_dict["indexer"].get( "continue", False ):
        fname = env_dict["indexer"]["continue"]
        if os.path.exists(fname):
            done_set = set( [ utils.uni(x).strip().lower() for x in codecs.open(
                fname, encoding="utf-8", mode="r", errors="ignore" ).readlines() ] )

    def iparameters():
        """ Get iterable params. """
        i = 0
        for file_ in glob.iglob(env_dict["input"]):
            file_ = os.path.abspath(file_)
            # skip files finished in an earlier run
            if file_.lower() in done_set:
                continue
            no_go = False
            file_basename = os.path.basename(file_)
            for not_acceptable_start in env_dict["exclude"]["file_starts"]:
                if file_basename.startswith(not_acceptable_start):
                    logger.warning(u"Skipping this file (invalid title start) [%s]", file_basename)
                    no_go = True
            if no_go:
                continue
            i += 1
            yield ( env_dict, i, file_ )

    # create pool of slaves if specified
    #
    if env_dict["parallel"]["enabled"] and not env_dict.debug:
        # parallel version
        # - threaded
        # - processed
        #
        max_parallel = env_dict["parallel"]["max"]
        kwargs = {}
        if env_dict["parallel"]["threads"]:
            logger.info("Using threading pool with [%d] max concurrent threads.", max_parallel)
            Pool = ThreadPool
        else:
            Pool = multiprocessing.Pool
            kwargs = {}
            if "maxtasksperchild" in env_dict["parallel"]:
                kwargs["maxtasksperchild"] = env_dict["parallel"]["maxtasksperchild"]
            logger.info("Using process pool with [%d] max concurrent processes, chunk size [%s], [%s].",
                        max_parallel, env_dict["parallel"]["chunksize"], repr(kwargs))
        slaves = Pool(processes=max_parallel, initializer=init_pool,
                      initargs=(exit_,), **kwargs)
        # loop through all files and
        it = slaves.imap(ftor_to_call, iparameters(),
                         chunksize=env_dict["parallel"]["chunksize"])
        slaves.close()
        for _ in it:
            # a worker raised the global exit flag - stop consuming results
            if exit_[0]:
                break
            utils.print_after.step()
        slaves.join()
    # not parallel version
    else:
        logger.info("Executing non parallel version [%s]",
                    "debug=True" if env_dict.debug else "parallel.enabled=False")
        init_pool(exit_)
        for (env, pos, file_) in iparameters():
            ftor_to_call((env_dict, pos, file_))
            utils.print_after.step()
            if pos >= env.count:
                break
    # final break
    #
    if not final_ftor is None:
        final_ftor(env_dict)
# vs <annotation id="p1.1.m1.1b" encoding="application/x-tex" xref="p1.1.m1.1.cmml">w</annotation> if _math_parser.re_tex_annotation.search(mathml_text) is None: # this can mean that we either do not have mathml for end_tag in ( "</math>", "</m:math>" ): if mathml_text.endswith(end_tag): # we have mathml but no annotation if end_tag == "</math>": annotation = u"<annotation encoding=\"application/x-tex\">%s</annotation>" % latex_math else: annotation = u"<m:annotation encoding=\"application/x-tex\">%s</m:annotation>" % latex_math mathml_text = mathml_text[:-len(end_tag)] + annotation + mathml_text[-len(end_tag):] break mathml_text = utils.uni(mathml_text) # post processing # - converter problem # for to_remove in ( u"\end{document}", u"nowiki" ): if to_remove in mathml_text: logger.warn(u"Invalid math [%s]", mathml_text) # set to invalid mathml_pickled.delete_invalid( latex_math_db_id ) if try_one_more_if_invalid: return convert_wikimath_to_realmath( env_dict, wiki_math_match, mathml_pickled, url, doc,
def process(env_dict, ftor_to_call, final_ftor):
    """
    Index function wrapper around analyse_one_page and commit_to_index.
    It either calls them in parallel or sequentially.

    :param env_dict: environment/config mapping driving input globbing,
        exclusion rules and parallelism settings.
    :param ftor_to_call: worker callable invoked per file with a
        ``(env_dict, position, file_path)`` tuple.
    :param final_ftor: optional callable run once at the end.
    """
    # global exit signaller shared with pool workers via init_pool;
    # a single int flag, written whole, hence lock=False
    exit_ = multiprocessing.Array(ctypes.c_int, 1, lock=False)
    exit_[0] = 0
    logger.info(u"Reading input from [%s]", env_dict["input"])
    # files already processed in a previous run ("continue" mode)
    done_set = set()
    if env_dict["indexer"].get("continue", False):
        fname = env_dict["indexer"]["continue"]
        if os.path.exists(fname):
            done_set = set([
                utils.uni(x).strip().lower() for x in codecs.open(
                    fname, encoding="utf-8", mode="r",
                    errors="ignore").readlines()
            ])

    def iparameters():
        """ Get iterable params. """
        i = 0
        for file_ in glob.iglob(env_dict["input"]):
            file_ = os.path.abspath(file_)
            # skip files finished in an earlier run
            if file_.lower() in done_set:
                continue
            no_go = False
            file_basename = os.path.basename(file_)
            for not_acceptable_start in env_dict["exclude"]["file_starts"]:
                if file_basename.startswith(not_acceptable_start):
                    logger.warning(
                        u"Skipping this file (invalid title start) [%s]",
                        file_basename)
                    no_go = True
            if no_go:
                continue
            i += 1
            yield (env_dict, i, file_)

    # create pool of slaves if specified
    #
    if env_dict["parallel"]["enabled"] and not env_dict.debug:
        # parallel version
        # - threaded
        # - processed
        #
        max_parallel = env_dict["parallel"]["max"]
        kwargs = {}
        if env_dict["parallel"]["threads"]:
            logger.info(
                "Using threading pool with [%d] max concurrent threads.",
                max_parallel)
            Pool = ThreadPool
        else:
            Pool = multiprocessing.Pool
            kwargs = {}
            if "maxtasksperchild" in env_dict["parallel"]:
                kwargs["maxtasksperchild"] = env_dict["parallel"][
                    "maxtasksperchild"]
            logger.info(
                "Using process pool with [%d] max concurrent processes, chunk size [%s], [%s].",
                max_parallel, env_dict["parallel"]["chunksize"], repr(kwargs))
        slaves = Pool(processes=max_parallel, initializer=init_pool,
                      initargs=(exit_, ), **kwargs)
        # loop through all files and
        it = slaves.imap(ftor_to_call, iparameters(),
                         chunksize=env_dict["parallel"]["chunksize"])
        slaves.close()
        for _ in it:
            # a worker raised the global exit flag - stop consuming results
            if exit_[0]:
                break
            utils.print_after.step()
        slaves.join()
    # not parallel version
    else:
        logger.info(
            "Executing non parallel version [%s]",
            "debug=True" if env_dict.debug else "parallel.enabled=False")
        init_pool(exit_)
        for (env, pos, file_) in iparameters():
            ftor_to_call((env_dict, pos, file_))
            utils.print_after.step()
            if pos >= env.count:
                break
    # final break
    #
    if not final_ftor is None:
        final_ftor(env_dict)
class mathml(object):
    """ MathML object: wraps a MathML string and knows how to obtain one
    from LaTeX via the remote LaTeXML conversion service. """
    # endpoint of the LaTeXML web service
    url_form_latex = settings["converters"]["latexml"]["url"]
    encoding = settings["converters"]["encoding"]
    # attribute template used to stamp/retrieve an egomath id in the markup
    id_str = u' egomath="%s" '
    pattern_id_add = re.compile(
        u'(<math)\s(.*?xmlns="http://www.w3.org/1998/Math/MathML")')
    pattern_id_get = re.compile(id_str % u'(.*?)')

    def __init__(self, mathml_str):
        # raw MathML markup
        self.str = mathml_str

    @staticmethod
    def from_latex(latex_math_orig):
        """ Returns either mathml object or None.

        Sends the (normalised) LaTeX to the LaTeXML service and returns a
        ``(mathml_string_or_None, json_status)`` tuple; the json status is
        a dict (possibly synthesised on early failure) or None for empty
        input.
        """
        # try fetching the answer
        js = None
        latex_math = latex_math_orig
        try:
            # normalise via the project's latex wrapper first
            latex_math = latex(latex_math, full=False).str
            # is empty?
            if len(latex_math.strip()) == 0:
                _logger.warning(u"Warning: empty math - [%s]",
                                repr(latex_math))
                return None, None
            latex_math = u"$ %s $" % latex_math
            # old service:
            # req = urllib2.Request( URL, urllib.urlencode({ 'formula' : utils.ascii(latex,DEF_ENCODING) }) )
            # new service
            req = urllib2.Request(
                mathml.url_form_latex,
                urllib.urlencode({
                    'tex': latex_math.encode("utf-8"),
                    'profile': 'math',
                }))
            response = urllib2.urlopen(
                req, timeout=settings["converters"]["latexml"]["timeout"])
            # try parsing the answer
            import json
            js = json.load(response)
            result = js[settings["converters"]["latexml"]["result_field"]]
            message = js[settings["converters"]["latexml"]["status_field"]]
            if result:
                result = result.encode(mathml.encoding)
            if message:
                message = message.encode(mathml.encoding)
        except Exception, e:
            if js is None:
                # fake js so callers always get a status dict on failure
                js = {
                    "result": None,
                    "status": "Problem at early stage.",
                    "status_code": -1,
                    "log": repr(e),
                }
            _logger.error(u"Error: Connection problem - %s with [%s]",
                          repr(e), latex_math)
            return None, js
        # decide whether the service reported a known-good status
        everything_ok = False
        for msg in settings["converters"]["latexml"]["status_ok"]:
            if msg in message:
                everything_ok = not message is None and 0 < len(message)
                break
        not_empty_result = result and result != ''
        # everything ok - return answer
        if everything_ok and not_empty_result:
            return mathml(result).str, js
        # something fishy - try to correct it
        ascii_latex = utils.ascii(latex_math, mathml.encoding)
        # NOTE(review): this branch is unreachable - the identical condition
        # just returned above; presumably it was meant to catch an EMPTY
        # result for a short input. Behaviour kept as-is.
        if everything_ok and not_empty_result and len(ascii_latex) < 6:
            # in case the service returns empty string and it seems to be just a variable
            _logger.warning(u"Warning: returning original - %s",
                            repr(ascii_latex))
            return mathml(ascii_latex).str, js
        # seems not ok but the latest converter returns valid results
        if not everything_ok and not_empty_result:
            _logger.warning(
                u"Warning: returning conversion but with errors - %s",
                repr(ascii_latex))
            return mathml(result).str, js
        _logger.error(
            u"\n!ERROR - converting [%s] -> result [%s] with message [%s]\n%s",
            ascii_latex, utils.uni(result), utils.uni(message), 40 * "=")
        return None, js
def execute(self, *args, **kwargs):
    """ Execute the application.

    Translates the posted/`data` text with the external Cesilko script:
    the UTF-8 input is transcoded to ISO-8859-2 (non-representable chars
    become XML numeric entities), written to a temp file, run through the
    translator, and the output is read back and cleaned up.

    :return: a ``{"input": ..., "result": ...}`` dict, or a
        ``(mime, text)`` tuple when the raw-body API was used, or the
        ``_failed`` response on any error.
    """
    ret_mime = None
    #self.log( "args [%s], kwargs [%s]", repr(args), repr(kwargs))
    if not cesilko.api_translate in args:
        return self._failed(
            detail="Invalid API - no method with such a name")
    # posted raw body
    if cesilko.api_key_body in kwargs:
        ret_mime = "text/plain"
        try:
            kwargs[cesilko.api_key_data] = utils.uni(self.posted_body())
        except:
            return self._failed(detail="invalid posted body")
    # what should we translate?
    elif 0 == len(kwargs.get(cesilko.api_key_data, "")):
        # hardcoded fallback: no data parameter, try the raw body anyway
        try:
            kwargs[cesilko.api_key_data] = utils.uni(self.posted_body())
            ret_mime = "text/plain"
            self.log("using fallback mechanism")
        except:
            return self._failed(detail="missing data parameter")
    try:
        (input_f, input_fname_rel) = self._get_unique_file(enc='iso-8859-2')
        # the translator writes its result next to the input file
        expected_output_file_name = input_f.name + ".SK.out"
        # 1. Input text is in UTF-8
        text = kwargs[cesilko.api_key_data]
        self.log("Received Input Text: %s ", text)
        self.log("Type of the Input: %s", str(type(text)))
        # 2. Convert the UTF-8 encoded text into ISO-8859-2 encoding.
        # - non ISO-8859-2 characters will be replaced with XML numeric codes
        text_iso_dec = None
        try:
            text_iso = text.encode('iso-8859-2', 'xmlcharrefreplace')
            text_iso_dec = text_iso.decode('iso-8859-2')  # ISO-8859-2 text
            self.log(
                "Replacing the Non ISO-8859-2 Characters Into XML Numeric Entities: %s",
                text_iso_dec)
        except UnicodeEncodeError:
            # NOTE(review): the _failed return value is discarded here, so
            # execution continues with text_iso_dec still None - probably
            # a missing `return`; behaviour kept as-is.
            self._failed(detail="please supply utf-8 input.")
        with input_f as fout:
            fout.write(text_iso_dec)
            self.log("Written Input Text to File: %s", fout.name)
        cmd = "%s %s %s" % (cesilko.tr_script, input_f.name,
                            expected_output_file_name)
        self.log("Cesilko ran: [%s]", cmd)
        retcode, stdout, stderr = utils.run(cmd)
        output_exists = os.path.exists(expected_output_file_name)
        if 0 == retcode and os.path.exists(expected_output_file_name):
            with open(expected_output_file_name, 'rb') as fin:
                translated_text = fin.read()
            # convert the ISO-8859-2 output text into UTF-8 text
            #translated_text_dec_utf = translated_text.decode('iso-8859-2').encode('utf-8').decode('utf-8')
            translated_text_dec_utf = translated_text.decode(
                'iso-8859-2')
            # remove extra \n\n at the end of the translated text
            # Cesilko adds this, so it can be removed safely here
            translated_text_dec_utf = re.sub(r"\n\n$", "",
                                             translated_text_dec_utf)
            # remove extra spaces at the beginning and end
            translated_text_dec_utf = re.sub(r"(^\s+|\s+$)", "",
                                             translated_text_dec_utf)
            self.log("The UTF-8 Encoded Output: %s", translated_text_dec_utf)
            ret = {"input": text, "result": translated_text_dec_utf}
            # special for weblicht
            if ret_mime is not None:
                return ret_mime, ret["result"]
            return ret
        else:
            return self._failed(
                detail=
                "retcode:%d, exists(%s)=%s, stdout=%s, stderr=%s, cmd=%s"
                % (retcode, expected_output_file_name, output_exists, stdout,
                   stderr, cmd))
    except Exception, e:
        return self._failed(detail=utils.uni(e))
""" Convert both formats. """ from indexer.egomath.interface import egomath_inst mathml_repre = u"mathml:problem" tex_repre = u"tex:problem" mathmldone = False # noinspection PyUnusedLocal try: if not mathml_str is None and \ len(mathml_str) > 0 and \ mathml_str != mathdb.failed_mathml: mathml_repre = egomath_inst.math_from_mathml(mathml_str) mathmldone = True if not latex_str is None and \ len(latex_str) > 0: tex_repre = egomath_inst.math_from_tex(latex_str) latex_str_cleanup = egomath_inst.math_from_tex_cleanup(latex_str) if latex_str_cleanup != latex_str: _logger.info(u"Changed\n[%s] to\n[%s]", latex_str, latex_str_cleanup) pass else: sys.exit("Fatal error") except Exception, e: _logger.exception("%s exception [%s]", "MathML" if mathmldone is False else "TeX", latex_str) return u"mathml:\n" + utils.uni(mathml_repre), u"tex:\n" + utils.uni(tex_repre)
# <m:math display="inline"><m:semantics><m:mi>c</m:mi><m:annotation-xml encoding="MathML-Content"><m:ci>c</m:ci></m:annotation-xml></m:semantics></m:math> # vs <annotation id="p1.1.m1.1b" encoding="application/x-tex" xref="p1.1.m1.1.cmml">w</annotation> if _math_parser.re_tex_annotation.search(mathml_text) is None: # this can mean that we either do not have mathml for end_tag in ("</math>", "</m:math>"): if mathml_text.endswith(end_tag): # we have mathml but no annotation if end_tag == "</math>": annotation = u"<annotation encoding=\"application/x-tex\">%s</annotation>" % latex_math else: annotation = u"<m:annotation encoding=\"application/x-tex\">%s</m:annotation>" % latex_math mathml_text = mathml_text[:-len( end_tag)] + annotation + mathml_text[-len(end_tag):] break mathml_text = utils.uni(mathml_text) # post processing # - converter problem # for to_remove in (u"\end{document}", u"nowiki"): if to_remove in mathml_text: logger.warn(u"Invalid math [%s]", mathml_text) # set to invalid mathml_pickled.delete_invalid(latex_math_db_id) if try_one_more_if_invalid: return convert_wikimath_to_realmath( env_dict, wiki_math_match, mathml_pickled, url,
""" from indexer.egomath.interface import egomath_inst mathml_repre = u"mathml:problem" tex_repre = u"tex:problem" mathmldone = False # noinspection PyUnusedLocal try: if not mathml_str is None and \ len(mathml_str) > 0 and \ mathml_str != mathdb.failed_mathml: mathml_repre = egomath_inst.math_from_mathml(mathml_str) mathmldone = True if not latex_str is None and \ len(latex_str) > 0: tex_repre = egomath_inst.math_from_tex(latex_str) latex_str_cleanup = egomath_inst.math_from_tex_cleanup(latex_str) if latex_str_cleanup != latex_str: _logger.info(u"Changed\n[%s] to\n[%s]", latex_str, latex_str_cleanup) pass else: sys.exit("Fatal error") except Exception, e: _logger.exception("%s exception [%s]", "MathML" if mathmldone is False else "TeX", latex_str) return u"mathml:\n" + utils.uni(mathml_repre), u"tex:\n" + utils.uni( tex_repre)
def page_to_template( self, page, template ):
    """Extract title/id/text and derived metadata from a wiki *page* dump
    and substitute them into *template*.

    :param page: raw page markup (one ``<title>``/``<id>``/``<text>`` set).
    :param template: template string with placeholders for the keywords.
    :return: the filled template, the unchanged *template* when the page
        cannot be parsed, or None when the page is excluded.
    """
    # get basic info
    #
    pattern = re.compile(
        r'<title>(?P<title>.*?)</title>.*<id>(?P<id>.*?)</id>.*<revision.*<text[^>]*>(?P<text>.*?)</text>',
        re.DOTALL)
    keywords = {
        u"title": None,
        u"id": None,
        u"text": None,
    }
    m = pattern.search(page)
    if not m:
        pager.logger.error(u"Invalid page: could not find elements... %s",
                           page)
        return template
    else:
        assert page.count( u"<title>" ) == 1, u"Matched more pages?"
        for k, v in keywords.iteritems():
            keywords[k] = m.group(k)
    # drop explicitly excluded ids and titles
    if utils.uni(keywords[u"id"]) in settings["exclude"]["ids"]:
        logger_suspicious.warning(
            u"Skipping this file (id in excludes)... %s [%s]",
            keywords[u"title"], keywords[u"id"])
        return None
    for not_acceptable_start in settings["exclude"]["title_starts"]:
        if keywords[u"title"].startswith(not_acceptable_start):
            logger_suspicious.warning(
                u"Skipping this file (invalid title start)... %s [%s]",
                keywords[u"title"], keywords[u"id"])
            return None
    # clean up text
    # - get math positions (do not clean text inside them)
    # - split text and clean up tokens between math
    #
    from _parser import parser as wikiparser
    text = wikiparser.remove_wiki_tags_outside_math(keywords[u"text"])
    keywords[u"text"] = text
    # get additional info
    #
    category = re.compile(r'\[\[Category:(.+)\]\]').findall(page)
    keywords["category"] = self.delimiter.join(
        map(lambda x: x.replace(self.delimiter, " "), category))
    keywords["url"] = u"http://en.wikipedia.org/wiki/%s" % keywords[
        u"title"].replace(u" ", u"_")
    # collect interwiki language links from the bottom of the page,
    # scanning upwards until a non-link, non-empty line is hit
    #
    lang_avail = []
    lines = page.strip().split("\n")
    lang_pattern = re.compile(r'\[\[([a-z].+?):.*\]\]')
    for i in range(len(lines) - 1, 0, -1):
        m = lang_pattern.match(lines[i])
        if m:
            lang_avail.append(m.group(1))
        else:
            if lines[i].strip() == "":
                break
            elif not lines[i].strip().startswith(u"<"):
                break
    keywords["lang_avail"] = self.delimiter.join(
        [x.replace(self.delimiter, " ") for x in lang_avail])
    # citations and outgoing wiki links
    #
    keywords["citations_count"] = page.count(u"<ref>")
    refs = u""
    # problems:
    # [[File:Albedo-e hg.svg|thumb|Percentage of diffusely reflected sun
    # light in relation to various surface conditions of the Earth]]
    #
    def change_to_link( text ):
        """ Very simple text to link changer. """
        text = text.split(u"#")[0]
        if len(text) > 0:
            text = text[0].upper() + text[1:].replace(u" ", "_")
            return text
        return u""
    for cita in re.compile(r'\[\[([^:\]|]+?)\]\]').findall(page):
        refs += '<meta name="refs" content="%s" />\n' % change_to_link(cita)
    for cita in re.compile(r'\[\[([^:|]+?)\]\]').findall(page):
        refs += '<meta name="refs" content="%s" />\n' % change_to_link(cita)
    keywords["refs"] = refs
    page = None  # memory
    # substitute it
    #
    m = None  # memory
    try:
        return utils.subst_str_nonrecursive(template, keywords)
    except MemoryError, e:
        self.logger.exception(u"Memory exception - %s", repr(e))
def parse(self, text, page_str): global _egomath_inst # if math.mathml_db is None: # math.mathml_db = _math.mathdb( _settings_local ) id_str = page_str[180:260].replace("\n", " ") math_field = [] positions = parser.positions_in_text( page_str, (_settings_local["pager"]["wiki_mathml_tags"], False), (_settings_local["pager"]["wiki_mathml_tags_v2"], False), (_settings_local["pager"]["wiki_math_tags"], True), ) positions.sort(key=lambda x: x[0]) tex_start = _settings_local["pager"]["tex_start"] math_sep = _settings_local["pager"]["math_sep"] # find all maths in the parsed page and match them with their # counterpart in the original text (get either tex or mathml) # for i, match in enumerate(math.math_final_matcher.finditer(text)): start_pos = max( match.start(0) - _settings_global["indexer"]["snippet_chars"] / 2, 0) end_pos = min( match.end(0) + _settings_global["indexer"]["snippet_chars"], len(text) - 1) # invalid math if len(positions) <= i: pass if positions[i][0] == -1: continue math_text = positions[i][1] math_representation = u"" #logger.info( u"Working on [%s] len [%s][%s]", math_text, i, len(math_text) ) if len(math_text) > 50000: logger_suspicious.warning(u"Math too big [%s] in [%s]", len(math_text), id_str) pass if len(math_text) > 0: try: # mathml version # if math_text.startswith( u"<m:math") or math_text.startswith(u"<math"): math_tex_tmp = None _bug_oneword = False m = math.re_tex_annotation.search(math_text) if not m: logger_suspicious.warning( "Did not find annotation in tex! 
[%s]", id_str) else: # do a bit of html->normal cleanup math_tex_tmp = unescape_recursive(m.group(1)) # must be after unescape math_tex_tmp = self.texify(math_tex_tmp) _bug_oneword = re.compile(u"^[a-z]+$", re.U).match( math_tex_tmp.lower()) math_text = math_text.replace( m.group(1), math_tex_tmp) math_representation = _egomath_inst.math_from_mathml( math_text) # FIX one word if _bug_oneword and math_representation.count("*") > 0: math_representation = u"Tex: %s\nego0 : %s\nego8 : id" % ( math_tex_tmp, math_tex_tmp) logger_suspicious.debug(u"Fixing oneword [%s]", math_tex_tmp) # Log problem if math_representation is None: logger_suspicious.warning( u"NULL returned from egomath [\n%s\n] in [%s]", math_text.replace("\n", " ").replace("\"", "\\\""), id_str) if math_tex_tmp is not None and ( math_representation is None or 0 == len(math_representation)): # try latex... math_representation = _egomath_inst.math_from_tex( math_tex_tmp) if math_representation is None or 0 == len( math_representation): logger_suspicious.warning( u"Empty math returned from egomath [\n%s\n] in [%s]", math_text.replace("\n", " ").replace("\"", "\\\""), id_str) # if not math.mathml_db is None: # i = mathml.get_id( math_text ) # if not i is None: # # add it to db # math.mathml_db.add_ego_math( math_representation, i ) # tex version # else: # do a bit of html->normal cleanup math_text = unescape_recursive(math_text) math_text = self.texify(math_text, leave_nl=True) # do the conversion math_representation = _egomath_inst.math_from_tex( math_text) # simulate text repre math_text_tex = math_text.replace("\n", " ").strip() math_representation = "Tex: %s\n" % math_text_tex + math_representation except Exception, e: logger.exception(u"Cannot convert [%s] because of [%s]", math_text, utils.uni(e)) else: logger_suspicious.info(u"Empty math in [%s]", id_str) if math_representation is None or len(math_representation) == 0: continue # convert Tex: line to ""showaeble"" tex # convert \d: line to math + end 
token so we can simulate full match # already_in = set() result_math = u"" for line in math_representation.split("\n"): line = line.strip() if len(line) == 0: continue # deduplicity - already there if line in already_in: continue already_in.add(line) # insert proper tags if not line.startswith(tex_start): line = u"%s %s" % (line, math_sep[1]) else: line = u"%s %s %s %s" % (tex_start, math_sep[0], line[len(tex_start):], math_sep[1]) result_math += u"%s\n" % line # ensure we get all formulae unparsed # if len(result_math) > 0: #math_field += self.snippet_populate(text, result_math, match, start_pos, end_pos) math_field += [ self.snippet_populate(text, result_math, match, start_pos, end_pos) ]
def parse(self, text, page_str):
    """Find math formulae in the parsed page, convert them via egomath
    and collect search snippets.

    :param text: parsed (plain-text) page used for snippet extraction.
    :param page_str: original page markup scanned for math tags.

    NOTE(review): the snippet list ``math_field`` is built but neither
    returned nor stored on ``self`` in this block -- confirm how callers
    consume it.
    """
    global _egomath_inst
    # if math.mathml_db is None:
    #     math.mathml_db = _math.mathdb( _settings_local )

    # short excerpt of the page used only to identify it in log messages
    id_str = page_str[180:260].replace("\n", " ")
    math_field = []
    positions = parser.positions_in_text(
        page_str,
        (_settings_local["pager"]["wiki_mathml_tags"], False),
        (_settings_local["pager"]["wiki_mathml_tags_v2"], False),
        (_settings_local["pager"]["wiki_math_tags"], True),
    )
    positions.sort(key=lambda x: x[0])
    tex_start = _settings_local["pager"]["tex_start"]
    math_sep = _settings_local["pager"]["math_sep"]

    # find all maths in the parsed page and match them with their
    # counterpart in the original text (get either tex or mathml)
    #
    for i, match in enumerate(math.math_final_matcher.finditer(text)):
        # use // so slice bounds stay integral under Python 3 as well
        # (identical to / on ints under Python 2)
        start_pos = max(
            match.start(0) - _settings_global["indexer"]["snippet_chars"] // 2, 0)
        end_pos = min(
            match.end(0) + _settings_global["indexer"]["snippet_chars"],
            len(text) - 1)

        # BUGFIX: the original did `pass` here and then indexed
        # positions[i] anyway, raising IndexError whenever there were
        # more matches than recorded positions -- stop the loop instead.
        if len(positions) <= i:
            break
        # invalid math
        if positions[i][0] == -1:
            continue

        math_text = positions[i][1]
        math_representation = u""
        #logger.info( u"Working on [%s] len [%s][%s]", math_text, i, len(math_text) )
        if len(math_text) > 50000:
            # NOTE(review): oversized math is only logged and is still
            # processed below -- confirm whether it should be skipped
            logger_suspicious.warning(
                u"Math too big [%s] in [%s]", len(math_text), id_str)

        if len(math_text) > 0:
            try:
                # mathml version
                #
                if math_text.startswith(u"<m:math") or math_text.startswith(u"<math"):
                    math_tex_tmp = None
                    _bug_oneword = False
                    m = math.re_tex_annotation.search(math_text)
                    if not m:
                        logger_suspicious.warning(
                            "Did not find annotation in tex! [%s]", id_str)
                    else:
                        # do a bit of html->normal cleanup
                        math_tex_tmp = unescape_recursive(m.group(1))
                        # must be after unescape
                        math_tex_tmp = self.texify(math_tex_tmp)
                        _bug_oneword = re.compile(u"^[a-z]+$", re.U).match(
                            math_tex_tmp.lower())
                        math_text = math_text.replace(m.group(1), math_tex_tmp)
                    math_representation = _egomath_inst.math_from_mathml(math_text)
                    # FIX one word
                    if _bug_oneword and math_representation.count("*") > 0:
                        math_representation = u"Tex: %s\nego0 : %s\nego8 : id" % (
                            math_tex_tmp, math_tex_tmp)
                        logger_suspicious.debug(
                            u"Fixing oneword [%s]", math_tex_tmp)
                    # Log problem
                    if math_representation is None:
                        logger_suspicious.warning(
                            u"NULL returned from egomath [\n%s\n] in [%s]",
                            math_text.replace("\n", " ").replace("\"", "\\\""),
                            id_str)
                    if math_tex_tmp is not None and (
                            math_representation is None
                            or 0 == len(math_representation)):
                        # try latex...
                        math_representation = _egomath_inst.math_from_tex(
                            math_tex_tmp)
                    if math_representation is None or 0 == len(math_representation):
                        logger_suspicious.warning(
                            u"Empty math returned from egomath [\n%s\n] in [%s]",
                            math_text.replace("\n", " ").replace("\"", "\\\""),
                            id_str)
                    # if not math.mathml_db is None:
                    #     i = mathml.get_id( math_text )
                    #     if not i is None:
                    #         # add it to db
                    #         math.mathml_db.add_ego_math( math_representation, i )
                # tex version
                #
                else:
                    # do a bit of html->normal cleanup
                    math_text = unescape_recursive(math_text)
                    math_text = self.texify(math_text, leave_nl=True)
                    # do the conversion
                    math_representation = _egomath_inst.math_from_tex(math_text)
                    # simulate text repre
                    math_text_tex = math_text.replace("\n", " ").strip()
                    math_representation = "Tex: %s\n" % math_text_tex + math_representation
            # `as` form is valid on both Python 2.6+ and Python 3
            except Exception as e:
                logger.exception(u"Cannot convert [%s] because of [%s]",
                                 math_text, utils.uni(e))
        else:
            logger_suspicious.info(u"Empty math in [%s]", id_str)

        if math_representation is None or len(math_representation) == 0:
            continue

        # convert Tex: line to "showable" tex
        # convert \d: line to math + end token so we can simulate full match
        #
        already_in = set()
        result_math = u""
        for line in math_representation.split("\n"):
            line = line.strip()
            if len(line) == 0:
                continue
            # deduplicity - already there
            if line in already_in:
                continue
            already_in.add(line)
            # insert proper tags
            if not line.startswith(tex_start):
                line = u"%s %s" % (line, math_sep[1])
            else:
                line = u"%s %s %s %s" % (tex_start, math_sep[0],
                                         line[len(tex_start):], math_sep[1])
            result_math += u"%s\n" % line

        # ensure we get all formulae unparsed
        #
        if len(result_math) > 0:
            #math_field += self.snippet_populate(text, result_math, match, start_pos, end_pos)
            math_field += [self.snippet_populate(text, result_math, match,
                                                 start_pos, end_pos)]
def append_if_exists(item): if item: addr.append(uni(item))