def dump_tokens(limit):
    """ Iterate through parsed articles and print a list of tokens
        and their matched terminals.

        limit: maximum number of articles to process, or None for all. """
    # Cache of TerminalDescriptor instances, keyed by terminal name
    descriptors = dict()
    with closing(BIN_Db.get_db()) as db:
        with SessionContext(commit = True) as session:
            # Fetch parsed articles in chronological order
            q = session.query(Article) \
                .filter(Article.tree != None) \
                .order_by(Article.timestamp)
            articles = q.all() if limit is None else q[0:limit]
            for a in articles:
                print("\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}".format(a))
                tree = TreeTokenList()
                tree.load(a.tree)
                for ix, toklist in tree.sentences():
                    print("\nSentence {0}:".format(ix))
                    # True only while looking at the first word of a sentence
                    at_start = True
                    for t in toklist:
                        if t.tokentype == "WORD":
                            # Cut the enclosing quotes off the token text
                            wrd = t.token[1:-1]
                            try:
                                td = descriptors[t.terminal]
                            except KeyError:
                                td = TerminalDescriptor(t.terminal)
                                descriptors[t.terminal] = td
                            stem = td.stem(db, wrd, at_start)
                            at_start = False
                            print("   {0} {1} {2}".format(wrd, stem, t.terminal))
                        else:
                            print("   {0.token} {0.cat} {0.terminal}".format(t))
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news.

        Each entry is a dict with name, title, gender, url (article URL)
        and uuid (article id) keys, sorted by name using Icelandic collation.

        limit: maximum number of distinct names to return. """
    toplist = dict()
    # Fix: close the BÍN database handle when done, consistent with the
    # other users of BIN_Db.get_db() in this codebase
    with closing(BIN_Db.get_db()) as bindb:
        with SessionContext(commit=True) as session:
            # Go through up to 2 * N records
            q = session.query(Person.name, Person.title, Person.article_url, Article.id) \
                .join(Article).join(Root) \
                .filter(Root.visible) \
                .order_by(desc(Article.timestamp))[0:limit * 2]
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is longer than the previous one
                if p.name not in toplist or len(p.title) > len(toplist[p.name][0]):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name)
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted:
                        # terminate the loop
                        break
    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"])
        )
def execute(self):
    """ Execute the query contained in the previously parsed tree;
        return True if successful """
    if self._tree is None:
        self.set_error("E_QUERY_NOT_PARSED")
        return False
    # Reset the result state before processing the tree
    self._error = None
    self._qtype = None
    bin_db = BIN_Db.get_db()
    try:
        # Process the tree, which has only one sentence
        self._tree.process(self._session, _THIS_MODULE, bin_db, query=self)
    finally:
        # Always release the database handle
        # (equivalent to contextlib.closing)
        bin_db.close()
    return self._error is None
def go_single(self, url):
    """ Single article processor that will be called by a process
        within a multiprocessing pool """
    print("Processing article {0}".format(url))
    sys.stdout.flush()
    # Load the article from the scraper database
    with closing(self._db.session) as session:
        try:
            article = session.query(Article) \
                .filter_by(url=url).one_or_none()
            if article is None:
                print("Article not found in scraper database")
            else:
                if article.tree:
                    # The article has a parse tree: run the processors on it
                    tree = Tree(url, article.authority)
                    tree.load(article.tree)
                    with closing(BIN_Db.get_db()) as bin_db:
                        # Run all processors in turn
                        for proc in self.processors:
                            tree.process(session, proc, bin_db)
                # Mark the article as being processed
                article.processed = datetime.utcnow()
                # So far, so good: commit to the database
                session.commit()
        except Exception as e:
            # If an exception occurred, roll back the transaction
            session.rollback()
            print(
                "Exception in article {0}, transaction rolled back\nException: {1}"
                .format(url, e)
            )
            raise
    sys.stdout.flush()
def execute(self):
    """ Execute the query contained in the previously parsed tree;
        return True if successful """
    if self._tree is None:
        self.set_error("E_QUERY_NOT_PARSED")
        return False
    # Clear any previous result state
    self._error = None
    self._qtype = None
    with closing(BIN_Db.get_db()) as bin_db:
        # Bundle the processing context that the tree processor expects
        state = dict(
            session=self._session,
            processor=_THIS_MODULE,
            bin_db=bin_db,
            query=self,
        )
        # Process the first and only sentence within the tree
        self._tree.process_sentence(state, self._tree[1])
    return self._error is None
def process(self, session, processor):
    """ Process a tree for an entire article.

        For each sentence in turn, a depth-first traversal is done,
        visiting each parent node after visiting its children. """
    # Fetch the optional article_begin/article_end hooks from the processor
    if processor:
        article_begin = getattr(processor, "article_begin", None)
        article_end = getattr(processor, "article_end", None)
    else:
        article_begin = None
        article_end = None
    with closing(BIN_Db.get_db()) as bin_db:
        # Running state that is kept between sentences
        state = dict(
            session=session,
            processor=processor,
            bin_db=bin_db,
            url=self.url,
            authority=self.authority,
        )
        # Call the article_begin(state) function, if it exists
        if article_begin is not None:
            article_begin(state)
        # Process the (parsed) sentences in the article
        for _, sent_tree in self.s.items():
            self.process_sentence(state, sent_tree)
        # Call the article_end(state) function, if it exists
        if article_end is not None:
            article_end(state)
def add_composite(stofn, ordfl):
    """ Add composite word forms by putting a prefix on existing BIN
        word forms. Called from the config file handler.

        stofn: 'prefix-stem' string; ordfl: word category to match. """
    from bindb import BIN_Db
    assert stofn is not None
    assert ordfl is not None
    parts = stofn.split("-")
    if len(parts) != 2:
        raise ConfigError(
            "Composite word meaning must contain a single hyphen")
    prefix, stem = parts
    with closing(BIN_Db.get_db()) as db:
        meanings = db._forms(stem)
        if meanings:
            for w in meanings:
                if w.ordfl != ordfl:
                    continue
                # Build a new meaning tuple with the prefix glued on
                t = (prefix + w.stofn, 0, ordfl, w.fl,
                     prefix + w.ordmynd, w.beyging)
                Meanings.DICT[prefix + w.ordmynd].append(t)
                Meanings.ROOT[prefix + w.stofn].append(t)
This module is written in Python 3 """ from settings import Settings, ConfigError from scraperdb import SessionContext, Person from bindb import BIN_Db try: # Read configuration file Settings.read("config/Reynir.conf") except ConfigError as e: print("Configuration error: {0}".format(e)) quit() with SessionContext(commit=True) as session, BIN_Db.get_db() as bdb: # Iterate through the persons q = session.query(Person) \ .filter((Person.gender == None) | (Person.gender == 'hk')) \ .order_by(Person.name) \ .yield_per(200) lastname = "" for p in q: p.gender = bdb.lookup_name_gender(p.name) if p.name != lastname: print("{0} {1}".format(p.gender, p.name)) lastname = p.name
total_tags - missing_tag_tnt, 100.0 * (total_tags - missing_tag_tnt) / total_tags)) print("Correct tags: {0:8} {1:6.2f}%".format( correct_tag_tnt, 100.0 * correct_tag_tnt / total_tags)) print("Partial tags: {0:8} {1:6.2f}%".format( partial_tag_tnt + correct_tag_tnt, 100.0 * (partial_tag_tnt + correct_tag_tnt) / total_tags)) print("Partial prec: {0:8} {1:6.2f}%".format( "", 100.0 * (partial_tag_tnt + correct_tag_tnt) / (total_tags - missing_tag_tnt))) print("Precision: {0:8} {1:6.2f}%".format( "", 100.0 * correct_tag_tnt / (total_tags - missing_tag_tnt))) print("\n-----------------------------------\n") if __name__ == "__main__": try: # Read configuration file Settings.read(os.path.join(basepath, "config/Reynir.conf")) except ConfigError as e: print("Configuration error: {0}".format(e)) quit() # This is always run as a main program try: with timeit("test_tagger()"): test_tagger() finally: BIN_Db.cleanup()
sentence_stream = Article.sentence_stream(limit = TRAINING_SET, skip = TEST_SET) word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream) tnt_tagger.train(word_tag_stream) with timeit(f"Train TnT tagger on IFD training set"): # Get a sentence stream from parsed articles # Number of sentences, size of training set sample_ratio = 50 word_tag_stream = IFD_Corpus().word_tag_stream(skip = lambda n: n % sample_ratio == 0) tnt_tagger.train(word_tag_stream) with timeit(f"Store TnT model trained on {tnt_tagger.count} sentences"): tnt_tagger.store(_TNT_MODEL_FILE) if __name__ == "__main__": print("Welcome to the Greynir POS tagging trainer\n") try: # Read configuration file Settings.read(os.path.join(basepath, "config", "Reynir.conf")) except ConfigError as e: print("Configuration error: {0}".format(e)) quit() # This is always run as a main program try: with timeit("Training session"): train_tagger() finally: BIN_Db.cleanup()
""" from settings import Settings, ConfigError from scraperdb import SessionContext, Person from bindb import BIN_Db try: # Read configuration file Settings.read("config/Reynir.conf") except ConfigError as e: print("Configuration error: {0}".format(e)) quit() with SessionContext(commit=True) as session: bdb = BIN_Db.get_db() # Iterate through the persons q = session.query(Person) \ .filter((Person.gender == None) | (Person.gender == 'hk')) \ .order_by(Person.name) \ .yield_per(200) lastname = "" for p in q: p.gender = bdb.lookup_name_gender(p.name) if p.name != lastname: print("{0} {1}".format(p.gender, p.name)) lastname = p.name
""" from settings import Settings, ConfigError from db import SessionContext from db.models import Person from bindb import BIN_Db try: # Read configuration file Settings.read("config/Reynir.conf") except ConfigError as e: print("Configuration error: {0}".format(e)) quit() with SessionContext(commit = True) as session, BIN_Db.get_db() as bdb: # Iterate through the persons q = session.query(Person) \ .filter((Person.gender == None) | (Person.gender == 'hk')) \ .order_by(Person.name) \ .yield_per(200) lastname = "" for p in q: p.gender = bdb.lookup_name_gender(p.name) if p.name != lastname: print("{0} {1}".format(p.gender, p.name)) lastname = p.name