def _generate(self):
    """Rebuild the term automaton if the entity data has changed.

    ``Entity.latest()`` is used as a change marker: nothing happens when it
    is ``None`` or when ``self.latest`` is already at least as recent.
    Otherwise every term in ``regex_terms`` of each active entity is mapped
    to the list of ids of the entities carrying it, and a fresh
    ``Automaton`` is built from that mapping. ``self.automaton`` is set to
    ``None`` when there are no terms at all.
    """
    latest = Entity.latest()
    if latest is None:
        return
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest

    matches = {}
    q = Entity.all()
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in q:
        for term in entity.regex_terms:
            matches.setdefault(term, []).append(entity.id)

    if not matches:
        self.automaton = None
        return

    self.automaton = Automaton()
    # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
    for term, entities in matches.items():
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
def _generate(self):
    """Rebuild the term lookup and the batched regexes when data changed.

    Skips all work when ``self.latest`` is set and not older than
    ``Entity.latest()``. Otherwise ``self.matches`` is rebuilt as a mapping
    from ``normalize_strong(term)`` to the set of ids of active entities
    having that term, and ``self.regexes`` is rebuilt as a list of compiled
    alternations, each covering at most ``BATCH_SIZE`` terms. Terms of two
    characters or fewer are left out of the regexes (but stay in
    ``self.matches``).
    """
    latest = Entity.latest()
    up_to_date = self.latest is not None and self.latest >= latest
    if up_to_date:
        return
    self.latest = latest

    self.matches = defaultdict(set)
    query = Entity.all().options(joinedload('other_names'))
    query = query.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in query:
        for raw_term in entity.regex_terms:
            self.matches[normalize_strong(raw_term)].add(entity.id)

    self.regexes = []
    terms = [term for term in self.matches.keys() if len(term) > 2]

    # Compile the terms in consecutive batches; a term only matches when it
    # is delimited by a space or by the start/end of the text.
    offset = 0
    while True:
        batch = terms[offset:offset + BATCH_SIZE]
        if not batch:
            break
        pattern = '( |^)(%s)( |$)' % '|'.join(batch)
        self.regexes.append(re.compile(pattern))
        offset += BATCH_SIZE

    log.info('Generating entity tagger: %r (%s terms)', latest, len(terms))
def _generate(self):
    """Feed entity terms into the automaton if the entity data has changed.

    ``Entity.latest()`` is used as a change marker: nothing happens when it
    is ``None`` or when ``self.latest`` is already at least as recent.
    Entities whose ``schema`` has no entry in ``self.TYPES`` are skipped.
    Each remaining term from ``regex_terms`` is mapped to a list of
    ``(entity.name, tag)`` pairs, which is stored in ``self.automaton``
    under the UTF-8 encoded term.
    """
    latest = Entity.latest()
    if latest is None:
        return
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest

    matches = {}
    for entity in Entity.all():
        tag = self.TYPES.get(entity.schema)
        if tag is None:
            continue
        for term in entity.regex_terms:
            matches.setdefault(term, []).append((entity.name, tag))

    if not matches:
        return

    # NOTE(review): words are added to the existing ``self.automaton``
    # instead of a new instance, so terms from an earlier run presumably
    # remain in it -- confirm this is intended.
    # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
    for term, entities in matches.items():
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
def _generate(self):
    """Feed entity names into the automaton if the entity data has changed.

    ``Entity.latest()`` is used as a change marker: nothing happens when it
    is ``None`` or when ``self.latest`` is already at least as recent.
    Entities whose ``schema`` has no entry in ``self.TYPES`` are skipped, as
    are names that are ``None`` or longer than 120 characters. Each
    remaining name is converted with ``match_form``; the result is mapped
    to a list of ``(name, tag)`` pairs, which is stored in
    ``self.automaton`` under the UTF-8 encoded match form.
    """
    latest = Entity.latest()
    if latest is None:
        return
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest

    matches = {}
    for entity in Entity.all():
        tag = self.TYPES.get(entity.schema)
        if tag is None:
            continue
        for name in entity.names:
            if name is None or len(name) > 120:
                continue
            match = match_form(name)
            # TODO: this is a weird heuristic, but to avoid overly
            # aggressive matching it may make sense: only forms that
            # contain a space are kept.
            if match is None or ' ' not in match:
                continue
            matches.setdefault(match, []).append((name, tag))

    if not matches:
        return

    # NOTE(review): words are added to the existing ``self.automaton``
    # instead of a new instance, so terms from an earlier run presumably
    # remain in it -- confirm this is intended.
    # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
    for term, entities in matches.items():
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
def _generate(self):
    """Rebuild the term automaton when the entity data has changed.

    Skips all work when ``self.latest`` is set and not older than
    ``Entity.latest()``. Otherwise each term in ``regex_terms`` of every
    active entity is mapped to the set of ids of the entities carrying it,
    and a new ``Automaton`` is built from that mapping, keyed by the UTF-8
    encoded term. ``self.automaton`` becomes ``None`` when no terms exist.
    """
    latest = Entity.latest()
    up_to_date = self.latest is not None and self.latest >= latest
    if up_to_date:
        return
    self.latest = latest

    ids_by_term = defaultdict(set)
    query = Entity.all().options(joinedload('other_names'))
    query = query.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in query:
        for term in entity.regex_terms:
            ids_by_term[term].add(entity.id)

    term_count = len(ids_by_term)
    if term_count == 0:
        self.automaton = None
        return

    # Assign first so the attribute refers to the new automaton while it is
    # being filled, exactly as before.
    automaton = self.automaton = Automaton()
    for term, entity_ids in ids_by_term.items():
        automaton.add_word(term.encode('utf-8'), entity_ids)
    automaton.make_automaton()
    log.info('Generated automaton with %s terms', term_count)