def make_wordlist(filepath):
    with open(filepath, 'r') as f:
        wordlist = Automaton()
        for idx, word in enumerate(set(Base().encode(t) for t in f.read().split())):
            wordlist.add_word(word, (idx, word))
        wordlist.make_automaton()
        return wordlist
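# Usage sketch for make_wordlist (the file name is hypothetical, and the
# scan assumes the (idx, word) payload format stored above); iter() yields
# (end_index, value) pairs for every match.
wordlist = make_wordlist("words.txt")
for end_index, (idx, word) in wordlist.iter("text to scan"):
    start_index = end_index - len(word) + 1
    print(start_index, end_index, word)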
def test_add_concepts():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))
    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))
    automaton.make_automaton()
    assert len(tuple(automaton.keys())) == 15791
    first_keys = sorted(automaton.keys())[:10]
    assert first_keys == [
        '% REF',
        '(1-6)-alpha-glucomannan',
        '1 alpha-hydroxyergocalciferol',
        "1,1',1'',1'''-(ethylenedinitrilo)tetra-2-propanol",
        '1,1,1-trichloro-2,2,2-trifluoroethane',
        '1,1-difluoroethane',
        '1,10-decanediol',
        '1,10-phenanthroline',
        '1,2,6-hexanetriol',
        '1,2-Dipalmitoylphosphatidylcholine'
    ]
    first_concept_id, first_concept_name = automaton.get(first_keys[0])
    assert (first_concept_id, first_concept_name) == (8514, '% REF')
def update_automaton(dataframe,
                     automaton_filename=path.join(PROCESSED_DATA_PATH,
                                                  "vocabulary_automaton.pkl")):
    # Assert we have the same amount of concept names and ids.
    # (The original compared the columns elementwise and took len() of the
    # resulting boolean series, which is always truthy for non-empty data.)
    assert len(dataframe["concept_name"]) == len(dataframe["concept_id"])
    try:
        with open(automaton_filename, "rb") as automaton_file:
            automaton = pickle.load(automaton_file)
        logging.info("Loaded previous automaton from path '{}'.".format(
            automaton_filename))
    except FileNotFoundError:
        logging.info("Created new automaton.")
        automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))
    automaton.make_automaton()
    with open(automaton_filename, "wb") as automaton_file:
        pickle.dump(automaton, automaton_file)
    logging.info(
        "Updated automaton under path '{}'.".format(automaton_filename))
    return automaton
def __init__(
        self,
        bc_to_id: Dict[str, str],
        len_linker: int,
        len_primer: int,
        *,
        max_mm: int = 1,
        use_stats: bool = True
):
    self.bc_to_id = bc_to_id
    self.len_linker = len_linker
    self.len_primer = len_primer
    self.stats = None if not use_stats else dict(
        n_only_primer=0,
        n_multiple_bcs=0,
        n_no_barcode=0,
        n_regular=0,
        n_barcode_mismatch=0,
        n_junk=0,
    )
    self.automaton = Automaton()
    all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
    for pattern, barcode in all_barcodes.items():
        self.automaton.add_word(pattern, barcode)
    self.automaton.make_automaton()
def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length.
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = list(vocab)
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
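# Minimal sketch of querying the per-length automata returned by
# build_automata; the vocabulary is invented, and the helper's other
# dependencies (info, datetime, defaultdict, groupby) are assumed in scope.
# Each stored value is the pattern length, so spans fall out directly.
automata_by_len = build_automata(["cat", "cart", "art"])
for length, automaton in sorted(automata_by_len.items()):
    for end_index, value in automaton.iter("a cart full of cats"):
        print(length, end_index - value + 1, end_index)  # length, start, end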
def build_automaton(self):
    q = Entity.all()
    q = q.filter(Entity.schema.in_(self.TYPES.keys()))
    matches = {}
    for entity in q:
        tag = self.TYPES.get(entity.schema)
        if tag is None:
            continue
        for name in entity.names:
            if name is None or len(name) > 120:
                continue
            match = self.match_form(name)
            if match is None:
                continue
            if match in matches:
                matches[match].append((name, tag))
            else:
                matches[match] = [(name, tag)]
    if not len(matches):
        return
    automaton = Automaton()
    for term, entities in matches.items():  # dict.iteritems() is Python 2 only
        automaton.add_word(term, entities)
    automaton.make_automaton()
    return automaton
def _generate(self):
    latest = Entity.latest()
    if latest is None:
        return
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest
    matches = {}
    q = Entity.all()
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in q:
        for term in entity.regex_terms:
            if term in matches:
                matches[term].append(entity.id)
            else:
                matches[term] = [entity.id]
    if not len(matches):
        self.automaton = None
        return
    self.automaton = Automaton()
    for term, entities in matches.items():  # dict.iteritems() is Python 2 only
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length. The wordpiece convention is inverted for
    # matching: continuations are unmarked (instead of "##") and
    # string start is marked by "^^".
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
def load_vocab(self, paths, add=False, separator='\t'):
    # Reuse the existing automaton only when add=True and one exists.
    if not (add and hasattr(self, 'automaton')):
        self.automaton = Automaton()
    vocab = defaultdict(list)
    tags = set()
    if isinstance(paths, str):
        paths = [paths]
    for path in paths:
        name_space = os.path.split(path)[-1]
        print('read %s' % path)
        output = os.popen('wc -l ' + path)
        total = int(output.readline().split()[0])
        with open(path, 'r') as r_f:
            print('vocab file examples:')
            for n, line in enumerate(r_f):
                print(line.strip())
                if n >= 10:
                    break
            r_f.seek(0)
            for line in tqdm(r_f, desc='read file', total=total):
                if random.random() > self.keep_vocab_pro:
                    continue
                splits = line.strip().split(separator)
                try:
                    if len(splits) == 2:
                        word, tag = splits
                        value = True
                    elif len(splits) == 3:
                        word, tag, value = splits
                        value = char2num(value)
                    elif len(splits) == 1:
                        word = splits[0]
                        value = True
                        tag = 'WORD'
                    else:
                        continue
                    if self.vocab_name_space:
                        tag = name_space + '_' + tag
                    vocab[word].append((tag, value))
                    if tag not in tags:
                        tags.add(tag)
                except Exception as e:
                    print('vocab error: path-%s, line %s' % (path, line), e)
                    continue
    self.tags = tags if not hasattr(self, 'tags') else self.tags | tags
    for word, value in tqdm(vocab.items(), desc='add words'):
        self.automaton.add_word(word, (len(word), word, value))
    print('%s words in total' % len(vocab))
    self.automaton.make_automaton()
def _make_kwtree(keywords):
    if keywords:
        kwtree = Automaton()
        for keyword in keywords:
            kwtree.add_word(keyword, keyword)
        kwtree.make_automaton()
    else:
        kwtree = None
    return kwtree
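# Usage sketch for _make_kwtree with invented keywords; iter() reports
# every match, including overlapping ones.
kwtree = _make_kwtree(["he", "she", "hers"])
if kwtree is not None:
    for end_index, keyword in kwtree.iter("ushers"):
        print(keyword, "ends at", end_index)  # she/he end at 3, hers at 5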
def __init__(self, identifier_mapper, identifiers):
    self.identifier_mapper = identifier_mapper
    self.identifiers = identifiers
    self.automaton = Automaton()
    for identifier in identifiers:
        mapped = identifier_mapper(identifier)
        self.automaton.add_word(identifier, (len(identifier), mapped))
    self.automaton.make_automaton()
    self.dest_dirs = set()
def __init__(self, gaze_file=data_path):
    self.locations = {}
    self.vocab_to_location = {}
    self.automaton = Automaton()
    with open(gaze_file) as cin:
        self.load_gazes(cin)
    self.automaton.make_automaton()
def _get_keyword_processor(self, custom_vocab: List[str]):
    keyword_processor = Automaton()
    for i, keyword in enumerate(custom_vocab):
        if len(keyword) > 1:
            keyword_processor.add_word(keyword, (i, keyword))
    keyword_processor.make_automaton()
    return keyword_processor
def initialize_ac_automaton(kmers: pd.DataFrame):
    A = Automaton()
    for idx, kmer in enumerate(set(kmers['kmer'])):
        A.add_word(kmer, (idx, kmer))
    A.make_automaton()
    return A
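# Hypothetical call to initialize_ac_automaton: build a tiny k-mer table
# and scan a made-up sequence; overlapping k-mer hits are all reported.
kmers = pd.DataFrame({"kmer": ["ACGT", "GTAC"]})
A = initialize_ac_automaton(kmers)
for end_index, (idx, kmer) in A.iter("AAACGTACGT"):
    print(kmer, "ends at", end_index)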
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton
    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()
    print(list(automaton.iter(LINE)))
    benchmark("list(automaton.iter(LINE))", locals())
def build_automaton(vocab):
    # Build Aho-Corasick matching automaton for vocabulary items
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automaton at {}'.format(
        start_time.strftime("%H:%M:%S")))
    a = Automaton()
    for v in vocab:
        a.add_word(v, len(v))
    a.make_automaton()
    end_time = datetime.now()
    info('finish building automaton at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return a
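# Usage sketch: build_automaton stores each item's length as its value,
# so the match span can be recovered from the end index alone (the
# vocabulary here is invented; info/datetime are assumed in scope).
a = build_automaton(["foo", "foobar"])
for end_index, length in a.iter("a foobar here"):
    print(end_index - length + 1, end_index)  # start, end of each match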
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
        Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        A.add_word(sequence, (header, sequence, False))
        A.add_word(revcomp(sequence), (header, sequence, True))
    A.make_automaton()
    return A
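# Hypothetical scan with the automaton from init_automaton; the fasta
# path and sequence below are placeholders. The stored payloads unpack
# to (header, kmer, is_revcomp), flagging reverse-strand hits.
A = init_automaton("scheme.fasta")
for end_index, (header, kmer, is_revcomp) in A.iter("ACGTACGTACGT"):
    strand = "-" if is_revcomp else "+"
    print(header, strand, end_index - len(kmer) + 1)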
def _get_automaton(normalizer):
    with compiler_lock:
        if normalizer in AUTOMATA:
            return AUTOMATA.get(normalizer)
        aho = Automaton()
        count = 0
        for place in iter_places():
            name = place.get('name')
            norm = normalizer(name)
            value = (place.get('code'), place.get('country'))
            aho.add_word(norm, value)
            count += 1
        log.debug("Country automaton: %d places", count)
        aho.make_automaton()
        AUTOMATA[normalizer] = aho
        return aho
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
        Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        kmer_list = expand_degenerate_bases(sequence)
        for seq in kmer_list:
            A.add_word(seq, (header, seq, False))
            A.add_word(revcomp(seq), (header, seq, True))
    A.make_automaton()
    return A
def test_match_text():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))
    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))
    automaton.make_automaton()
    matches = list(generate_matches(automaton=automaton, text=dummy_abstract))
    match_soll_values = [(54, (46257025, 'ethyl acetate')),
                         (653, (45616149, 'formic acid')),
                         (785, (8512, 'day'))]
    assert matches == match_soll_values
def create_automaton(dict_dir, vocab_suffix, min_word_len=3):
    assert isinstance(min_word_len, int) or isinstance(min_word_len, dict)
    automaton = Automaton()
    if os.path.isdir(dict_dir):
        dicts_path = [
            os.path.join(dict_dir, i) for i in os.listdir(dict_dir)
            if i.endswith(vocab_suffix)
        ]
    else:
        dicts_path = [dict_dir]
    for path in dicts_path:
        # str.strip(vocab_suffix) would strip any of those *characters* from
        # both ends, not the suffix; slice the suffix off instead.
        filename = os.path.split(path)[-1]
        tag = filename[:-len(vocab_suffix)] if filename.endswith(vocab_suffix) else filename
        vocab = set(readfile(path, deal_func=lambda x: x.strip()))
        tag_min_word_len = min_word_len if isinstance(
            min_word_len, int) else min_word_len[tag]
        for word in vocab:
            word_len = len(word)
            if word_len >= tag_min_word_len:
                automaton.add_word(word, (word_len, word, tag))
    automaton.make_automaton()
    return automaton
def __create_automaton(self):
    paths = [
        ('Brand', os.path.join(Path.dictionary, 'Brand.txt')),
        ('Car', os.path.join(Path.dictionary, 'Car.txt')),
        ('Train', os.path.join(Path.dictionary, 'Train.txt')),
        ('Predicate', os.path.join(Path.dictionary, 'config.txt'))
    ]
    automaton = Automaton()
    for tag, path in paths:
        with open(path, 'r') as r_f:
            for line in r_f:
                line = line.rstrip('\n')
                _, *words = line.split('\t')
                for word in words:
                    # Raw string avoids the invalid escape sequence '\(' .
                    word = re.sub(r'\(.*?\)', '', word.lower())
                    _, tag_set = automaton.get(word, (word, set()))
                    tag_set.add(tag)
                    automaton.add_word(word, (word, tag_set))
    automaton.make_automaton()
    return automaton
def _generate(self):
    latest = Entity.latest()
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest
    matches = defaultdict(set)
    q = Entity.all()
    q = q.options(joinedload('other_names'))
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in q:
        for term in entity.regex_terms:
            matches[term].add(entity.id)
    if not len(matches):
        self.automaton = None
        return
    self.automaton = Automaton()
    for term, entities in matches.items():
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
async def _update_links_automaton(self):
    """
    Fetch the latest version of the links from the table, build an automaton.
    """
    logger.info(
        "_update_links_automaton: fetching links from table %s",
        self._links_table,
    )
    try:
        links = await self._api.run_db_interaction(
            "Fetch links from the table", _db_fetch_links, self._links_table)
        logger.info("_update_links_automaton: we received %d links", len(links))
        new_link_automaton = Automaton(ahocorasick.STORE_LENGTH)
        for link in links:
            new_link_automaton.add_word(link)
        await make_deferred_yieldable(
            deferToThread(new_link_automaton.make_automaton))
        self._link_automaton = new_link_automaton
    except Exception as e:
        logger.exception("_update_links_automaton: could not update")
        raise e
def default_phrase_mapper():
    """Return a phrase mapper initialized with the default phrases."""
    return PhraseMapper(Automaton(), DEFAULT_PHRASES).setup()
def location_phrase_mapper():
    """Return a phrase mapper initialized with the location phrases."""
    return PhraseMapper(Automaton(), LOCATION_PHRASES).setup()
from ahocorasick import Automaton
from pickle import load, dump

auto = Automaton()
auto.add_word('abc', 'abc')
auto.add_word('def', 'def')

with open('automaton-wee.pickle', 'wb') as dest:
    dump(auto, dest)
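# The matching load side (a sketch): pyahocorasick pickles an automaton
# at any stage, so a trie saved before conversion can still be finalized
# with make_automaton() after unpickling.
with open('automaton-wee.pickle', 'rb') as src:
    auto = load(src)
auto.make_automaton()
print(list(auto.iter('xxabcydefz')))  # [(4, 'abc'), (8, 'def')]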
import json
import random
import threading

from ahocorasick import Automaton

# python dictionary with trie patterns
file_words = json.load(open("../static_ioc_sample_30k.txt", "r"))
words_to_search = list()
trie_words = list()
total_words_to_search = 1000
total_words_added = 0
t = list()
patterns = dict()
total_initial_words = 0
# CHANGE the number of iterations to perform: +/- 30k patterns per iteration.
total_iterations = 10

A = Automaton()
for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            value_random = value + str(random.randint(10000, 500000))
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value_random, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()
def __init__(self, *args, **kwargs):
    super(AhoCorasick, self).__init__(*args, **kwargs)
    # Use the Aho-Corasick search algorithm to speed up phrase lookups
    self.automaton = Automaton()
def __init__(self):
    self.latest = None
    self.automaton = Automaton()
    self.matches = {}