Example #1
def make_wordlist(filepath):
    with open(filepath, 'r') as f:
        wordlist = Automaton()
        for idx, word in enumerate(set(Base().encode(t) for t in f.read().split())):
            wordlist.add_word(word, (idx, word))
        # Build the automaton once, after all words have been added.
        wordlist.make_automaton()
    return wordlist
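
For context, a minimal usage sketch (the file path and scan text are hypothetical, and it assumes Base().encode returns plain strings): Automaton.iter yields (end_index, value) pairs, where value is the (idx, word) tuple stored by add_word above.

# Sketch only: scan a string with the automaton built by make_wordlist().
wordlist = make_wordlist("words.txt")  # hypothetical path
for end_index, (idx, word) in wordlist.iter("text to scan"):
    start_index = end_index - len(word) + 1
    print(start_index, end_index, word)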
Example #2
def test_add_concepts():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))

    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    assert len(tuple(automaton.keys())) == 15791

    first_keys = sorted(automaton.keys())[:10]
    assert first_keys == [
        '% REF', '(1-6)-alpha-glucomannan', '1 alpha-hydroxyergocalciferol',
        "1,1',1'',1'''-(ethylenedinitrilo)tetra-2-propanol",
        '1,1,1-trichloro-2,2,2-trifluoroethane', '1,1-difluoroethane',
        '1,10-decanediol', '1,10-phenanthroline', '1,2,6-hexanetriol',
        '1,2-Dipalmitoylphosphatidylcholine'
    ]

    first_concept_id, first_concept_name = automaton.get(first_keys[0])

    assert (first_concept_id, first_concept_name) == (8514, '% REF')
Example #3
def update_automaton(dataframe,
                     automaton_filename=path.join(PROCESSED_DATA_PATH,
                                                  "vocabulary_automaton.pkl")):
    # Assert we have the same amount of concept names and ids.
    assert len(dataframe["concept_name"]) == len(dataframe["concept_id"])

    try:
        with open(automaton_filename, "rb") as automaton_file:
            automaton = pickle.load(automaton_file)

        logging.info("Loaded previous automaton from path '{}'.".format(
            automaton_filename))
    except FileNotFoundError:
        logging.info("Created new automaton.")
        automaton = Automaton()

    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    with open(automaton_filename, "wb") as automaton_file:
        pickle.dump(automaton, automaton_file)

    logging.info(
        "Updated automaton under path '{}'.".format(automaton_filename))
    return automaton
Example #4
	def __init__(
		self,
		bc_to_id: Dict[str, str],
		len_linker: int,
		len_primer: int,
		*,
		max_mm: int = 1,
		use_stats: bool = True
	):
		self.bc_to_id = bc_to_id
		self.len_linker = len_linker
		self.len_primer = len_primer
		self.stats = None if not use_stats else dict(
			n_only_primer=0,
			n_multiple_bcs=0,
			n_no_barcode=0,
			n_regular=0,
			n_barcode_mismatch=0,
			n_junk=0,
		)
		
		self.automaton = Automaton()
		all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
		for pattern, barcode in all_barcodes.items():
			self.automaton.add_word(pattern, barcode)
		self.automaton.make_automaton()
Example #5
 def build_automata(vocab):
     # Build Aho-Corasick matching automata for vocabulary items
     # grouped by length.
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automata at {}'.format(
         start_time.strftime("%H:%M:%S")))
     strings = list(vocab)
     max_len = max(len(s) for s in strings)
     strings.sort(key=lambda s: len(s))
     strings_by_len = defaultdict(list)
     for k, g in groupby(strings, lambda s: len(s)):
         strings_by_len[k] = list(g)
     automata_by_len = {}
     for i in range(1, max_len + 1):
         if i not in strings_by_len:
             continue
         a = Automaton()
         for s in strings_by_len[i]:
             a.add_word(s, i)
         a.make_automaton()
         automata_by_len[i] = a
     end_time = datetime.now()
     info('finish building automata at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return automata_by_len
Example #6
    def build_automaton(self):
        q = Entity.all()
        q = q.filter(Entity.schema.in_(self.TYPES.keys()))

        matches = {}
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = self.match_form(name)
                if match is None:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        automaton = Automaton()
        for term, entities in matches.iteritems():
            automaton.add_word(term, entities)
        automaton.make_automaton()
        return automaton
Example #7
    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                if term in matches:
                    matches[term].append(entity.id)
                else:
                    matches[term] = [entity.id]

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.iteritems():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example #8
 def build_automata(vocab):
     # Build Aho-Corasick matching automata for vocabulary items
     # grouped by length. The wordpiece convention is inverted for
     # matching: continuations are unmarked (instead of "##") and
     # string start is marked by "^^".
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automata at {}'.format(
         start_time.strftime("%H:%M:%S")))
     strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
     max_len = max(len(s) for s in strings)
     strings.sort(key=lambda s: len(s))
     strings_by_len = defaultdict(list)
     for k, g in groupby(strings, lambda s: len(s)):
         strings_by_len[k] = list(g)
     automata_by_len = {}
     for i in range(1, max_len + 1):
         if i not in strings_by_len:
             continue
         a = Automaton()
         for s in strings_by_len[i]:
             a.add_word(s, i)
         a.make_automaton()
         automata_by_len[i] = a
     end_time = datetime.now()
     info('finish building automata at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return automata_by_len
Example #9
    def load_vocab(self, paths, add=False, separator='\t'):
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()

        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file Examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)

                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'

                        else:
                            continue

                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)

                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line),
                              e)
                        continue

        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags

        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))

        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()
Example #10
 def _make_kwtree(keywords):
     if keywords:
         kwtree = Automaton()
         for keyword in keywords:
             kwtree.add_word(keyword, keyword)
         kwtree.make_automaton()
     else:
         kwtree = None
     return kwtree
Example #11
 def __init__(self, identifier_mapper, identifiers):
     self.identifier_mapper = identifier_mapper
     self.identifiers = identifiers
     self.automaton = Automaton()
     for identifier in identifiers:
         mapped = identifier_mapper(identifier)
         self.automaton.add_word(identifier, (len(identifier), mapped))
     self.automaton.make_automaton()
     self.dest_dirs = set()
Example #12
    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()

        with open(gaze_file) as cin:
            self.load_gazes(cin)

        self.automaton.make_automaton()
Example #13
    def _get_keyword_processor(self, custom_vocab: List[str]):
        keyword_processor = Automaton()

        for i, keyword in enumerate(custom_vocab):
            if len(keyword) > 1:
                keyword_processor.add_word(keyword, (i, keyword))

        keyword_processor.make_automaton()
        return keyword_processor
Example #14
def initialize_ac_automaton(kmers: pd.DataFrame):

    A = Automaton()

    for idx, kmer in enumerate(set(kmers['kmer'])):
        A.add_word(kmer, (idx, kmer))

    A.make_automaton()

    return A
Example #15
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton, STORE_INTS

    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))

    benchmark("list(automaton.iter(LINE))", locals())
Example #16
 def build_automaton(vocab):
     # Build Aho-Corasick matching automaton for vocabulary items
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automaton at {}'.format(
         start_time.strftime("%H:%M:%S")))
     a = Automaton()
     for v in vocab:
         a.add_word(v, len(v))
     a.make_automaton()
     end_time = datetime.now()
     info('finish building automaton at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return a
Example #17
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        A.add_word(sequence, (header, sequence, False))
        A.add_word(revcomp(sequence), (header, sequence, True))
    A.make_automaton()
    return A
Example #18
def _get_automaton(normalizer):
    with compiler_lock:
        if normalizer in AUTOMATA:
            return AUTOMATA.get(normalizer)
        aho = Automaton()
        count = 0
        for place in iter_places():
            name = place.get('name')
            norm = normalizer(name)
            value = (place.get('code'), place.get('country'))
            aho.add_word(norm, value)
            count += 1
        log.debug("Country automaton: %d places", count)
        aho.make_automaton()
        AUTOMATA[normalizer] = aho
        return aho
Example #19
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        kmer_list = expand_degenerate_bases(sequence)
        for seq in kmer_list:
            A.add_word(seq, (header, seq, False))
            A.add_word(revcomp(seq), (header, seq, True))
    A.make_automaton()
    return A
Example #20
def test_match_text():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))

    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    matches = list(generate_matches(automaton=automaton, text=dummy_abstract))
    match_soll_values = [(54, (46257025, 'ethyl acetate')),
                         (653, (45616149, 'formic acid')),
                         (785, (8512, 'day'))]
    assert matches == match_soll_values
Example #21
 def create_automaton(dict_dir, vocab_suffix, min_word_len=3):
     assert isinstance(min_word_len, int) or isinstance(min_word_len, dict)
     automaton = Automaton()
     if os.path.isdir(dict_dir):
         dicts_path = [
             os.path.join(dict_dir, i) for i in os.listdir(dict_dir)
             if i.endswith(vocab_suffix)
         ]
     else:
         dicts_path = [dict_dir]
     for path in dicts_path:
         # Remove the suffix to get the tag (str.strip would drop characters, not a suffix).
         tag = os.path.split(path)[-1][:-len(vocab_suffix)]
         vocab = set(readfile(path, deal_func=lambda x: x.strip()))
         tag_min_word_len = min_word_len if isinstance(
             min_word_len, int) else min_word_len[tag]
         for word in vocab:
             word_len = len(word)
             if word_len >= tag_min_word_len:
                 automaton.add_word(word, (word_len, word, tag))
     automaton.make_automaton()
     return automaton
Example #22
    def __create_automaton(self):
        paths = [
            ('Brand', os.path.join(Path.dictionary, 'Brand.txt')),
            ('Car', os.path.join(Path.dictionary, 'Car.txt')),
            ('Train', os.path.join(Path.dictionary, 'Train.txt')),
            ('Predicate', os.path.join(Path.dictionary, 'config.txt'))
        ]
        automaton = Automaton()
        for tag, path in paths:
            with open(path, 'r') as r_f:
                for line in r_f:
                    line = line.rstrip('\n')
                    _, *words = line.split('\t')
                    for word in words:
                        word = re.sub(r'\(.*?\)', '', word.lower())
                        _, tag_set = automaton.get(word, (word, set()))
                        tag_set.add(tag)
                        automaton.add_word(word, (word, tag_set))

        automaton.make_automaton()
        return automaton
Example #23
    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example #24
 async def _update_links_automaton(self):
     """
     Fetch the latest version of the links from the table, build an automaton.
     """
     logger.info(
         "_update_links_automaton: fetching links from table %s",
         self._links_table,
     )
     try:
         links = await self._api.run_db_interaction(
             "Fetch links from the table", _db_fetch_links,
             self._links_table)
         logger.info("_update_links_automaton: we received %d links",
                     len(links))
         new_link_automaton = Automaton(ahocorasick.STORE_LENGTH)
         for link in links:
             new_link_automaton.add_word(link)
         await make_deferred_yieldable(
             deferToThread(new_link_automaton.make_automaton))
         self._link_automaton = new_link_automaton
     except Exception as e:
         logger.exception("_update_links_automaton: could not update")
         raise e
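
A minimal sketch (with a hypothetical list of links) of the STORE_LENGTH mode used above: add_word needs no value argument, and iter yields (end_index, word_length) pairs, which is enough to slice each match back out of the text.

import ahocorasick
from ahocorasick import Automaton

links = ["https://example.org", "http://example.com/page"]  # hypothetical data
link_automaton = Automaton(ahocorasick.STORE_LENGTH)
for link in links:
    link_automaton.add_word(link)  # STORE_LENGTH stores len(link) as the value
link_automaton.make_automaton()

text = "see https://example.org for details"
for end_index, length in link_automaton.iter(text):
    print(text[end_index - length + 1:end_index + 1])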
Example #25
def default_phrase_mapper():
    """Return a phrase mapper initialized with the default phrases."""
    return PhraseMapper(Automaton(), DEFAULT_PHRASES).setup()
Example #26
def location_phrase_mapper():
    """Return a phrase mapper initialized with the location phrases."""
    return PhraseMapper(Automaton(), LOCATION_PHRASES).setup()
Example #27
from ahocorasick import Automaton
from pickle import load, dump

auto = Automaton()
auto.add_word('abc', 'abc')

auto.add_word('def', 'def')

with open('automaton-wee.pickle', 'wb') as dest:
    dump(auto, dest)
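
The snippet above imports load but only shows the dump side; a minimal sketch of the read side (same hypothetical pickle file) follows. Since make_automaton() was never called before pickling, the restored object is assumed to be a plain trie and is finished here before searching.

# Sketch only: restore the pickled trie and build the search automaton.
with open('automaton-wee.pickle', 'rb') as src:
    auto = load(src)

auto.make_automaton()
print(list(auto.iter('xxx abc yyy def')))  # [(6, 'abc'), (14, 'def')]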
Example #28
import json
import random
import threading

from ahocorasick import Automaton

# python dictionary with trie patterns
file_words = json.load(open("../static_ioc_sample_30k.txt", "r"))
words_to_search = list()
trie_words = list()

total_words_to_search = 1000
total_words_added = 0

t = list()
patterns = dict()
total_initial_words = 0
total_iterations = 10  # CHANGE the number of iterations to perform: +/- 30k patterns per iteration.
A = Automaton()
for x in range(0, total_iterations):
    print("In iteration ", x)
    for key in file_words:
        for value in file_words[key]:
            value_random = value + str(random.randint(10000, 500000))
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value_random, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()
Example #29
    def __init__(self, *args, **kwargs):
        super(AhoCorasick, self).__init__(*args, **kwargs)

        # Use the Aho-Corasick search algorithm to speed up phrase lookups
        self.automaton = Automaton()
Example #30
 def __init__(self):
     self.latest = None
     self.automaton = Automaton()
     self.matches = {}