def kmer_info(A: ahocorasick.Automaton, fastq: str) -> pd.DataFrame:
    """Find k-mers in the input fastq file.

    :param A: Aho-Corasick automaton with all the k-mers loaded into it
    :param fastq: filepath for the input fastq file
    :return: k-mer frequency at SNP positions found in the test fastq
    """
    kmer_seq_counts = defaultdict(int)
    for _, sequence in fp.parse_fastq(fastq):
        for idx, (_, kmer_seq, _) in A.iter(sequence):
            kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    # Build the frame in one pass; row-wise DataFrame.append was removed in
    # pandas 2.0 and is quadratic in the number of rows.
    return pd.DataFrame(res, columns=['POS', 'kmer_seq', 'freq'])

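# A minimal sketch (hypothetical names) of the automaton kmer_info expects:
# the key is the k-mer sequence itself, so A.get(kmer_seq) round-trips, and
# the stored value is a (kmername, kmer_seq, is_revcomp) triple matching the
# unpacking above.
import ahocorasick

A = ahocorasick.Automaton()
for kmername, kmer_seq in [('100-A', 'ACGTACGTACGTACGTA'),
                           ('100-T', 'ACGTACGTTCGTACGTA')]:
    A.add_word(kmer_seq, (kmername, kmer_seq, False))
A.make_automaton()
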
def match(AC: ahocorasick.Automaton, tokens: List[str]) -> List[Tuple[str, int, int, Set[str]]]:
    """
    :param AC: the finalized Aho-Corasick automaton.
    :param tokens: the list of input tokens.
    :return: a list of tuples where each tuple consists of
             - span: str,
             - start token index (inclusive): int
             - end token index (exclusive): int
             - a set of values for the span: Set[str]
    """
    # map character offsets in the joined text back to token indices
    smap, emap, idx = dict(), dict(), 0
    for i, token in enumerate(tokens):
        smap[idx] = i
        idx += len(token)
        emap[idx] = i
        idx += 1  # account for the joining space

    # find matches
    text = ' '.join(tokens)
    spans = []
    for eidx, t in AC.iter(text):
        eidx += 1
        sidx = eidx - len(t.span)
        sidx = smap.get(sidx, None)
        eidx = emap.get(eidx, None)
        if sidx is None or eidx is None:
            continue
        spans.append((t.span, sidx, eidx + 1, t.values))

    return spans

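# A runnable sketch of the payload match() expects: each stored value needs
# a .span attribute (the matched surface string) and a .values attribute
# (a set of labels). The namedtuple here is an assumption, not part of the
# original code.
from collections import namedtuple
import ahocorasick

Entry = namedtuple('Entry', ['span', 'values'])

AC = ahocorasick.Automaton()
for span, values in [('New York', {'CITY'}), ('York', {'CITY'})]:
    AC.add_word(span, Entry(span, values))
AC.make_automaton()

print(match(AC, ['I', 'love', 'New', 'York']))
# both hits end on token boundaries, so both are reported:
# ('New York', 2, 4, {'CITY'}) and ('York', 3, 4, {'CITY'})
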
def extract_tokenized(line: str, wn: Type[ExtractableWordnet], auto: Automaton, id: str) -> TokenizedTagging:
    tagging = TokenizedTagging(wn)
    tokens = line.split(" ")
    starts = list(get_tokens_starts(tokens))
    extract_tokenized_iter(tagging, auto.iter(tokens), wn, tokens, starts, id)
    return tagging

def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton, STORE_INTS

    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))
    benchmark("list(automaton.iter(LINE))", locals())

def extract_auto(line: str, wn: Type[ExtractableWordnet], auto: Automaton, from_id: str) -> UntokenizedTagging:
    tagging = UntokenizedTagging(wn)
    for tok_idx, (end_pos, (token, wn_to_lemma)) in enumerate(auto.iter(line)):
        groups = wn.synset_group_lemmas(objify_lemmas(wn_to_lemma))
        tags = []
        for group in groups:
            tag_group = TaggedLemma(token)
            tag_group.lemma_objs = group
            tags.append(tag_group)
        tagging.add_tags(token, [Anchor(from_id, end_pos - len(token) + 1)], tags)
    return tagging

def match_kmers_to_reads(A: Automaton, *reads_paths) -> pd.DataFrame:
    kmer_counts = {}
    for reads in reads_paths:
        for sequence in yield_reads(reads):
            for _, (_, kmer) in A.iter(sequence):
                try:
                    kmer_counts[kmer] += 1
                except KeyError:
                    kmer_counts[kmer] = 1
    return pd.DataFrame(pd.Series(kmer_counts, name='count', dtype=int))

def find_in_fasta(A: Automaton, fasta: str) -> pd.DataFrame:
    """Find scheme kmers in input fasta file

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fasta: Input fasta path

    Returns:
        Dataframe with any matches found in input fasta file
    """
    res = []
    for contig_header, sequence in parse_fasta(fasta):
        for idx, (kmername, kmer_seq, is_revcomp) in A.iter(sequence):
            res.append((kmername, kmer_seq, is_revcomp, contig_header, idx))
    df = pd.DataFrame(
        res,
        columns=['kmername', 'seq', 'is_revcomp', 'contig_id', 'match_index'])
    return df

def tag_with_dict(company_trie: Automaton, sents: list, duplicate=None) -> Tuple[list, list]:
    sent_tags = []
    sent_text = []
    for sent in sents:
        text = ''.join(sent).strip()
        text = unicodedata.normalize('NFKC', text)
        chunks = []
        tags = ['O'] * len(text)

        # find all chunks, e.g.
        # [[48, 53, '愛知学泉大'], [122, 130, 'シャンソン化粧品'],
        #  [131, 135, 'ジャパン'], [131, 139, 'ジャパンエナジー'],
        #  [133, 134, 'パ'], [140, 144, '第一勧銀']]
        for idx, (_, w) in company_trie.iter(text):
            end_idx = idx + 1
            start_idx = end_idx - len(w)
            chunks.append([start_idx, end_idx, w])

        if len(chunks) != 0:
            # filter overlapping chunks, e.g.
            # [[122, 130, 'シャンソン化粧品'], [131, 139, 'ジャパンエナジー'], [140, 144, '第一勧銀']]
            chunks = filter_chunks(chunks)
            # generate BIO labels
            for chunk in chunks:
                start_idx, end_idx = chunk[0], chunk[1]
                if duplicate:
                    # only tag names that appear more than once in the dataset,
                    # e.g. 'シャンソン化粧品' when it is in the duplicate set
                    if chunk[2] in duplicate:
                        for tag_idx in range(start_idx, end_idx):
                            tags[tag_idx] = 'B-company' if tag_idx == start_idx else 'I-company'
                else:
                    for tag_idx in range(start_idx, end_idx):
                        tags[tag_idx] = 'B-company' if tag_idx == start_idx else 'I-company'
        sent_tags.append(tags)
        sent_text.append([x for x in text])
    return sent_tags, sent_text

class AhoCorasickPathGenerator:
    def __init__(self, identifier_mapper, identifiers):
        self.identifier_mapper = identifier_mapper
        self.identifiers = identifiers
        self.automaton = Automaton()
        for identifier in identifiers:
            mapped = identifier_mapper(identifier)
            self.automaton.add_word(identifier, (len(identifier), mapped))
        self.automaton.make_automaton()
        self.dest_dirs = set()

    def blind_path(self, path):
        # replace every identifier hit with its mapped form; assumes
        # identifier hits do not overlap each other in the path
        out = ''
        idx = 0
        for end_position, (length, mapped) in self.automaton.iter(path):
            end_idx = end_position + 1
            start_idx = end_idx - length
            out += path[idx:start_idx] + mapped
            idx = end_idx
        out += path[idx:]
        return out

    def __call__(self, input_dir, output_dir):
        for root, dirs, files in os.walk(input_dir):
            for name in files:
                source_file_name = os.path.join(root, name)
                relpath = os.path.relpath(
                    source_file_name,
                    start=input_dir,
                )
                dest_file_name = output_dir / self.blind_path(relpath)
                self.dest_dirs.add(abspath(dest_file_name.parent))
                yield (
                    abspath(source_file_name),
                    abspath(dest_file_name),
                )

    @property
    def init_lines(self):
        return "\n".join(f'mkdir -p "{dest_dir}"' for dest_dir in self.dest_dirs) + "\n"

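# A runnable sketch of blind_path; the mapper and identifiers below are made
# up for illustration. Identifiers found anywhere in a path are replaced by
# their mapped form.
blinder = AhoCorasickPathGenerator(
    identifier_mapper=lambda ident: 'user' + str(len(ident)),  # hypothetical mapping
    identifiers=['alice', 'bob'],
)
print(blinder.blind_path('home/alice/data/bob.txt'))
# -> 'home/user5/data/user4.txt'
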
def find_in_fastqs(A: Automaton, *fastqs):
    """Find scheme kmers in input fastq files

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fastqs: Input fastq file paths

    Returns:
        Dataframe with any matches found in input fastq files
    """
    kmer_seq_counts = defaultdict(int)
    for fastq in fastqs:
        for _, sequence in parse_fastq(fastq):
            for idx, (_, kmer_seq, _) in A.iter(sequence):
                kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    df = pd.DataFrame(res, columns=['kmername', 'seq', 'freq'])
    return df

class Gazetteer:
    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()
        with open(gaze_file) as cin:
            self.load_gazes(cin)
        self.automaton.make_automaton()

    def load_gazes(self, cin):
        for line in cin:
            line = line.split('\t')
            line[-1] = line[-1].rstrip()
            self.locations[line[0]] = tuple(line)
            for vocab in line[3:]:
                if vocab in self.vocab_to_location:
                    self.vocab_to_location[vocab].append(line[0])
                else:
                    self.vocab_to_location[vocab] = [line[0]]
        for vocab, value in self.vocab_to_location.items():
            self.automaton.add_word(vocab, tuple(value))

    def match(self, string):
        ret = {}
        for end_index, value in self.automaton.iter(string):
            for lid in value:
                if lid in ret:
                    ret[lid] = (ret[lid][0], ret[lid][1] + 1)
                else:
                    ret[lid] = (self.locations[lid], 1)
        return ret

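# A runnable sketch of the gazetteer file format load_gazes expects:
# tab-separated, location id in column 0, metadata in columns 1-2, and alias
# vocabulary from column 3 onward. All ids and values here are made up.
import tempfile

rows = [
    "tokyo\t35.68\t139.69\tTokyo\tTokio",
    "nyc\t40.71\t-74.01\tNew York\tNYC",
]
with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False) as f:
    f.write('\n'.join(rows))

gaz = Gazetteer(gaze_file=f.name)
print(gaz.match('Flights from Tokyo to NYC'))
# each matched location id maps to (location tuple, hit count), e.g.
# {'tokyo': (('tokyo', '35.68', '139.69', 'Tokyo', 'Tokio'), 1), 'nyc': (..., 1)}
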
from ahocorasick import Automaton

auto = Automaton()
auto.add_word('wounded', 'wounded')
auto.make_automaton()

# the same haystack, with and without a non-BMP emoji character
for item in auto.iter('Winning \U0001F629 so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)

for item in auto.iter('Winning so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)

class TrieTree:
    '''Prefix-tree (trie) class for dictionary matching.

    Parameters
    ----------
    vocab_paths: one or more dictionary file names (str or list). Each line
        holds tab-separated columns: the word, the word's tag, and an
        optional numeric value for that tag (defaults to True if absent),
        e.g.:
            中国	LOC	0.8
            美国	国家
    vocab_match_type: matching type, one of "c", "m", "mc" (default "mc"),
        producing respectively:
            c:  "BIES + _ + tag"
            m:  "BIES + _"
            mc: both "BIES + _" and "BIES + _ + tag"

    Return
    ------
    defaultdict(dict, {idx_0: {feature: value}, idx_1: ...})
        a mapping from character index to that position's feature dict

    Examples
    --------
    >>> trietree_c = TrieTree(vocab_paths=your_vocab_files, vocab_match_type='c')
    >>> trietree_c("中国是一个国家")
    defaultdict(dict, {0: {'B_LOC': True}, 1: {'E_LOC': True}})

    >>> trietree_m = TrieTree(vocab_paths=your_vocab_files, vocab_match_type='m')
    >>> trietree_m("中国是一个国家")
    defaultdict(dict, {0: {'B': True}, 1: {'E': True}})

    >>> trietree_mc = TrieTree(vocab_paths=your_vocab_files, vocab_match_type='mc')
    >>> trietree_mc("中国是一个国家")
    defaultdict(dict, {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})
    '''

    def __init__(self, vocab_paths, vocab_match_type='mc', drop_vocab_pro=0,
                 vocab_name_space=False, separator='\t'):
        self.match_cnt = Counter()
        self.user_automaton = {}
        self.keep_vocab_pro = 1 - drop_vocab_pro
        self.vocab_name_space = vocab_name_space
        self.vmp = vocab_match_type
        self.load_vocab(vocab_paths, separator=separator)
        self.cnt = Counter()
        print('trietree:\ntp: %s\nvocab path: %s' % (self.vmp, str(vocab_paths)))
        if self.keep_vocab_pro < 1:
            print('drop vocab pro', self.keep_vocab_pro)

    def __call__(self, *args, **kwargs):
        vocab_feature = self._vocab_feature(*args, **kwargs)
        return vocab_feature

    def load_vocab(self, paths, add=False, separator='\t'):
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()
        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)
                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'
                        else:
                            continue
                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)
                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line), e)
                        continue
        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags
        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))
        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()

    def _vocab_feature(self, sentence):
        vocab_feature = defaultdict(dict)
        self.match(sentence, vocab_feature)
        if self.user_automaton:
            self.match(sentence, vocab_feature, base_or_user='user')
        return vocab_feature

    def match(self, sentence, vocab_feature, base_or_user='base'):
        if base_or_user == 'base':
            result = self.automaton.iter(sentence)
        else:
            result = self.user_automaton.iter(sentence)
        for end_idx, (word_len, _, tag_value) in list(result):
            start_idx = end_idx - word_len + 1
            for tag, value in tag_value:
                self.match_cnt[tag] += 1
                if self.vmp == 'c':
                    tagss = [create_tag(word_len, tag)]
                elif self.vmp == 'm':
                    tagss = [create_tag(word_len, '')]
                elif self.vmp == 'mc':
                    tagss = [
                        create_tag(word_len, tag),
                        create_tag(word_len, '')
                    ]
                else:
                    tagss = []
                for tags in tagss:
                    for idx, tag in zip(range(start_idx, end_idx + 1), tags):
                        vocab_feature[idx][tag] = value

    def init_user_automaton(self):
        self.user_automaton = Automaton()
        self.user_automaton.make_automaton()

    def add_word(self, word, tag, value, update=True):
        '''
        Parameters
        ----------
        word: the word to match
        tag: the tag attached to the word
        value: the numeric value attached to the tag

        Examples
        --------
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('国家', True)])
        '''
        if self.user_automaton == {}:
            self.init_user_automaton()
        wl, w, tag_values = self.user_automaton.get(word, (len(word), word, []))
        for i, (t, v) in enumerate(tag_values):
            if t == tag:
                tag_values[i] = (tag, value)
                break
        else:
            tag_values.append((tag, value))
        self.user_automaton.add_word(w, (wl, w, tag_values))
        if update:
            self.user_automaton.make_automaton()

    def add_words(self, word_tag_values):
        '''
        Does:
            for word, tag, value in word_tag_values:
                self.add_word(word, tag, value, update=False)

        Examples
        --------
        >>> word_tag_values = [('中国', '面积', 9666), ('中国', '人口', 8888)]
        >>> trietree.add_words(word_tag_values)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('面积', 9666), ('人口', 8888)])
        '''
        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)
        self.user_automaton.make_automaton()

    def get(self, key, default=None, vocab='all'):
        '''Same as dict.get.

        Parameters
        ----------
        vocab: which vocabulary to query: 'base' (built-in), 'user'
            (user-defined) or 'all' (both). Defaults to 'all'.
        '''
        if vocab == 'base':
            value = self.automaton.get(key, default)
        elif vocab == 'user':
            value = self.user_automaton.get(key, default)
        else:
            value = {
                'base': self.automaton.get(key, default),
                'user': self.user_automaton.get(key, default)
            }
        return value

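# create_tag is external to the class above; a plausible sketch consistent
# with the docstring's BIES examples (single-character words get S, longer
# words get B...E, and an empty tag yields bare B/I/E/S labels):
def create_tag(word_len, tag):
    suffix = '_' + tag if tag else ''
    if word_len == 1:
        return ['S' + suffix]
    return ['B' + suffix] + ['I' + suffix] * (word_len - 2) + ['E' + suffix]

# e.g. create_tag(2, 'LOC') -> ['B_LOC', 'E_LOC'], matching the tp='c' example
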
def extract_features(
    doc: str,
    ngram: int = DEFAULT_NGRAM,
    dict_automaton: ahocorasick.Automaton = DICT_AUTOMATON,
) -> List[List]:
    len_doc = len(doc)
    look_range = list(range(1, int(ngram / 2) + 1))

    # Get (start, end) candidates from dictionary
    dict_start_boundaries = set()
    dict_end_boundaries = set()
    for end_index, length in dict_automaton.iter(doc):
        start_index = end_index - length + 1
        dict_start_boundaries.add(start_index)
        dict_end_boundaries.add(end_index)

    doc_features = []
    for i, char in enumerate(doc):
        ct = get_chartype(char)
        char_features = ["bias", "t={}".format(ct)]
        if ct not in GENERIC_CHARTYPES:
            if char == "\n":
                char = "EOL"
            char_features.append("c={}".format(char))
        if i == 0:
            char_features.append("BOS")  # Beginning of string
        elif i == len_doc - 1:
            char_features.append("EOS")  # End of string

        # Look backward
        for j in look_range:
            if i >= j:
                c = doc[i - j]
                ct = get_chartype(c)
                char_features.append("t-{}={}".format(j, ct))
                if ct not in GENERIC_CHARTYPES:
                    if c == "\n":
                        c = "EOL"
                    char_features.append("c-{}={}".format(j, c))
            else:
                break

        # Look forward
        for j in look_range:
            if i < len_doc - j:
                c = doc[i + j]
                ct = get_chartype(c)
                char_features.append("t{}={}".format(j, ct))
                if ct not in GENERIC_CHARTYPES:
                    if c == "\n":
                        c = "EOL"
                    char_features.append("c{}={}".format(j, c))
            else:
                break

        char_features.append("ds=" + ("y" if i in dict_start_boundaries else "n"))
        char_features.append("de=" + ("y" if i in dict_end_boundaries else "n"))
        doc_features.append(char_features)
    return doc_features

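# The hits above unpack as (end_index, length), so the value stored for each
# dictionary word must be its length. A minimal sketch of building such an
# automaton (DICT_WORDS is a hypothetical word list):
import ahocorasick

DICT_WORDS = ["weather", "new york", "tokyo"]
DICT_AUTOMATON = ahocorasick.Automaton()
for word in DICT_WORDS:
    DICT_AUTOMATON.add_word(word, len(word))
DICT_AUTOMATON.make_automaton()
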
class ReadTagger:
    def __init__(
        self,
        bc_to_id: Dict[str, str],
        len_linker: int,
        len_primer: int,
        *,
        max_mm: int = 1,
        use_stats: bool = True
    ):
        self.bc_to_id = bc_to_id
        self.len_linker = len_linker
        self.len_primer = len_primer
        self.stats = None if not use_stats else dict(
            n_only_primer=0,
            n_multiple_bcs=0,
            n_no_barcode=0,
            n_regular=0,
            n_barcode_mismatch=0,
            n_junk=0,
        )

        self.automaton = Automaton()
        all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
        for pattern, barcode in all_barcodes.items():
            self.automaton.add_word(pattern, barcode)
        self.automaton.make_automaton()

    def search_barcode(self, read: str) -> Iterator[Tuple[int, int, str]]:
        for end, barcode in self.automaton.iter(read):
            start = end - len(barcode) + 1
            yield start, end + 1, barcode

    def tag_read(self, header: str, seq_read: str, seq_qual: str) -> TaggedRead:
        # as ordered set
        matches = OrderedDict((match, None) for match in self.search_barcode(seq_read))
        match_iter: Iterator[Tuple[int, int, str]] = iter(matches)

        bc_start, bc_end, barcode = next(match_iter, (None, None, None))
        bc_id = self.bc_to_id.get(barcode)
        other_barcodes = frozenset(set(self.bc_to_id[bc] for _, _, bc in match_iter) - {bc_id})

        if barcode is not None:
            linker_end = bc_end + self.len_linker if bc_end else None
            junk = seq_read[:bc_start] or None
            linker = seq_read[bc_end:linker_end]
            amplicon = seq_read[linker_end:]
            barcode_mismatch = seq_read[bc_start:bc_end] != barcode
        else:
            junk = None
            linker = None
            amplicon = seq_read
            barcode_mismatch = False

        read = TaggedRead(
            header, seq_qual, self.len_primer,
            junk, bc_id, linker, amplicon,
            other_barcodes, barcode_mismatch,
        )

        if self.stats is not None:
            for name, pred in PREDS.items():
                if pred(read):
                    self.stats[name] += 1

        return read

    def get_barcode_table(self, plain=False):
        cell_templates = {
            (True, True): '{}',
            (True, False): '<span class="b">{}</span>',
            (False, True): '<span class="a">{}</span>',
            (False, False): '<span class="both">{}</span>',
        }
        patterns = sorted({
            bc
            for bc_pairs in self.blacklist.values()
            for pair in bc_pairs
            for bc in pair
        })
        sprs = pd.DataFrame(index=patterns, columns=patterns, dtype=str)
        for pattern, bc_pairs in self.blacklist.items():
            for bc1, bc2 in bc_pairs:
                sprs.loc[bc1, bc2] = ''.join(
                    cell_templates[bc1[i] == base, bc2[i] == base].format(base)
                    for i, base in enumerate(pattern)
                )
        # max_colwidth=None (not the deprecated -1) disables truncation
        with pd.option_context('display.max_colwidth', None):
            html = sprs.to_html(escape=False, na_rep='')
        if plain:
            return html
        return HTML_INTRO + html

# Fragment: the block below originally ran inside a loop making
# `total_iterations` passes (index `x`) over candidate words (`value`).
if total_words_to_search != total_words_added:
    words_to_search.append(value)
    total_words_added += 1
if x == 0:
    total_initial_words += 1
A.add_word(value, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words * total_iterations + 1}")
A.make_automaton()

start1 = process_time()
for word_to_search in words_to_search:
    for match in A.iter(word_to_search):
        pass
end1 = process_time()
print(
    f"Took {end1 - start1}sec to match {len(words_to_search)} patterns on a AC automaton with {total_initial_words * total_iterations}"
)

# Took 0.0668650930000001sec for 25000 patterns (change var total_words_to_search, above.)
# Took 0.23291606600000003sec for 100000 patterns
# Took 0.4542991380000001sec for 200000 patterns
# Took 0.684820883sec for 300000 patterns

# Took 0.061951500999999964sec to match 25000 patterns on a AC automaton with 63000
# Took 0.06499120199999997sec to match 25000 patterns on a AC automaton with 126000
# Took 0.066342342sec to match 25000 patterns on a AC automaton with 189000
# Took 0.07048644500000001sec to match 25000 patterns on a AC automaton with 315000