class Ner():
    """Dictionary-based entity matcher backed by an acora keyword automaton.

    Intended call order: ``set_ner_word_list()`` and ``set_ner_name()``,
    then ``build_ner()``, then ``hit()`` as often as needed.
    """

    def __init__(self):
        """Create a matcher with no entity words, no label and no automaton."""
        # Every entity word that should be recognised.
        self._ner_word_list = []
        # Label attached to every hit (the name the entity words map to).
        self._ner_name = ""
        # Builder that collects the keywords for the automaton.
        self._builder = AcoraBuilder()
        # Search automaton; stays None until build_ner() has been called.
        self._tree = None

    def set_ner_word_list(self, ner_word_list):
        """Replace the collection of entity words used by ``build_ner()``."""
        self._ner_word_list = ner_word_list

    def set_ner_name(self, ner_name):
        """Set the label reported alongside every hit."""
        self._ner_name = ner_name

    def build_ner(self):
        """Add every entity word to the builder and build the automaton.

        NOTE(review): the same builder instance is reused on every call, so
        words added by an earlier ``build_ner()`` presumably remain in later
        builds even after ``set_ner_word_list()`` replaced the list --
        confirm this is intended.
        """
        for word in self._ner_word_list:
            self._builder.add(word)
        self._tree = self._builder.build()

    def hit(self, content_str):
        """Return all matches found in ``content_str``.

        :param content_str: text to scan.
        :return: list of ``[hit_word, pos, ner_name]`` lists, one per pair
            yielded by the automaton's ``finditer()``.
        :raises AttributeError: if ``build_ner()`` has not been called yet.
        """
        return [[hit_word, pos, self._ner_name]
                for hit_word, pos in self._tree.finditer(content_str)]
def __init__(self, term_index):
    """Keep ``term_index`` and build an acora search engine from its entries.

    :param term_index: iterable whose items are the keywords to search for;
        it is stored unchanged on ``self.term_index``.
    """
    self.term_index = term_index

    trie_builder = AcoraBuilder()
    for term in term_index:
        trie_builder.add(term)

    # Search engine over every term yielded by term_index.
    self.ac = trie_builder.build()
def build_keyword_tries(seqs):
    """Build an acora keyword search engine from a collection of tags.

    :param seqs: iterable of tag sequences; each element is converted with
        ``str()`` before being added as a keyword.
    :return: the search engine returned by ``AcoraBuilder.build()``.
    """
    builder = AcoraBuilder()
    for seq in seqs:
        builder.add(str(seq))  # add every tag to the keyword trie
    return builder.build()
def build_keyword_tries(seqs):
    """Build an acora keyword search engine from a collection of tags.

    :param seqs: iterable of tag sequences; each element is converted with
        ``str()`` before being added as a keyword.
    :return: the search engine returned by ``AcoraBuilder.build()``.
    """
    builder = AcoraBuilder()
    for seq in seqs:
        builder.add(str(seq))  # add every tag to the keyword trie
    return builder.build()
def __init__(self, keywords, vocab=None):
    """Build the acora search engine for ``keywords``.

    :param keywords: iterable of keywords to search for (its type is not
        checked).
    :param vocab: stored unchanged on ``self.vocab``.
    """
    from acora import AcoraBuilder

    self.vocab = vocab

    engine_builder = AcoraBuilder()
    for keyword in keywords:
        engine_builder.add(keyword)

    # Search engine for the current keyword set.
    self.engine = engine_builder.build()
def _build(self):
    """Compile every configured regex and index one literal keyword per regex.

    Each entry of ``self._regexes_or_assoc`` is either a string (the regex)
    or a tuple whose first element is the regex and whose remaining elements
    are stored in ``self._translator``.  Regexes for which no keyword longer
    than ``self._hint_len`` can be extracted are collected in
    ``self._regexes_with_no_keywords``.

    :return: the acora search engine built from the extracted keywords.
    :raises ValueError: on a duplicated tuple regex or an unsupported entry.
    """
    builder = AcoraBuilder()

    for item in self._regexes_or_assoc:
        #
        # Step 1: compile the regular expression and store it in the
        # re_cache.
        #
        if isinstance(item, tuple):
            regex = item[0].encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

            if regex in self._translator:
                raise ValueError('Duplicated regex "%s"' % regex)

            self._translator[regex] = item[1:]

        elif isinstance(item, basestring):
            regex = item.encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

        else:
            raise ValueError('Can NOT build MultiRE with provided values.')

        #
        # Step 2: extract the string literals from the regular expression;
        # only one longer than hint_len is added to the acora index.
        #
        keywords = esmre.shortlist(esmre.hints(regex))

        # The first shortlisted keyword is the one used (the original code
        # comments describe it as the longest one).
        if not keywords or len(keywords[0]) <= self._hint_len:
            self._regexes_with_no_keywords.append(regex)
            continue

        # Index the lower-cased keyword and remember which regexes it
        # belongs to.
        keyword = keywords[0].lower()
        builder.add(keyword)
        self._keyword_to_re.setdefault(keyword, []).append(regex)

    return builder.build()
def _read_upper_sequences(fasta_file):
    """Return the upper-cased sequence string of every record in a FASTA file."""
    # The with-block closes the handle even if parsing raises.
    with open(fasta_file, 'r') as handle:
        return [str(record.seq).upper()
                for record in SeqIO.parse(handle, 'fasta')]


def _all_substrings(seq, min_length=4):
    """Return every substring of ``seq`` with at least ``min_length`` characters."""
    return [seq[i:i + n]
            for n in range(min_length, len(seq) + 1)
            for i in range(len(seq) - (n - 1))]


def _build_keyword_tries(substring_lists, label):
    """Build one acora search engine per substring list and print the time taken."""
    t0 = time.time()
    tries = []
    for substrings in substring_lists:
        builder = AcoraBuilder()
        for substring in substrings:
            builder.add(substring)
        tries.append(builder.build())
    # Single-argument print(): same output under Python 2 and Python 3.
    print('%s keyword tries built in %s seconds'
          % (label, round(time.time() - t0, 2)))
    return tries


def setup(vregions_file, jregions_file):
    """Load V and J gene sequences and build per-gene substring search engines.

    For every V gene the last 40 nucleotides are taken, for every J gene the
    first 40; one acora search engine is built per gene from all substrings
    (length >= 4) of that window.

    :param vregions_file: path of the FASTA file with the V regions.
    :param jregions_file: path of the FASTA file with the J regions.
    :return: ``(v_keyword_tries, j_keyword_tries, v_genes, j_genes)`` where
        the ``*_genes`` lists hold the full upper-cased sequences and the
        ``*_keyword_tries`` lists hold one search engine per gene, in the
        same order.
    """
    v_end_length = 40    # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider

    v_genes = _read_upper_sequences(vregions_file)
    all_v_substrings = [_all_substrings(v[-v_end_length:]) for v in v_genes]
    v_keyword_tries = _build_keyword_tries(all_v_substrings, 'V')

    j_genes = _read_upper_sequences(jregions_file)
    all_j_substrings = [_all_substrings(j[:j_start_length]) for j in j_genes]
    j_keyword_tries = _build_keyword_tries(all_j_substrings, 'J')

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
def _build(self):
    """Compile every configured regex and index one literal keyword per regex.

    Each entry of ``self._regexes_or_assoc`` is either a string (the regex)
    or a tuple whose first element is the regex and whose remaining elements
    are stored in ``self._translator``.  Regexes for which no keyword longer
    than ``self._hint_len`` can be extracted are collected in
    ``self._regexes_with_no_keywords``.

    :return: the acora search engine built from the extracted keywords.
    :raises ValueError: on a duplicated tuple regex or an unsupported entry.
    """
    builder = AcoraBuilder()

    for item in self._regexes_or_assoc:
        #
        # Step 1: compile the regular expression and store it in the
        # re_cache.
        #
        if isinstance(item, tuple):
            regex = item[0].encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

            if regex in self._translator:
                raise ValueError('Duplicated regex "%s"' % regex)

            self._translator[regex] = item[1:]

        elif isinstance(item, basestring):
            regex = item.encode(DEFAULT_ENCODING)
            self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

        else:
            raise ValueError('Can NOT build MultiRE with provided values.')

        #
        # Step 2: extract the string literals from the regular expression;
        # only one longer than hint_len is added to the acora index.
        #
        keywords = esmre.shortlist(esmre.hints(regex))

        # The first shortlisted keyword is the one used (the original code
        # comments describe it as the longest one).
        if not keywords or len(keywords[0]) <= self._hint_len:
            self._regexes_with_no_keywords.append(regex)
            continue

        # Index the lower-cased keyword and remember which regexes it
        # belongs to.
        keyword = keywords[0].lower()
        builder.add(keyword)
        self._keyword_to_re.setdefault(keyword, []).append(regex)

    return builder.build()
class Acora(object):
    """Keyword matcher whose dictionary is loaded from a text file."""

    def __init__(self, dic):
        """Read one keyword per line from ``dic`` and build the automaton.

        :param dic: path of the dictionary file; each line (without its
            trailing newline) is decoded as UTF-8 and added as a keyword.
        """
        self.__builder = AcoraBuilder()
        # The with-block closes the file even if add()/decode() raises.
        with open(dic) as fp:
            for line in fp:
                self.__builder.add(line.rstrip("\n").decode("utf-8"))
        self.__tree = self.__builder.build()

    def findall(self, content):
        """Return the matched keywords found in ``content``.

        :param content: text to scan.
        :return: list of matched words in the order yielded by the
            automaton's ``finditer()``; match positions are discarded.
        """
        return [hit_word for hit_word, _pos in self.__tree.finditer(content)]
def _read_upper_sequences(fasta_file):
    """Return the upper-cased sequence string of every record in a FASTA file."""
    # The with-block closes the handle even if parsing raises.
    with open(fasta_file, 'r') as handle:
        return [str(record.seq).upper()
                for record in SeqIO.parse(handle, 'fasta')]


def _all_substrings(seq, min_length=4):
    """Return every substring of ``seq`` with at least ``min_length`` characters."""
    return [seq[i:i + n]
            for n in range(min_length, len(seq) + 1)
            for i in range(len(seq) - (n - 1))]


def _build_keyword_tries(substring_lists, label):
    """Build one acora search engine per substring list and print the time taken."""
    t0 = time.time()
    tries = []
    for substrings in substring_lists:
        builder = AcoraBuilder()
        for substring in substrings:
            builder.add(substring)
        tries.append(builder.build())
    # Single-argument print(): same output under Python 2 and Python 3.
    print('%s keyword tries built in %s seconds'
          % (label, round(time.time() - t0, 2)))
    return tries


def setup(vregions_file, jregions_file):
    """Load V and J gene sequences and build per-gene substring search engines.

    For every V gene the last 40 nucleotides are taken, for every J gene the
    first 40; one acora search engine is built per gene from all substrings
    (length >= 4) of that window.

    :param vregions_file: path of the FASTA file with the V regions.
    :param jregions_file: path of the FASTA file with the J regions.
    :return: ``(v_keyword_tries, j_keyword_tries, v_genes, j_genes)`` where
        the ``*_genes`` lists hold the full upper-cased sequences and the
        ``*_keyword_tries`` lists hold one search engine per gene, in the
        same order.
    """
    v_end_length = 40    # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider

    v_genes = _read_upper_sequences(vregions_file)
    all_v_substrings = [_all_substrings(v[-v_end_length:]) for v in v_genes]
    v_keyword_tries = _build_keyword_tries(all_v_substrings, 'V')

    j_genes = _read_upper_sequences(jregions_file)
    all_j_substrings = [_all_substrings(j[:j_start_length]) for j in j_genes]
    j_keyword_tries = _build_keyword_tries(all_j_substrings, 'J')

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
def __init__(self, use_unicode=True, ignore_case=False, titles=None,
             extra_titles=None):
    """
    Build the acora search engine used to find job titles.

    :param use_unicode: whether to use `titles` as unicode or bytestrings
    :param ignore_case: if True ignore case in all matches
    :param titles: if given, overrides default `load_titles()` values
    :param extra_titles: if given, add to titles; may be a single title
        string or an iterable of title strings
    """
    titles = titles if titles else load_titles()
    titles = (titles if use_unicode
              else (s.encode('ascii') for s in titles))

    # Normalise extra_titles to a list of individual keywords.
    # AcoraBuilder.add() takes each keyword as its own positional argument,
    # so handing it a whole collection does not add the contained titles.
    if not extra_titles:
        extra_titles = []
    elif isinstance(extra_titles, (bytes, type(u''))):
        # A single title string: keep the original one-keyword behaviour.
        extra_titles = [extra_titles]
    else:
        extra_titles = list(extra_titles)

    if not use_unicode:
        # Give the extra titles the same byte encoding as `titles`; values
        # that are already bytestrings are left untouched.
        extra_titles = [s if isinstance(s, bytes) else s.encode('ascii')
                        for s in extra_titles]

    builder = AcoraBuilder()
    logging.info('building job title searcher')
    builder.update(titles)
    if extra_titles:
        builder.update(extra_titles)
    self.ac = builder.build(ignore_case=ignore_case)
    logging.info('building done')
def _build(self):
    """Build the acora search engine from ``self._keywords_or_assoc``.

    Each entry is either a string (the keyword) or a tuple whose first
    element is the keyword and whose remaining elements are stored in
    ``self._translator`` under the encoded keyword.

    :return: the acora search engine containing every encoded keyword.
    :raises ValueError: on a duplicated tuple keyword or an unsupported entry.
    """
    builder = AcoraBuilder()

    for item in self._keywords_or_assoc:
        has_assoc = isinstance(item, tuple)

        if not has_assoc and not isinstance(item, basestring):
            raise ValueError('Can NOT build MultiIn with provided values.')

        raw_keyword = item[0] if has_assoc else item
        keyword = raw_keyword.encode(DEFAULT_ENCODING)

        if has_assoc:
            if keyword in self._translator:
                raise ValueError('Duplicated keyword "%s"' % keyword)

            # Everything after the keyword is the data associated with it.
            self._translator[keyword] = item[1:]

        builder.add(keyword)

    return builder.build()
def analysis( Sequence_Reads, with_statistics=True, with_reverse_complement_search=True):
    """Assign V and J gene segments to sequence reads and write one line per assigned read.

    Python 2 code (print statements). For every FASTQ file in
    ``Sequence_Reads`` each read is searched for a full V tag and a full J
    tag; when a full tag is missing, the two half-tag tries are searched and
    a candidate is accepted if the read differs from the full tag by a
    Hamming distance of at most 1. Assigned reads are appended to
    "DecombinatorResults.txt" as ``v j del_v del_j nt_insert``.

    Files read from the current working directory:
    "human_TRBV_region.fasta", "human_TRBJ_region.fasta", "tags_trbv.txt",
    "tags_trbj.txt". "DecombinatorResults.txt" is truncated at the start.

    Relies on get_v_tags, get_j_tags, get_v_deletions and get_j_deletions,
    which are not defined in this block.

    :param Sequence_Reads: indexable collection of FASTQ file paths.
    :param with_statistics: if True, print the number of analysed and
        assigned sequences and the elapsed time at the end.
    :param with_reverse_complement_search: if True, a read that could not be
        assigned is retried on its reverse complement.
    :return: None; results are written to the output file.
    """
    # NOTE(review): np, dec, op, coll, clock, itemgetter and attrgetter do not
    # appear to be used anywhere in this function.
    import numpy as np
    import decimal as dec
    import string
    import operator as op
    import collections as coll
    from Bio import SeqIO
    from acora import AcoraBuilder
    from time import time, clock
    from string import Template
    from operator import itemgetter, attrgetter
    import Levenshtein as lev

    v_half_split, j_half_split = [10,6] # Do not change - V tags are split at position 10, J at position 6, to look for half tags if no full tag is found.

    ################
    print 'Commencing analysis on a total of', len(Sequence_Reads), 'file(s)'

    ## Create .txt file to store f=(v_index,j_index,v_deletions,j_deletions,nt_insert)
    # Opening in "w" mode and closing immediately truncates any earlier results file.
    analysis_file = open("DecombinatorResults.txt", "w")
    analysis_file.close()
    results = "DecombinatorResults.txt" # Name the .txt file to write to

    ################
    print ('Importing known V, D and J gene segments and tags...')

    handle = open("human_TRBV_region.fasta", "rU")
    v_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    handle = open("human_TRBJ_region.fasta", "rU")
    j_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    # Upper-cased V and J region sequences, in file order.
    v_regions = []
    for j in range(0, len(v_genes)):
        v_regions.append(string.upper(v_genes[j].seq))

    j_regions = []
    for j in range(0, len(j_genes)):
        j_regions.append(string.upper(j_genes[j].seq))

    ##############
    ## Build keyword tries of V and J tags for fast assignment
    # NOTE(review): the tag files opened here are never explicitly closed in this block.
    v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_trbv.txt", "rU"), v_half_split)
    j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_trbj.txt", "rU"), j_half_split)

    v_builder = AcoraBuilder()
    for i in range(0,len(v_seqs)):
        v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie

    v_key = v_builder.build()

    j_builder = AcoraBuilder()
    for i in range(0,len(j_seqs)):
        j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie

    j_key = j_builder.build()

    ##############
    ## Build keyword tries for first and second halves of both V and J tags
    v_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_v_seqs)):
        v_half1_builder.add(str(half1_v_seqs[i]))
    half1_v_key = v_half1_builder.build()

    v_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_v_seqs)):
        v_half2_builder.add(str(half2_v_seqs[i]))
    half2_v_key = v_half2_builder.build()

    j_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_j_seqs)):
        j_half1_builder.add(str(half1_j_seqs[i]))
    half1_j_key = j_half1_builder.build()

    j_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_j_seqs)):
        j_half2_builder.add(str(half2_j_seqs[i]))
    half2_j_key = j_half2_builder.build()

    ###############
    ## Initialise variables
    assigned_count = 0 # this will just increase by one every time we correctly assign a seq read with all desired variables
    seq_count = 0 # this will simply track the number of sequences analysed in file
    t0 = time() # Begin timer

    ###############
    ## Open .txt file created at the start of analysis
    analysis_file = open(results, "a")
    stemplate = Template('$v $j $del_v $del_j $nt_insert') # Creates stemplate, a holder, for f. Each line will have the 5 variables separated by a space

    ###############
    ## Begin analysing sequences
    # NOTE(review): the inner "for i in ..." loops below reuse the name i of this
    # outer loop; Sequence_Reads[i] is only read before they run.
    for i in range(len(Sequence_Reads)):

        print 'Importing sequences from', Sequence_Reads[i],' and assigning V and J regions...'
        handle = open(Sequence_Reads[i], "rU")

        for record in SeqIO.parse(handle, "fastq"):

            found_seq_match = 0
            seq_count += 1

            # Each hit from findall() is indexed as (matched tag, position in the read).
            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))

            if hold_v:
                v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
                    [ end_v, deletions_v] = get_v_deletions( record.seq, v_match, temp_end_v, v_regions )
            else:
                # No full V tag: look for either half and accept a full tag within Hamming distance 1.
                found_v_match = 0
                hold_v1 = half1_v_key.findall(str(record.seq))
                hold_v2 = half2_v_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    # All full-tag indices whose first half equals the matched half tag.
                    indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        # Length check: the read must be long enough to hold the whole tag from the hit position.
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        # NOTE(review): hold_v2[i][1] is the position of the second-half hit, yet it is
                        # used as the start of the full tag (no v_half_split offset, unlike the
                        # "- 6" applied in the J second-half branch) -- confirm this is intended.
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1

            if hold_j:
                j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                if get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    [ start_j, deletions_j] = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
            else:
                # No full J tag: same half-tag fallback as for V.
                found_j_match = 0
                hold_j1 = half1_j_key.findall(str(record.seq))
                hold_j2 = half2_j_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                # NOTE(review): unlike the V branch (v_match = k), the J match is the first
                                # index of the half tag rather than the candidate k -- confirm.
                                j_match = half1_j_seqs.index(hold_j1[i][0])
                                temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = half2_j_seqs.index(hold_j2[i][0])
                                # The literal 6 equals j_half_split as set at the top of this function.
                                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be
                                found_j_match += 1

            # A read is written out only when both deletion look-ups succeed.
            # NOTE(review): in the three half-tag branches below, end_v/deletions_v and/or
            # start_j/deletions_j are not recomputed for this read after a half-tag match,
            # so the values written may come from an earlier read -- confirm.
            if hold_v and hold_j:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq # Write to analysis_file (text file) the classification of the sequence
                    assigned_count += 1
                    found_seq_match = 1
            elif hold_v and found_j_match == 1:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1
            elif found_v_match == 1 and hold_j:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1
            elif found_v_match == 1 and found_j_match == 1:
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq
                    assigned_count += 1
                    found_seq_match = 1

            if found_seq_match == 0 and with_reverse_complement_search == True:

                #####################
                # REVERSE COMPLEMENT
                #####################
                # Same procedure as above, applied to the reverse complement of the read.

                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))

                if hold_v:
                    v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                    temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
                        [ end_v, deletions_v] = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                else:
                    found_v_match = 0
                    hold_v1 = half1_v_key.findall(str(record_reverse.seq))
                    hold_v2 = half2_v_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1

                if hold_j:
                    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                    if get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        [ start_j, deletions_j] = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                else:
                    found_j_match = 0
                    hold_j1 = half1_j_key.findall(str(record_reverse.seq))
                    hold_j2 = half2_j_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = half1_j_seqs.index(hold_j1[i][0])
                                    temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = half2_j_seqs.index(hold_j2[i][0])
                                    temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be
                                    found_j_match += 1

                if hold_v and hold_j:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq # Write to analysis_file (text file) the classification of the sequence
                        assigned_count += 1
                        found_seq_match = 1
                elif hold_v and found_j_match == 1:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
                elif found_v_match == 1 and hold_j:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
                elif found_v_match == 1 and found_j_match == 1:
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ) and get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq
                        assigned_count += 1
                        found_seq_match = 1
        handle.close()
    analysis_file.close()

    if with_statistics == True:
        timed = time() - t0
        print seq_count, 'sequences were analysed'
        print assigned_count, ' sequences were successfully assigned'
        print 'Time taken =', timed, 'seconds'
def __init__(self, text):
    """Store ``text`` and build the keyword finder.

    :param text: stored unchanged on ``self.text``.
    """
    self.text = text

    # Keywords to look for; the spellings are kept exactly as configured.
    tracked_terms = (
        "ownership",
        "owner",
        "own",
        "propietary",
        "tracking",
        "track",
        "store",
        "keep",
        "keeping",
    )

    finder_builder = AcoraBuilder()
    finder_builder.add(*tracked_terms)
    self.finder = finder_builder.build()
for protein, seq, blank in fxn.read_fa(in_file): mouse_proteins[protein.split(' ')[0]] = seq # Then scroll through non-predicted binder files, build an AC trie of all the peptides per file data_dir = '../Data/NonPredictedBinders/' matches = coll.defaultdict(fxn.nest_counter) all_peptides = coll.defaultdict(list) for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]: nam = f.split('-')[0] search_builder = AcoraBuilder() peptides = [] # Build trie with open(data_dir + f, 'rU') as in_file: for line in in_file: search_builder.add(line.rstrip()) peptides.append(line.rstrip()) all_peptides[f.split('-')[0]].append(line.rstrip()) seq_search = search_builder.build() # Use to search all proteins in proteome for protein in mouse_proteins: seq_check = seq_search.findall(mouse_proteins[protein]) if seq_check: for s in seq_check: matches[nam][s[0]] += 1 # Then fill in the zeroes (unmatched peptides) to get denominator for p in peptides: if p not in matches[nam]: matches[nam][p] = 0
v_regions.append(str(v_genes[v].seq).upper()) v_nams.append(v_genes[v].id.split("|")[1]) j_regions = [] j_nams = [] for j in range(0, len(j_genes)): j_regions.append(str(j_genes[j].seq).upper()) j_nams.append(v_genes[v].id.split("|")[1]) ## Build keyword tries of V and J tags for fast assignment v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_tr"+ chain.lower() + "v.txt", "rU"), v_half_split) j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_tr"+ chain.lower() + "j.txt", "rU"), j_half_split) v_builder = AcoraBuilder() for i in range(0,len(v_seqs)): v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie v_key = v_builder.build() j_builder = AcoraBuilder() for i in range(0,len(j_seqs)): j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie j_key = j_builder.build() ## Build keyword tries for first and second halves of both V and J tags v_half1_builder = AcoraBuilder() for i in range(0,len(half1_v_seqs)): v_half1_builder.add(str(half1_v_seqs[i])) half1_v_key = v_half1_builder.build()