Example #1
class Ner:

    # constructor
    def __init__(self):
        # list of all entity words
        self._ner_word_list = []

        # replacement name for matched entity words
        self._ner_name = ""

        # builder for the Aho-Corasick (AC) automaton
        self._builder = AcoraBuilder()

    # set the entity word list
    def set_ner_word_list(self, ner_word_list):
        self._ner_word_list = ner_word_list

    # set the entity replacement name
    def set_ner_name(self, ner_name):
        self._ner_name = ner_name

    # build the automaton
    def build_ner(self):
        for i in range(len(self._ner_word_list)):
            self._builder.add(self._ner_word_list[i])

        self._tree = self._builder.build()

    # report keyword hits in a string
    def hit(self, content_str):
        hit_list = []
        for hit_word, pos in self._tree.finditer(content_str):
            hit_list.append([hit_word, pos, self._ner_name])

        return hit_list
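
A minimal usage sketch for this class (the entity words and label below are invented, and it assumes `from acora import AcoraBuilder` ran before the class definition):

ner = Ner()
ner.set_ner_word_list(["Beijing", "Shanghai"])   # invented entity words
ner.set_ner_name("CITY")                         # invented label
ner.build_ner()
print(ner.hit("from Beijing to Shanghai"))
# each hit is [matched_word, start_position, label]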
Example #2
    def __init__(self, term_index):
        self.term_index = term_index

        builder = AcoraBuilder()
        for text in term_index:
            builder.add(text)
        self.ac = builder.build()
Example #3
    def match_lines(self, s, *keywords):
        '''
        Search the given text for the specified keywords.

        @param s  The text to search.
        @param keywords  The keywords to look for (in the source project,
                         a pair of [primary key, parameter]).

        @returns The lines in which at least one keyword occurs.
        '''

        builder = AcoraBuilder('\r', '\n', *keywords)
        ac = builder.build()

        line_start = 0
        matches = False
        for kw, pos in ac.finditer(s):
            if kw in '\r\n':
                if matches:
                    yield s[line_start:pos]
                    matches = False
                line_start = pos + 1
            else:
                matches = True
        if matches:
            yield s[line_start:]
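
A runnable driver for this generator, with the same body wrapped in a throwaway class (the class name Grep and the sample text are invented):

from acora import AcoraBuilder

class Grep:
    def match_lines(self, s, *keywords):
        # body copied from the example above
        builder = AcoraBuilder('\r', '\n', *keywords)
        ac = builder.build()
        line_start = 0
        matches = False
        for kw, pos in ac.finditer(s):
            if kw in '\r\n':
                if matches:
                    yield s[line_start:pos]
                    matches = False
                line_start = pos + 1
            else:
                matches = True
        if matches:
            yield s[line_start:]

for line in Grep().match_lines("foo bar\nbaz\nfoo baz\n", "foo"):
    print(line)   # prints "foo bar", then "foo baz"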
Example #4
def build_keyword_tries(seqs):

    builder = AcoraBuilder()
    for i in range(0,len(seqs)):
        builder.add(str(seqs[i])) # Add all V tags to keyword trie

    key = builder.build()
    return key
Example #5
def build_keyword_tries(seqs):

    builder = AcoraBuilder()
    for i in range(0, len(seqs)):
        builder.add(str(seqs[i]))  # Add all V tags to keyword trie

    key = builder.build()
    return key
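
A usage sketch for the helper above (the tag sequences are invented; `from acora import AcoraBuilder` must be in scope for the function):

trie = build_keyword_tries(["ACGT", "GTAC"])  # invented tag sequences
print(trie.findall("TTACGTACGT"))
# -> [('ACGT', 2), ('GTAC', 4), ('ACGT', 6)]  -- (keyword, start offset) pairs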
Example #6
def compare_search(s, filename, ignore_case, *keywords):
    setup_pya = setup_ca = setup_re = 0
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t
        t = time()
    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t
    if run_re:
        t = time()
        if hasattr(keywords[0], 'encode'): # unicode in Py3?
            kw_regexp = '|'.join(keywords)
        else:
            kw_regexp = '|'.encode('ASCII').join(keywords)
        if ignore_case:
            regexp = re.compile(kw_regexp, re.I)
        else:
            regexp = re.compile(kw_regexp)
        setup_re = time() - t
    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" % (
            ignore_case and 'in' or '',
            builder.for_unicode and 'unicode' or 'bytes',
            setup_pya, setup_ca, setup_re))

    if run_pa:
        timings = timeit.Timer(partial(py_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(paS): %.3f" % min(timings))
    if run_ca:
        timings = timeit.Timer(partial(c_acora.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(caS): %.3f" % min(timings))
    if filename:
        if run_pa:
            timings = timeit.Timer(partial(py_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(paF): %.3f" % min(timings))
        if run_ca:
            timings = timeit.Timer(partial(c_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(caF): %.3f" % min(timings))
    if run_re:
        timings = timeit.Timer(partial(regexp.findall, s)).repeat(number=REPEAT_COUNT)
        print("TIME(reS): %.3f" % min(timings))

    return (
        run_pa and py_acora.findall(s) or None,
        run_ca and c_acora.findall(s) or None,
        run_pa and (filename and py_acora.filefindall(filename)) or None,
        run_ca and (filename and c_acora.filefindall(filename)) or None,
        run_re and regexp.findall(s) or None
        )
Example #7
    def __init__(self):
        # list of all entity words
        self._ner_word_list = []

        # replacement name for matched entity words
        self._ner_name = ""

        # builder for the Aho-Corasick (AC) automaton
        self._builder = AcoraBuilder()
Example #8
    def __init__(self, keywords, vocab=None):
        from acora import AcoraBuilder
        builder = AcoraBuilder()
        #assert isinstance(keywords, (list,tuple))
        self.vocab = vocab
        for i in keywords:
            builder.add(i)

        # Generate the Acora search engine for the current keyword set:
        self.engine = builder.build()
Example #9
    def _build(self):
        builder = AcoraBuilder()

        for idx, item in enumerate(self._regexes_or_assoc):

            #
            #   First we compile all regular expressions and save them to
            #   the re_cache.
            #
            if isinstance(item, tuple):
                regex = item[0]
                regex = regex.encode(DEFAULT_ENCODING)
                self._re_cache[regex] = re.compile(regex,
                                                   self._re_compile_flags)

                if regex in self._translator:
                    raise ValueError('Duplicated regex "%s"' % regex)

                self._translator[regex] = item[1:]
            elif isinstance(item, basestring):
                regex = item.encode(DEFAULT_ENCODING)
                self._re_cache[regex] = re.compile(regex,
                                                   self._re_compile_flags)
            else:
                raise ValueError('Can NOT build MultiRE with provided values.')

            #
            #   Now we extract the string literals (longer than hint_len only) from
            #   the regular expressions and populate the acora index
            #
            regex_hints = esmre.hints(regex)
            regex_keywords = esmre.shortlist(regex_hints)

            if not regex_keywords:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Get the longest one
            regex_keyword = regex_keywords[0]

            if len(regex_keyword) <= self._hint_len:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Add this keyword to the acora index, and also save a way to associate the
            # keyword with the regular expression
            regex_keyword = regex_keyword.lower()
            builder.add(regex_keyword)

            regexes_matching_keyword = self._keyword_to_re.get(
                regex_keyword, [])
            regexes_matching_keyword.append(regex)
            self._keyword_to_re[regex_keyword] = regexes_matching_keyword

        return builder.build()
Example #10
def filter_text(iterable, text):
    b = AcoraBuilder(text.lower())
    ac = b.build()
    def m(obj):
        for _, _, o in obj["_graph"]:
            if isinstance(o, Literal):
                for _ in ac.findall(o.lower()):
                    return True
        return False
    for obj in iterable:
        if m(obj):
            yield obj
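
A usage sketch for filter_text (the objects below are invented and mimic the expected "_graph" triple layout; Literal comes from rdflib, which the function above already relies on):

from rdflib import Literal

objs = [
    {"_graph": [("s1", "p1", Literal("Hello World"))]},   # invented triples
    {"_graph": [("s2", "p1", Literal("nothing here"))]},
]
kept = list(filter_text(objs, "world"))
print(len(kept))  # 1 -- only the first object contains the search text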
Example #11
 def __init__(self, use_unicode=True, ignore_case=False, titles=None):
     """
     :param use_unicode: whether to use `titles` as unicode or bytestrings
     :param ignore_case: if True ignore case in all matches
     :param titles: if given, overrides default `load_titles()` values
     """
     titles = titles if titles else load_titles()
     titles = (titles if use_unicode else
               (s.encode('ascii') for s in titles))
     builder = AcoraBuilder()
     builder.update(titles)
     self.ac = builder.build(ignore_case=ignore_case)
Example #12
def setup(vregions_file, jregions_file):

    v_end_length = 40  # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider

    handle = open(vregions_file, 'r')
    v_list = list(SeqIO.parse(handle, 'fasta'))
    handle.close()
    v_genes = [str(string.upper(v.seq)) for v in v_list]
    v_genes_cut = [v[-v_end_length:] for v in v_genes]

    all_v_substrings = []
    for v in v_genes_cut:
        all_v_substrings.append(
            [v[i:i + n] for n in range(4, len(v) + 1)
             for i in range(len(v) - (n - 1))])

    t0 = time.time()
    v_keyword_tries = []
    for v_substrings in all_v_substrings:
        v_builder = AcoraBuilder()
        for i in range(len(v_substrings)):
            v_builder.add(v_substrings[i])
        v_keyword_tries.append(v_builder.build())
    print 'V keyword tries built in', round(time.time() - t0, 2), 'seconds'

    handle = open(jregions_file, 'r')
    j_list = list(SeqIO.parse(handle, 'fasta'))
    handle.close()
    j_genes = [str(string.upper(j.seq)) for j in j_list]
    j_genes_cut = [j[:j_start_length] for j in j_genes]

    all_j_substrings = []
    for j in j_genes_cut:
        all_j_substrings.append(
            [j[i:i + n] for n in range(4, len(j) + 1)
             for i in range(len(j) - (n - 1))])

    t0 = time.time()
    j_keyword_tries = []
    for j_substrings in all_j_substrings:
        j_builder = AcoraBuilder()
        for i in range(len(j_substrings)):
            j_builder.add(j_substrings[i])
        j_keyword_tries.append(j_builder.build())
    print 'J keyword tries built in', round(time.time() - t0, 2), 'seconds'

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
Example #13
    def _build(self):
        builder = AcoraBuilder()

        for idx, item in enumerate(self._regexes_or_assoc):

            #
            #   First we compile all regular expressions and save them to
            #   the re_cache.
            #
            if isinstance(item, tuple):
                regex = item[0]
                regex = regex.encode(DEFAULT_ENCODING)
                self._re_cache[regex] = re.compile(regex, self._re_compile_flags)

                if regex in self._translator:
                    raise ValueError('Duplicated regex "%s"' % regex)

                self._translator[regex] = item[1:]
            elif isinstance(item, basestring):
                regex = item.encode(DEFAULT_ENCODING)
                self._re_cache[regex] = re.compile(regex, self._re_compile_flags)
            else:
                raise ValueError('Can NOT build MultiRE with provided values.')

            #
            #   Now we extract the string literals (longer than hint_len only) from
            #   the regular expressions and populate the acora index
            #
            regex_hints = esmre.hints(regex)
            regex_keywords = esmre.shortlist(regex_hints)

            if not regex_keywords:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Get the longest one
            regex_keyword = regex_keywords[0]

            if len(regex_keyword) <= self._hint_len:
                self._regexes_with_no_keywords.append(regex)
                continue

            # Add this keyword to the acora index, and also save a way to associate the
            # keyword with the regular expression
            regex_keyword = regex_keyword.lower()
            builder.add(regex_keyword)

            regexes_matching_keyword = self._keyword_to_re.get(regex_keyword, [])
            regexes_matching_keyword.append(regex)
            self._keyword_to_re[regex_keyword] = regexes_matching_keyword

        return builder.build()
Example #14
 def __init__(self, content: List[str], ignore_case: bool):
     """
     Acora matcher factory
     :param content: a list of items to search
     :param ignore_case: True to match any case
     :return: a built matcher
     """
     # start with a string in case content is empty
     # otherwise it builds a binary Acora matcher
     builder = AcoraBuilder("!@#$%%^&*")
     if len(content) > 0:
         builder.update(content)
     self.matcher = builder.build(ignore_case=ignore_case)
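
A usage sketch for this factory, assuming the __init__ above sits on a small wrapper class (the name Matcher is invented):

from typing import List
from acora import AcoraBuilder

class Matcher:
    def __init__(self, content: List[str], ignore_case: bool):
        # body copied from the example above; the placeholder keyword
        # keeps the matcher text-based when content is empty
        builder = AcoraBuilder("!@#$%%^&*")
        if len(content) > 0:
            builder.update(content)
        self.matcher = builder.build(ignore_case=ignore_case)

m = Matcher(["paris", "london"], ignore_case=True)
print(m.matcher.findall("From PARIS to London"))  # matches regardless of case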
Example #15
    def test_acora_python(self):
        builder = AcoraBuilder()
        builder.update([s for (s,) in SQL_ERRORS])
        ac = builder.build(acora=PyAcora)

        i = 0

        #
        # This takes around 9 seconds in my workstation.
        #
        for j in xrange(self.ITERATIONS):
            for _ in ac.finditer(HTTP_RESPONSE):
                i += 1

        self.assertEqual(i, self.ITERATIONS * 2)
Example #16
    def __init__(self, keywords: Optional[Iterable[str]] = None):
        non_empty_keywords = []
        if keywords is not None:
            for w in keywords:
                if w.strip() != "":
                    non_empty_keywords.append(w)

        self._keywords = set(non_empty_keywords)

        if len(self._keywords) > 0:
            ac_builder = AcoraBuilder()
            ac_builder.update(self._keywords)
            self._finder = ac_builder.build()
        else:
            self._finder = None
Example #17
File: sma.py Project: yflau/dsapp
class Acora(object):

    def __init__(self, dic):
        self.__builder = AcoraBuilder()
        fp = open(dic)
        for line in fp:
            self.__builder.add(line.rstrip("\n").decode("utf-8"))
        fp.close()
        self.__tree = self.__builder.build()

    def findall(self, content):
        hitList = []
        for hitWord, pos in self.__tree.finditer(content):
            hitList.append(hitWord)
        return hitList
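
A usage sketch for this dictionary-driven matcher (the file name is invented; the file holds one UTF-8 keyword per line, and the class as written targets Python 2):

matcher = Acora("keywords.txt")               # invented dictionary file
hits = matcher.findall(u"some text to scan")  # matched words, in scan order
print(hits)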
Example #18
File: sma.py Project: yflau/dsapp
 def __init__(self, dic):
     self.__builder = AcoraBuilder()
     fp = open(dic)
     for line in fp:
         self.__builder.add(line.rstrip("\n").decode("utf-8"))
     fp.close()
     self.__tree = self.__builder.build()
Example #19
File: graph.py Project: cjx3721/QA
	def directed_graph(self) :
		if not hasattr(self, "_directed_graph") :
			print "getting directed graph ..."
			
			graph = defaultdict(_dd_int)
			# Zhu: in my VM, build speed is about 14k entities per second
			ac = AcoraBuilder(*self.database.entities).build()
			
			# match consumes no time, compared to build
			for text, attrib in self.database :
				entities = zip(*longest_match(ac.finditer(text)))[0]
				for entity in set(entities) :
					if entity == attrib["title"] :
						continue
					graph[attrib["title"]][entity] += 1
			
			delattr(self, "database")
			self._directed_graph = graph
			
		return self._directed_graph
Example #20
def get_key_word(data):
    output_database = []
    if len(data["entity_dict"]) >= 1:
        dicts = OrderedDict()
        for key in data["entity_dict"]:
            dicts[key] = key
            for t in data["entity_dict"][key]:
                dicts[t] = key
        query = data["query"]
        key_word_builder = AcoraBuilder(list(dicts.keys()))
        key_word_searcher = key_word_builder.build()
        print(dicts, "------detected dicts-------")
        res = key_word_searcher.findall(query)
        print(res)
        if len(res) >= 1:
            input_entity = [item[0] for item in res]
            input_entity_key = []
            for char in input_entity:
                input_entity_key.extend(data["entity_dict"][dicts[char]])
                input_entity_key.append(dicts[char])
            input_key_entity = list(set(input_entity_key))
            key_word_builder = AcoraBuilder(input_key_entity)
            key_word_searcher = key_word_builder.build()
            for doc in data["database"]:
                t = len(key_word_searcher.findall(doc))
                output_database.append(t)
        else:
            for _ in data["database"]:
                output_database.append(0)
    else:
        for _ in data["database"]:
            output_database.append(0)
    return output_database
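
An invented input illustrating the `data` layout this function expects (it assumes AcoraBuilder and OrderedDict are imported in the function's module):

data = {
    "entity_dict": {"apple": ["iphone", "ipad"]},   # entity -> aliases
    "query": "how much is an iphone",
    "database": ["iphone and ipad prices", "banana facts"],
}
print(get_key_word(data))   # per-document keyword hit counts (here [2, 0])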
Example #21
def compare_search(s, filename, ignore_case, *keywords):
    setup_pya = setup_ca = setup_re = 0
    run_pa = 'pa' in COMPARED_IMPLEMENTATIONS
    run_ca = 'ca' in COMPARED_IMPLEMENTATIONS
    run_re = 're' in COMPARED_IMPLEMENTATIONS

    if run_pa:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        py_acora = builder.build(acora=PyAcora)
        setup_pya = time() - t
        t = time()
    if run_ca:
        t = time()
        builder = AcoraBuilder(keywords, ignore_case=ignore_case)
        c_acora = builder.build()
        setup_ca = time() - t
    if run_re:
        t = time()
        if hasattr(keywords[0], 'encode'):  # unicode in Py3?
            kw_regexp = '|'.join(keywords)
        else:
            kw_regexp = '|'.encode('ASCII').join(keywords)
        if ignore_case:
            regexp = re.compile(kw_regexp, re.I)
        else:
            regexp = re.compile(kw_regexp)
        setup_re = time() - t
    print("Case %ssensitive %s\n- setup times: PA: %.4f, CA: %.4f, RE: %.4f" %
          (ignore_case and 'in' or '', builder.for_unicode and 'unicode'
           or 'bytes', setup_pya, setup_ca, setup_re))

    if run_pa:
        timings = timeit.Timer(partial(py_acora.findall,
                                       s)).repeat(number=REPEAT_COUNT)
        print("TIME(paS): %.3f" % min(timings))
    if run_ca:
        timings = timeit.Timer(partial(c_acora.findall,
                                       s)).repeat(number=REPEAT_COUNT)
        print("TIME(caS): %.3f" % min(timings))
    if filename:
        if run_pa:
            timings = timeit.Timer(partial(
                py_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(paF): %.3f" % min(timings))
        if run_ca:
            timings = timeit.Timer(partial(
                c_acora.filefindall, filename)).repeat(number=REPEAT_COUNT)
            print("TIME(caF): %.3f" % min(timings))
    if run_re:
        timings = timeit.Timer(partial(regexp.findall,
                                       s)).repeat(number=REPEAT_COUNT)
        print("TIME(reS): %.3f" % min(timings))

    return (
        run_pa and py_acora.findall(s) or None,
        run_ca and c_acora.findall(s) or None,
        run_pa and (filename and py_acora.filefindall(filename)) or None,
        run_ca and (filename and c_acora.filefindall(filename)) or None,
        run_re and regexp.findall(s) or None
        )
Example #22
def setup(vregions_file, jregions_file):

    v_end_length = 40  # how many nts at the end of the V region to consider
    j_start_length = 40  # how many nts at the start of the J region to consider

    handle = open(vregions_file, 'r')
    v_list = list(SeqIO.parse(handle, 'fasta'))
    handle.close()
    v_genes = [str(string.upper(v.seq)) for v in v_list]
    v_genes_cut = [v[-v_end_length:] for v in v_genes]

    all_v_substrings = []
    for v in v_genes_cut:
        all_v_substrings.append([v[i:i+n] for n in range(4, len(v)+1) for i in range(len(v)-(n-1))])

    t0 = time.time()
    v_keyword_tries = []
    for v_substrings in all_v_substrings:
        v_builder = AcoraBuilder()
        for i in range(len(v_substrings)):
            v_builder.add(v_substrings[i])
        v_keyword_tries.append(v_builder.build())
    print 'V keyword tries built in', round(time.time() - t0, 2), 'seconds'

    handle = open(jregions_file, 'r')
    j_list = list(SeqIO.parse(handle, 'fasta'))
    handle.close()
    j_genes = [str(string.upper(j.seq)) for j in j_list]
    j_genes_cut = [j[:j_start_length] for j in j_genes]

    all_j_substrings = []
    for j in j_genes_cut:
        all_j_substrings.append([j[i:i+n] for n in range(4, len(j)+1) for i in range(len(j)-(n-1))])

    t0 = time.time()
    j_keyword_tries = []
    for j_substrings in all_j_substrings:
        j_builder = AcoraBuilder()
        for i in range(len(j_substrings)):
            j_builder.add(j_substrings[i])
        j_keyword_tries.append(j_builder.build())
    print 'J keyword tries built in', round(time.time() - t0, 2), 'seconds'

    return v_keyword_tries, j_keyword_tries, v_genes, j_genes
Example #23
    def _build(self):
        builder = AcoraBuilder()

        for idx, item in enumerate(self._keywords_or_assoc):

            if isinstance(item, tuple):
                keyword = item[0]
                keyword = keyword.encode(DEFAULT_ENCODING)

                if keyword in self._translator:
                    raise ValueError('Duplicated keyword "%s"' % keyword)

                self._translator[keyword] = item[1:]

                builder.add(keyword)
            elif isinstance(item, basestring):
                keyword = item.encode(DEFAULT_ENCODING)
                builder.add(keyword)
            else:
                raise ValueError('Can NOT build MultiIn with provided values.')

        return builder.build()
Example #24
    def _build(self):
        builder = AcoraBuilder()

        for idx, item in enumerate(self._keywords_or_assoc):

            if isinstance(item, tuple):
                keyword = item[0]
                keyword = keyword.encode(DEFAULT_ENCODING)

                if keyword in self._translator:
                    raise ValueError('Duplicated keyword "%s"' % keyword)

                self._translator[keyword] = item[1:]

                builder.add(keyword)
            elif isinstance(item, basestring):
                keyword = item.encode(DEFAULT_ENCODING)
                builder.add(keyword)
            else:
                raise ValueError('Can NOT build MultiIn with provided values.')

        return builder.build()
Example #25
v_regions = []
v_nams = []
for v in range(0, len(v_genes)):
  v_regions.append(str(v_genes[v].seq).upper())
  v_nams.append(v_genes[v].id.split("|")[1])

j_regions = []
j_nams = [] 
for j in range(0, len(j_genes)):
  j_regions.append(str(j_genes[j].seq).upper())
  j_nams.append(j_genes[j].id.split("|")[1])

## Build keyword tries of V and J tags for fast assignment
v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_tr"+ chain.lower() + "v.txt", "rU"), v_half_split)
j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_tr"+ chain.lower() + "j.txt", "rU"), j_half_split)

v_builder = AcoraBuilder()
for i in range(0,len(v_seqs)):
    v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie

v_key = v_builder.build()

j_builder = AcoraBuilder()
for i in range(0,len(j_seqs)):
    j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie

j_key = j_builder.build()

## Build keyword tries for first and second halves of both V and J tags
v_half1_builder = AcoraBuilder()
for i in range(0,len(half1_v_seqs)):
    v_half1_builder.add(str(half1_v_seqs[i]))
Example #26
def analysis(fastqs, vfasta, jfasta, vtags, jtags, rev_comp=False,
                verbose=False, sep=" "):
    if verbose:
        sys.stderr.write('>> Analyzing %d file(s)\n' % len(fastqs))
        sys.stderr.write(">> Importing known V, and J gene segments and tags\n")

    # get the sequences per region
    v_genes = list(SeqIO.parse(nopen(vfasta), "fasta"))
    j_genes = list(SeqIO.parse(nopen(jfasta), "fasta"))
    # XXX
    # classes to parse fasta, fastq, and method to reverse complement
    # get rid of biopython
    v_regions = [str(v_genes[i].seq.upper()) for i, v in enumerate(v_genes)]
    j_regions = [str(j_genes[i].seq.upper()) for i, v in enumerate(j_genes)]

    v_seqs, vleft_seqs, vright_seqs, v_ends = get_tags(vtags)
    j_seqs, jleft_seqs, jright_seqs, j_starts = get_tags(jtags)

    # full sequences
    builder = AcoraBuilder(v_seqs)
    v_key = builder.build()
    builder = AcoraBuilder(j_seqs)
    j_key = builder.build()
    
    # half sequences
    builder = AcoraBuilder(vleft_seqs)
    vleft_key = builder.build()
    builder = AcoraBuilder(vright_seqs)
    vright_key = builder.build()
    builder = AcoraBuilder(jleft_seqs)
    jleft_key = builder.build()
    builder = AcoraBuilder(jright_seqs)
    jright_key = builder.build()
    
    # correctly assigned sequences
    assigned_count = 0
    # number of sequences analysed
    seq_count = 0
    # begin clock
    t0 = time()
    
    # XXX
    stemplate = Template('$v $j $del_v $del_j $nt_insert')

    for fastq in fastqs:
        if verbose:
            sys.stderr.write(">> Starting %s...\n" % fastq)
        for i, record in enumerate(SeqIO.parse(nopen(fastq), "fastq")):
            # if i == 50:
            #     sys.exit()
            found_seq_match = 0
            seq_count += 1
            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))

            if hold_v:
                # the index position of the found sequence among known (v_seqs)
                v_match = v_seqs.index(hold_v[0][0])
                
                # new variable names
                # do not like lists for this task
                match_idx = v_seqs.index(hold_v[0][0])
                match_start_idx = hold_v[0][1]
                vseq_end = v_ends[match_idx] - 1
                end_of_v = match_start_idx + vseq_end
                
                # Finds where the end of a full V would be
                temp_end_v = hold_v[0][1] + v_ends[v_match] - 1
                
                # If the number of deletions has been found
                if get_v_deletions(record.seq, v_match, temp_end_v, v_regions):
                    end_v, deletions_v = get_v_deletions(record.seq, v_match, temp_end_v, v_regions)
            else:
                found_v_match = 0
                hold_v1 = vleft_key.findall(str(record.seq))
                hold_v2 = vright_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    indices = [y for y, x in enumerate(vleft_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[vleft_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                # Finds where the end of a full V would be
                                temp_end_v = hold_v1[i][1] + v_ends[v_match] - 1
                                found_v_match += 1
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(vright_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[vright_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                # Finds where the end of a full V would be
                                temp_end_v = hold_v2[i][1] + v_ends[v_match] - 1
                                found_v_match += 1

            if hold_j:
                # Assigns J
                j_match = j_seqs.index(hold_j[0][0])
                # Finds where the start of a full J would be
                temp_start_j = hold_j[0][1] - j_starts[j_match]
                if get_j_deletions( record.seq, j_match, temp_start_j, j_regions ):
                    [ start_j, deletions_j] = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
            else:
                found_j_match = 0
                hold_j1 = jleft_key.findall(str(record.seq))
                hold_j2 = jright_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(jleft_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[jleft_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = jleft_seqs.index(hold_j1[i][0])
                                # Finds where the start of a full J would be
                                temp_start_j = hold_j1[i][1] - j_starts[j_match]
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(jright_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[jright_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = jright_seqs.index(hold_j2[i][0])
                                # Finds where the start of a full J would be
                                temp_start_j = hold_j2[i][1] - j_starts[j_match] - 6
                                found_j_match += 1

            if hold_v and hold_j:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = record.seq[end_v+1:start_j])
                # Write to analysis_file (text file) the classification of the sequence
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            elif hold_v and found_j_match == 1:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            elif found_v_match == 1 and hold_j:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            elif found_v_match == 1 and found_j_match == 1:
                f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                print f_seq
                assigned_count += 1
                found_seq_match = 1
            
            #####################
            # REVERSE COMPLEMENT
            #####################
            if found_seq_match == 0 and rev_comp:

                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))

                if hold_v:
                    # Assigns V
                    v_match = v_seqs.index(hold_v[0][0])
                    # Finds where the end of a full V would be
                    temp_end_v = hold_v[0][1] + v_ends[v_match] - 1
                    # If the number of deletions has been found
                    if get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions ):
                        end_v, deletions_v = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                else:
                    found_v_match = 0
                    hold_v1 = vleft_key.findall(str(record_reverse.seq))
                    hold_v2 = vright_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(vleft_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[vleft_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    # Finds where the end of a full V would be
                                    temp_end_v = hold_v1[i][1] + v_ends[v_match] - 1
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(vright_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[vright_seqs.index(hold_v2[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    # Finds where the end of a full V would be
                                    temp_end_v = hold_v2[i][1] + v_ends[v_match] - 1
                                    found_v_match += 1

                if hold_j:
                    # Assigns J
                    j_match = j_seqs.index(hold_j[0][0])
                    # Finds where the start of a full J would be
                    temp_start_j = hold_j[0][1] - j_starts[j_match]
                    if get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions ):
                        start_j, deletions_j = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                else:
                    found_j_match = 0
                    hold_j1 = jleft_key.findall(str(record_reverse.seq))
                    hold_j2 = jright_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(jleft_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[jleft_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = jleft_seqs.index(hold_j1[i][0])
                                    # Finds where the start of a full J would be
                                    temp_start_j = hold_j1[i][1] - j_starts[j_match]
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(jright_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[jright_seqs.index(hold_j2[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = jright_seqs.index(hold_j2[i][0])
                                    # Finds where the start of a full J would be
                                    temp_start_j = hold_j2[i][1] - j_starts[j_match] - 6
                                    found_j_match += 1

                if (hold_v and hold_j) or \
                        (hold_v and found_j_match == 1) or \
                        (found_v_match == 1 and hold_j) or \
                        (found_v_match == 1 and found_j_match == 1):
                    
                    f_seq = stemplate.substitute(v = v_match, j = j_match, 
                                del_v = deletions_v, del_j = deletions_j, 
                                nt_insert = str(record_reverse.seq[end_v + 1:start_j]))
                    fields = (v_match, j_match, deletions_v, deletions_j,
                                record_reverse.seq[end_v + 1:start_j])
                    assigned_count += 1
                    found_seq_match = 1
                    print sep.join(map(str, fields))
    if verbose:
        t = time() - t0
        sys.stderr.write('%d sequences were analysed\n' % seq_count)
        sys.stderr.write('%d sequences were successfully assigned\n' % assigned_count)
        sys.stderr.write('%s seconds elapsed\n' % t)
Example #27
    mouse_proteome_file = [
        x for x in os.listdir(fxn.base_data_dir) if '_mouse.fasta' in x
    ][0]

    mouse_proteins = coll.defaultdict()
    with gzip.open(fxn.base_data_dir + mouse_proteome_file, 'rU') as in_file:
        for protein, seq, blank in fxn.read_fa(in_file):
            mouse_proteins[protein.split(' ')[0]] = seq

    # Then scroll through non-predicted binder files, build an AC trie of all the peptides per file
    data_dir = '../Data/NonPredictedBinders/'
    matches = coll.defaultdict(fxn.nest_counter)
    all_peptides = coll.defaultdict(list)
    for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]:
        nam = f.split('-')[0]
        search_builder = AcoraBuilder()
        peptides = []

        # Build trie
        with open(data_dir + f, 'rU') as in_file:
            for line in in_file:
                search_builder.add(line.rstrip())
                peptides.append(line.rstrip())
                all_peptides[f.split('-')[0]].append(line.rstrip())
        seq_search = search_builder.build()

        # Use to search all proteins in proteome
        for protein in mouse_proteins:
            seq_check = seq_search.findall(mouse_proteins[protein])
            if seq_check:
                for s in seq_check:
Example #28
zy = {'00': 1,
      '01': 1,
      '02': 1,
      '03': 1,
      '10': 1,
      '11': 1,
      '20': 1,
      '22': 1,
      '30': 1,
      '33': 1}

zy = {i: np.log(zy[i]) for i in zy.keys()}

from acora import AcoraBuilder
views = pd.read_csv('View.csv', delimiter='\t', encoding='utf-8')['View']
views = AcoraBuilder(*views)
views = views.build()


def predict(i, data):
    y_pred = data.loc[i, 'predict']
    s = data.loc[i, 'Content'][:maxlen]
    nodes = [dict(zip(['0', '1', '2', '3'], k))
             for k in np.log(y_pred[:len(s)])]
    tags_pred_1 = viterbi(nodes)
    for j in views.finditer(s):
        for k in range(j[1], j[1] + len(j[0])):
            nodes[k]['1'] += 100
            nodes[k]['2'] += 100
            nodes[k]['3'] += 100
        try:
Example #29
def import_tcr_info(inputargs):
    """ import_tcr_info: Gathers the required TCR chain information for Decombining """

    # Get chain information
    global chain

    chain = get_chain(inputargs)

    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################

    print 'Importing TCR', ", ".join(map(chainnams.__getitem__,
                                         chain)), 'gene sequences...'

    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print "Please note that there is currently no extended tag set for mouse TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
    In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."

        inputargs['tags'] = "original"

    if inputargs['tags'] == "extended" and ('g' in chain or 'd' in chain):

        print "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\' for these chains.\n \
    In future, consider editing the script to change the default, or use the appropriate flags."

        inputargs['tags'] = "original"

    # Set tag split position, and check tag set. Note that original tags use shorter length J half tags, as these tags were originally shorter.

    global v_half_split, j_half_split
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \
    Please check tag set and species flag."

        sys.exit()

    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \
    If mouse is required by default, consider changing the default value in the script."

        sys.exit()

    # Look for tag and V/J fasta and tag files: if these cannot be found in the working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"

    chain_order = []

    for gene in ['v', 'j']:

        # Get FASTA data
        fasta_holder = []

        for i in range(len(chain)):
            fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                       chain[i], gene, "fasta",
                                       inputargs['tagfastadir'])
            fasta_holder.append(list(SeqIO.parse(fasta_file, "fasta")))
            fasta_file.close()
        globals()[gene + "_genes"] = flatten(fasta_holder)

        globals()[gene + "_regions"] = []
        for g in range(0, len(globals()[gene + "_genes"])):
            globals()[gene + "_regions"].append(
                string.upper(globals()[gene + "_genes"][g].seq))

        # Get tag data

        gene_seq_holder = []  # initialise arrays
        half1_gene_seq_holder = []
        half2_gene_seq_holder = []
        jumpfunction_holder = []

        for i in range(len(chain)):
            tag_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                     chain[i], gene, "tags",
                                     inputargs['tagfastadir'])  # get tag data
            if gene == 'v': jumpfunction = "jump_to_end_v"
            elif gene == 'j': jumpfunction = "jump_to_start_j"
            tag_info_holder = globals()["get_" + gene + "_tags"](
                tag_file, globals()[gene + "_half_split"])
            gene_seq_holder.append(tag_info_holder[0])
            half1_gene_seq_holder.append(tag_info_holder[1])
            half2_gene_seq_holder.append(tag_info_holder[2])
            jumpfunction_holder.append(tag_info_holder[3])
            chain_order.append([chain[i], gene, len(gene_seq_holder[i])])
            tag_file.close()

        globals()[gene + "_seqs"] = flatten(gene_seq_holder)
        globals()["half1_" + gene + "_seqs"] = flatten(half1_gene_seq_holder)
        globals()["half2_" + gene + "_seqs"] = flatten(half2_gene_seq_holder)
        globals()[jumpfunction] = flatten(jumpfunction_holder)

        # Build Aho-Corasick tries
        globals()[gene + "_builder"] = AcoraBuilder()
        for i in range(0, len(globals()[gene + "_seqs"])):
            globals()[gene + "_builder"].add(str(
                globals()[gene +
                          "_seqs"][i]))  # add all tags for this gene to the keyword trie
        globals()[gene + "_key"] = globals()[gene + "_builder"].build()

        # And tries for split, half-tags
        globals()[gene + "_half1_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half1_" + gene + "_seqs"])):
            globals()[gene + "_half1_builder"].add(
                str(globals()["half1_" + gene + "_seqs"][i]))
        globals()["half1_" + gene +
                  "_key"] = globals()[gene + "_half1_builder"].build()

        globals()[gene + "_half2_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half2_" + gene + "_seqs"])):
            globals()[gene + "_half2_builder"].add(
                str(globals()["half2_" + gene + "_seqs"][i]))
        globals()["half2_" + gene +
                  "_key"] = globals()[gene + "_half2_builder"].build()

    return chain_order
Example #30
 def __init__(self, text):
     self.text = text
     keywords = ["ownership", "owner", "own", "propietary", "tracking", "track", "store", "keep", "keeping"]
     builder = AcoraBuilder()
     builder.add(*keywords)
     self.finder = builder.build()
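
A usage sketch, assuming the __init__ above belongs to a wrapper class (the name PolicyScanner and the sample text are invented):

from acora import AcoraBuilder

class PolicyScanner:
    def __init__(self, text):
        # body copied from the example above (keyword list kept verbatim)
        self.text = text
        keywords = ["ownership", "owner", "own", "propietary",
                    "tracking", "track", "store", "keep", "keeping"]
        builder = AcoraBuilder()
        builder.add(*keywords)
        self.finder = builder.build()

doc = PolicyScanner("We store and keep tracking data we own.")
print(doc.finder.findall(doc.text))
# each hit is (keyword, start_position); overlapping keywords such as
# "track" and "tracking" are both reported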
Example #31
def import_tcr_info(inputargs):
    """ import_tcr_info: Gathers the required TCR chain information for Decombining """

    # Get chain information
    global chainnams, chain, counts
    counts = coll.Counter()
    chainnams = {"a": "alpha", "b": "beta", "g": "gamma", "d": "delta"}

    # Detect whether chain specified in filename
    inner_filename_chains = [
        x for x in chainnams.values() if x in inputargs['fastq'].lower()
    ]
    if len(inner_filename_chains) == 1:
        counts['chain_detected'] = 1

    if inputargs['chain']:
        if inputargs['chain'].upper() in ['A', 'ALPHA', 'TRA', 'TCRA']:
            chain = "a"
        elif inputargs['chain'].upper() in ['B', 'BETA', 'TRB', 'TCRB']:
            chain = "b"
        elif inputargs['chain'].upper() in ['G', 'GAMMA', 'TRG', 'TCRG']:
            chain = "g"
        elif inputargs['chain'].upper() in ['D', 'DELTA', 'TRD', 'TCRD']:
            chain = "d"
        else:
            print(nochain_error)
            sys.exit()
    else:

        # If no chain provided, try and infer from filename
        if counts['chain_detected'] == 1:
            chain = inner_filename_chains[0][0]

        else:
            nochain_error = "TCR chain not recognised. \n \
      Please either include (one) chain name in the file name (i.e. alpha/beta/gamma/delta),\n \
      or use the \'-c\' flag with an explicit chain option (a/b/g/d, case-insensitive)."

            print(nochain_error)
            sys.exit()

    #################################################
    ############# GET GENES, BUILD TRIE #############
    #################################################

    print('Importing TCR', chainnams[chain], 'gene sequences...')

    # First check that valid tag/species combinations have been used
    if inputargs['tags'] == "extended" and inputargs['species'] == "mouse":
        print(
            "Please note that there is currently no extended tag set for mouse TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
    In future, consider editing the script to change the default, or use the appropriate flags (-sp mouse -tg original)."
        )
        inputargs['tags'] = "original"

    if inputargs['tags'] == "extended" and (chain == 'g' or chain == 'd'):
        print(
            "Please note that there is currently no extended tag set for gamma/delta TCR genes.\n \
    Decombinator will now switch the tag set in use from \'extended\' to \'original\'.\n \
    In future, consider editing the script to change the default, or use the appropriate flags."
        )
        inputargs['tags'] = "original"

    # Set tag split position, and check tag set. Note that original tags use shorter length J half tags, as these tags were originally shorter.
    global v_half_split, j_half_split
    if inputargs['tags'] == "extended":
        v_half_split, j_half_split = [10, 10]
    elif inputargs['tags'] == "original":
        v_half_split, j_half_split = [10, 6]
    else:
        print(
            "Tag set unrecognised; should be either \'extended\' or \'original\' for human, or just \'original\' for mouse. \n \
    Please check tag set and species flag.")
        sys.exit()

    # Check species information
    if inputargs['species'] not in ["human", "mouse"]:
        print(
            "Species not recognised. Please select either \'human\' (default) or \'mouse\'.\n \
    If mouse is required by default, consider changing the default value in the script."
        )
        sys.exit()

    # Look for tag and V/J fasta and tag files: if these cannot be found in the working directory, source them from GitHub repositories
    # Note that fasta/tag files fit the pattern "species_tagset_gene.[fasta/tags]"
    # I.e. "[human/mouse]_[extended/original]_TR[A/B/G/D][V/J].[fasta/tags]"

    for gene in ['v', 'j']:
        # Get FASTA data
        fasta_file = read_tcr_file(inputargs['species'], inputargs['tags'],
                                   gene, "fasta", inputargs['tagfastadir'])
        globals()[gene + "_genes"] = list(SeqIO.parse(fasta_file, "fasta"))

        globals()[gene + "_regions"] = []
        for g in range(0, len(globals()[gene + "_genes"])):
            globals()[gene + "_regions"].append(
                globals()[gene + "_genes"][g].seq.upper())

        # Get tag data
        tag_file = read_tcr_file(inputargs['species'], inputargs['tags'], gene,
                                 "tags",
                                 inputargs['tagfastadir'])  # get tag data
        tag_data = open(tag_file, "r")
        if gene == 'v': jumpfunction = "jump_to_end_v"
        elif gene == 'j': jumpfunction = "jump_to_start_j"
        globals()[gene+"_seqs"], globals()["half1_"+gene+"_seqs"], globals()["half2_"+gene+"_seqs"], globals()[jumpfunction] = \
          globals()["get_"+gene+"_tags"](tag_data, globals()[gene+"_half_split"])
        tag_data.close()

        # Build Aho-Corasick tries
        globals()[gene + "_builder"] = AcoraBuilder()
        for i in range(0, len(globals()[gene + "_seqs"])):
            globals()[gene + "_builder"].add(str(
                globals()[gene +
                          "_seqs"][i]))  # add all tags for this gene to the keyword trie
        globals()[gene + "_key"] = globals()[gene + "_builder"].build()

        # And tries for split, half-tags
        globals()[gene + "_half1_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half1_" + gene + "_seqs"])):
            globals()[gene + "_half1_builder"].add(
                str(globals()["half1_" + gene + "_seqs"][i]))
        globals()["half1_" + gene +
                  "_key"] = globals()[gene + "_half1_builder"].build()

        globals()[gene + "_half2_builder"] = AcoraBuilder()
        for i in range(0, len(globals()["half2_" + gene + "_seqs"])):
            globals()[gene + "_half2_builder"].add(
                str(globals()["half2_" + gene + "_seqs"][i]))
        globals()["half2_" + gene +
                  "_key"] = globals()[gene + "_half2_builder"].build()
Example #32
import json
import linecache
import os
import re

import jieba
import numpy as np
from acora import AcoraBuilder

from emotion_cla.emo_cls import classify
from emotion_cla.separate import separate

in_dir = 'data/tweet'
out_dir = 'data/tweet_emo'
builder = AcoraBuilder([line.strip() for line in open('data/emoji.txt')])
ac = builder.build()


def load_labelled():
    lines = set()
    for i in range(5):
        for line in open('data/content_3000/{}.txt'.format(i)):
            lines.add(line.strip())
    return lines


# have_lines = load_labelled()


def random_ids(in_name, out_name, lens):
    '''
    for key, values in output_dict.items():  # remove last ", "
        output_dict[key] = values[:-2]

    return output_dict


if __name__ == "__main__":
    args = parsing_argument()

    if not args.source:
        raise Exception("Please input the source file")
    with open(args.source, 'r') as file:
        keywords = file.read().splitlines()  # Reading the source file

    ac = AcoraBuilder(keywords)
    ac = ac.build()  # build the model for searching the keywords

    # Reading the target files
    if args.target_files:
        with open(args.target_files, 'r') as file:
            target_files = file.read().splitlines()
            target_file = [
                target_file for target_file in target_files
                if ".pdf" in target_file or ".html" in target_file
            ]
    else:
        target_files = [
            os.path.join(paths, file)
            for paths, _, files in os.walk(args.target_folder)
            for file in files if '.pdf' in file or '.html' in file
def analysis( Sequence_Reads, with_statistics=True, with_reverse_complement_search=True):
    import numpy as np
    import decimal as dec
    import string
    import operator as op
    import collections as coll
    from Bio import SeqIO
    from acora import AcoraBuilder
    from time import time, clock
    from string import Template
    from operator import itemgetter, attrgetter
    import Levenshtein as lev

    v_half_split, j_half_split = [10,6] # Do not change - V tags are split at position 10, J at position 6, to look for half tags if no full tag is found.

    ################

    print 'Commencing analysis on a total of', len(Sequence_Reads), 'file(s)'

    ## Create .txt file to store f=(v_index,j_index,v_deletions,j_deletions,nt_insert)
    analysis_file = open("DecombinatorResults.txt", "w")
    analysis_file.close()
    results = "DecombinatorResults.txt" # Name the .txt file to write to

    ################
    print ('Importing known V, D and J gene segments and tags...')

    handle = open("human_TRBV_region.fasta", "rU")
    v_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    handle = open("human_TRBJ_region.fasta", "rU")
    j_genes = list(SeqIO.parse(handle, "fasta"))
    handle.close()

    v_regions = []
    for j in range(0, len(v_genes)):
        v_regions.append(string.upper(v_genes[j].seq))

    j_regions = []
    for j in range(0, len(j_genes)):
        j_regions.append(string.upper(j_genes[j].seq))

    ##############
    ## Build keyword tries of V and J tags for fast assignment
    v_seqs, half1_v_seqs, half2_v_seqs, jump_to_end_v = get_v_tags(open("tags_trbv.txt", "rU"), v_half_split)
    j_seqs, half1_j_seqs, half2_j_seqs, jump_to_start_j = get_j_tags(open("tags_trbj.txt", "rU"), j_half_split)   

    v_builder = AcoraBuilder()
    for i in range(0,len(v_seqs)):
        v_builder.add(str(v_seqs[i])) # Add all V tags to keyword trie

    v_key = v_builder.build()

    j_builder = AcoraBuilder()
    for i in range(0,len(j_seqs)):
        j_builder.add(str(j_seqs[i])) # Add all J tags to keyword trie

    j_key = j_builder.build()

    ##############
    ## Build keyword tries for first and second halves of both V and J tags
    v_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_v_seqs)):
        v_half1_builder.add(str(half1_v_seqs[i]))
    half1_v_key = v_half1_builder.build()

    v_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_v_seqs)):
        v_half2_builder.add(str(half2_v_seqs[i]))
    half2_v_key = v_half2_builder.build()

    j_half1_builder = AcoraBuilder()
    for i in range(0,len(half1_j_seqs)):
        j_half1_builder.add(str(half1_j_seqs[i]))
    half1_j_key = j_half1_builder.build()

    j_half2_builder = AcoraBuilder()
    for i in range(0,len(half2_j_seqs)):
        j_half2_builder.add(str(half2_j_seqs[i]))
    half2_j_key = j_half2_builder.build()

    ###############
    ## Initialise variables
    assigned_count = 0 # this will just increase by one every time we correctly assign a seq read with all desired variables
    seq_count = 0 # this will simply track the number of sequences analysed in file
    t0 = time() # Begin timer

    ###############
    ## Open .txt file created at the start of analysis
    analysis_file = open(results, "a")
    stemplate = Template('$v $j $del_v $del_j $nt_insert') # Creates stemplate, a holder, for f. Each line will have the 5 variables separated by a space

    ###############
    ## Begin analysing sequences

    for i in range(len(Sequence_Reads)):
        
        print 'Importing sequences from', Sequence_Reads[i],' and assigning V and J regions...'
        handle = open(Sequence_Reads[i], "rU")
        
        for record in SeqIO.parse(handle, "fastq"):
            
            found_seq_match = 0
            seq_count += 1
            
            hold_v = v_key.findall(str(record.seq))
            hold_j = j_key.findall(str(record.seq))

            if hold_v:                
                v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                if get_v_deletions( record.seq, v_match, temp_end_v, v_regions ): # If the number of deletions has been found
                    [ end_v, deletions_v] = get_v_deletions( record.seq, v_match, temp_end_v, v_regions )
            else:
                found_v_match = 0
                hold_v1 = half1_v_key.findall(str(record.seq))
                hold_v2 = half2_v_key.findall(str(record.seq))
                for i in range(len(hold_v1)):
                    indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1
                for i in range(len(hold_v2)):
                    indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                    for k in indices:
                        if len(v_seqs[k]) == len(str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                            if lev.hamming( v_seqs[k], str(record.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                v_match = k
                                temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                found_v_match += 1

            if hold_j:
                j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                j_dels = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
                if j_dels: # If the number of deletions has been found
                    [ start_j, deletions_j ] = j_dels
            else:
                found_j_match = 0
                hold_j1 = half1_j_key.findall(str(record.seq))
                hold_j2 = half2_j_key.findall(str(record.seq))
                for i in range(len(hold_j1)):
                    indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = k # Use the verified index k; .index() would return the first duplicate half-tag
                                temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                found_j_match += 1
                for i in range(len(hold_j2)):
                    indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                    for k in indices:
                        if len(j_seqs[k]) == len(str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                            if lev.hamming( j_seqs[k], str(record.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                j_match = k # Use the verified index k; .index() would return the first duplicate half-tag
                                temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be (the 6 offsets the second half-tag within the full tag, presumably j_half_split)
                                found_j_match += 1

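            # A read is only assigned when tags were found unambiguously: either a
            # full-tag hit, or exactly one verified half-tag match (found_*_match == 1);
            # reads with several competing half-tag matches are left unassigned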
            if (hold_v or found_v_match == 1) and (hold_j or found_j_match == 1):
                v_dels = get_v_deletions( record.seq, v_match, temp_end_v, v_regions )
                j_dels = get_j_deletions( record.seq, j_match, temp_start_j, j_regions )
                if v_dels and j_dels:
                    [ end_v, deletions_v ] = v_dels # Unpack here so half-tag matches also get fresh values
                    [ start_j, deletions_j ] = j_dels
                    f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record.seq[end_v+1:start_j]))
                    print >> analysis_file, f_seq # Write the classification of the sequence to analysis_file
                    assigned_count += 1
                    found_seq_match = 1

            if found_seq_match == 0 and with_reverse_complement_search:
                
                #####################
                # REVERSE COMPLEMENT
                #####################
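                # Nothing matched in the forward orientation, so repeat the same
                # V/J assignment on the reverse complement of the read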

                record_reverse = record.reverse_complement()
                hold_v = v_key.findall(str(record_reverse.seq))
                hold_j = j_key.findall(str(record_reverse.seq))

                if hold_v:
                    v_match = v_seqs.index(hold_v[0][0]) # Assigns V
                    temp_end_v = hold_v[0][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                    v_dels = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                    if v_dels: # If the number of deletions has been found
                        [ end_v, deletions_v ] = v_dels
                else:
                    found_v_match = 0
                    hold_v1 = half1_v_key.findall(str(record_reverse.seq))
                    hold_v2 = half2_v_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_v1)):
                        indices = [y for y, x in enumerate(half1_v_seqs) if x == hold_v1[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[half1_v_seqs.index(hold_v1[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v1[i][1]:hold_v1[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v1[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1
                    for i in range(len(hold_v2)):
                        indices = [y for y, x in enumerate(half2_v_seqs) if x == hold_v2[i][0] ]
                        for k in indices:
                            if len(v_seqs[k]) == len(str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[half2_v_seqs.index(hold_v2[i][0])])]):
                                if lev.hamming( v_seqs[k], str(record_reverse.seq)[hold_v2[i][1]:hold_v2[i][1]+len(v_seqs[k])] ) <= 1:
                                    v_match = k
                                    temp_end_v = hold_v2[i][1] + jump_to_end_v[v_match] - 1 # Finds where the end of a full V would be
                                    found_v_match += 1

                if hold_j:
                    j_match = j_seqs.index(hold_j[0][0]) # Assigns J
                    temp_start_j = hold_j[0][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                    j_dels = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                    if j_dels: # If the number of deletions has been found
                        [ start_j, deletions_j ] = j_dels
                else:
                    found_j_match = 0
                    hold_j1 = half1_j_key.findall(str(record_reverse.seq))
                    hold_j2 = half2_j_key.findall(str(record_reverse.seq))
                    for i in range(len(hold_j1)):
                        indices = [y for y, x in enumerate(half1_j_seqs) if x == hold_j1[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[half1_j_seqs.index(hold_j1[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j1[i][1]:hold_j1[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = k # Use the verified index k; .index() would return the first duplicate half-tag
                                    temp_start_j = hold_j1[i][1] - jump_to_start_j[j_match] # Finds where the start of a full J would be
                                    found_j_match += 1
                    for i in range(len(hold_j2)):
                        indices = [y for y, x in enumerate(half2_j_seqs) if x == hold_j2[i][0] ]
                        for k in indices:
                            if len(j_seqs[k]) == len(str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[half2_j_seqs.index(hold_j2[i][0])])]):
                                if lev.hamming( j_seqs[k], str(record_reverse.seq)[hold_j2[i][1]:hold_j2[i][1]+len(j_seqs[k])] ) <= 1:
                                    j_match = k # Use the verified index k; .index() would return the first duplicate half-tag
                                    temp_start_j = hold_j2[i][1] - jump_to_start_j[j_match] - 6 # Finds where the start of a full J would be (the 6 offsets the second half-tag within the full tag, presumably j_half_split)
                                    found_j_match += 1

                if (hold_v or found_v_match == 1) and (hold_j or found_j_match == 1):
                    v_dels = get_v_deletions( record_reverse.seq, v_match, temp_end_v, v_regions )
                    j_dels = get_j_deletions( record_reverse.seq, j_match, temp_start_j, j_regions )
                    if v_dels and j_dels:
                        [ end_v, deletions_v ] = v_dels # Unpack here so half-tag matches also get fresh values
                        [ start_j, deletions_j ] = j_dels
                        f_seq = stemplate.substitute( v = v_match, j = j_match, del_v = deletions_v, del_j = deletions_j, nt_insert = str(record_reverse.seq[end_v+1:start_j]))
                        print >> analysis_file, f_seq # Write the classification of the reverse-complemented read
                        assigned_count += 1
                        found_seq_match = 1
        handle.close()
    analysis_file.close()

    if with_statistics:
        timed = time() - t0
        print seq_count, 'sequences were analysed'
        print assigned_count, 'sequences were successfully assigned'
        print 'Time taken =', timed, 'seconds'