Exemplo n.º 1
0
    def filter_func(row):
        """Blank out hits that are too short relative to the query sequence.

        The query FASTA text is read from ``row.query_record``; its length
        sets a cutoff of 25% of the query length.  Each hit cell is parsed as
        a FASTA header, its coordinate range is looked up through
        ``id_search``, and the hit is kept (as its description) only when the
        range spans at least the cutoff.  Cells equal to ``'NA'`` or ``None``,
        and hits for which no range is found, become ``None``.

        :param row: object exposing ``query_record`` and positional ``iloc``
            slicing (presumably a pandas Series -- confirm against caller).
        :return: list made of the first six cells followed by the filtered
            hit cells.
        """
        query = SeqIO.read(StringIO(row.query_record), 'fasta')
        length_cutoff = 0.25 * len(query)
        # NOTE(review): the leading slice (0:6) and the hit slice (5:-1) both
        # include position 5 -- confirm that this overlap is intended.
        leading = row.iloc[0:6].tolist()
        filtered = []
        for raw_hit in row.iloc[5:-1].tolist():
            if raw_hit == 'NA' or raw_hit is None:
                filtered.append(None)
                continue
            # Hit cells are stored without the leading '>' of a FASTA header.
            record = SeqIO.read(StringIO('>' + raw_hit), 'fasta')
            _, symbol, ranges, _ = id_search(record.id, id_type='brute', verbose=0)
            try:
                bounds = ranges[symbol]
            except KeyError:
                filtered.append(None)
                continue
            span = abs(int(bounds[1]) - int(bounds[0]))
            if span >= length_cutoff:
                filtered.append(record.description)
            else:
                filtered.append(None)
        return leading + filtered
Exemplo n.º 2
0
def map_ranges(hit):
    """Convenience function for RBC.results_map().

    Replaces a result with a tuple describing it and its locus.

    :param hit: object exposing a ``description`` attribute, which is handed
        to ``id_search``.
    :return: tuple ``(description, id, start, end, strand)`` where the last
        three items are elements 0, 1 and 2 of the range returned by
        ``id_search``.
    """
    _, hit_id, locus, _ = id_search(hit.description, verbose=False)
    return (hit.description, hit_id, locus[0], locus[1], locus[2])
Exemplo n.º 3
0
 def fasta(id_item, seq_range, database, source, indent, verbose):
     """Fetch one record from an indexed sequence file and slice it.

     :param id_item: identifier of the record to fetch; also given to
         ``id_search`` to obtain the regex used to derive index keys.
     :param seq_range: indexable whose elements 0 and 1 are the slice start
         and stop applied to the fetched record.
     :param database: first positional argument of ``SeqIO.index`` (the file
         to index).
     :param source: second positional argument of ``SeqIO.index`` (the file
         format).
     :param indent: forwarded to ``id_search``.
     :param verbose: forwarded to ``id_search``.
     :return: tuple ``(seq, itemnotfound)``.  ``itemnotfound`` is always
         ``None`` on a normal return (see below).
     :raises KeyError: if ``id_item`` is not a key of the index.
     """
     regex = id_search(id_item,
                       indent=indent,
                       verbose=verbose,
                       regex_only=True)

     def record_key(identifier):
         # Index records under the first capture group of the id regex.
         return regex.search(identifier).groups()[0]

     seqdict = SeqIO.index(database, source, key_function=record_key)
     try:
         seq = seqdict[id_item]
     finally:
         # The index keeps the underlying file open; release it as soon as the
         # record has been loaded, including when the lookup fails.
         seqdict.close()
     # A missing id raises KeyError above, so reaching this point means the
     # item was found; the second return slot is kept for interface
     # compatibility.
     itemnotfound = None
     seq = seq[slice(seq_range[0], seq_range[1])]
     return seq, itemnotfound
Exemplo n.º 4
0
    def fun(self, hit, stat, verbose=False):
        """Tell whether the top annotation of ``hit`` resolves to ``stat``.

        ``hit.description`` is split on ``'|-|'``; the second field is
        searched for ``|[<id>:...]|`` items and the first id found is
        resolved through ``id_search`` (``id_type='symbol'``).

        :param hit: object exposing a ``description`` string.
        :param stat: value compared against the symbol from ``id_search``.
        :param verbose: forwarded to ``id_search``.
        :return: ``True`` when the resolved symbol equals ``stat``; ``False``
            when it differs, when the description has no annotation field,
            when no annotation item matches, or when the matched id is empty.
        """
        pat = re.compile(r'\|\[(.*?):.*\]\|')  # regex for items in annotation
        try:
            top_anno = hit.description.split('|-|')[1]
        except (ValueError, IndexError):
            # ``print`` here takes an ``indent`` keyword, so it is presumably
            # a project-level replacement for the builtin.
            print(hit.description, indent=2)
            print('Could not unpack annotations!', indent=2)
            return False
        matches = pat.findall(top_anno)
        if not matches:
            # No annotation item at all: nothing can match ``stat``.
            return False
        id_lst = matches[0].strip()
        if not id_lst:
            return False
        _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=verbose)
        # Always return a real boolean rather than falling through to None.
        return stat == hit_symbol
Exemplo n.º 5
0
def count_reciprocal_best_hits_from_pandas(pandas_df):
    """Count, per species, the hits whose annotation symbol equals the query.

    For every unique ``target_species`` and every unique ``query_name`` within
    it, the cells in column positions ``5:-1`` of the matching rows are
    treated as FASTA headers (without the leading ``'>'``), parsed, and their
    ``|[...]|`` annotation items are resolved through ``id_search``.  Each hit
    whose resolved symbol equals the query name adds one to that query's
    count.

    :param pandas_df: DataFrame with ``target_species`` and ``query_name``
        columns; hit cells are expected in column positions ``5:-1``.
    :return: dict mapping each species to a ``Counter`` of query names.
    """
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species in list(pandas_df.target_species.unique()):
        counter = species_counters[species] = Counter()
        species_results = pandas_df.loc[pandas_df['target_species'] == species]
        for query in list(species_results.query_name.unique()):
            print(query)
            # ``.ix`` was removed from pandas; the slice is positional, so
            # ``.iloc`` is the direct replacement.
            query_results = species_results.loc[species_results['query_name'] == query].iloc[:, 5:-1]
            rc_out = []
            for _, row in query_results.iterrows():
                rc_out += row.tolist()
            # Annoying shunt: rebuild FASTA text so SeqIO can parse the headers
            rc_out_asfasta = '\n'.join('>' + entry for entry in rc_out if entry is not None)
            for hit in SeqIO.parse(StringIO(rc_out_asfasta), 'fasta'):
                try:
                    hit_split = hit.description.split('|-|')
                    id_lst = ''.join(pat.findall(hit_split[1]))
                except (ValueError, IndexError):
                    # A description without the '|-|' separator raises
                    # IndexError on ``hit_split[1]``.
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name,
                                                                                                 species,
                                                                                                 query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    counter[query] += 1
    return species_counters
Exemplo n.º 6
0
def count_reciprocal_best_hits(recblast_out):
    """Count, per species, the hits whose annotation symbol equals the query.

    :param recblast_out: mapping ``species -> query -> query_dict`` where
        ``query_dict['recblast_results']`` is an iterable of records exposing
        ``description`` and ``name``.  A description has the form
        ``<target>|-|<annotations>``, with annotation items written as
        ``|[...]|``.
    :return: dict mapping each species to a ``Counter`` keyed by query; a
        query is incremented once for every hit whose symbol (second element
        of the ``id_search`` result) equals the query.
    """
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    species_counters = {}
    for species, species_dict in recblast_out.items():
        counter = species_counters[species] = Counter()
        for query, query_dict in species_dict.items():
            try:
                rc_out = query_dict['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for hit in rc_out:
                try:
                    annotations = hit.description.split('|-|')[1]
                except (ValueError, IndexError):
                    # ``print`` here takes an ``indent`` keyword, so it is
                    # presumably a project-level replacement for the builtin.
                    print(hit.description, indent=2)
                    print('Could not unpack annotations!', indent=2)
                    continue
                id_lst = ''.join(pat.findall(annotations))
                if not id_lst:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(hit.name,
                                                                                                 species,
                                                                                                 query))
                    continue
                _, hit_symbol, _, _ = id_search(id_lst, id_type='symbol', verbose=0)
                if query == hit_symbol:
                    counter[query] += 1
    return species_counters
Exemplo n.º 7
0
def simple_struct(recblast_out, verbose=True):
    """Returns a nice diagram of queries, targets, and annotations.

    Structure of the result::

        master_dict:
            Species|    species_dict:
                            Query|  query_dict:
                                        target_id|  annotations_list

    :param recblast_out: either a single mapping
        ``species -> query -> record dict`` (each record dict holding a
        ``'recblast_results'`` iterable of records with ``description`` and
        ``name``), or a list of such mappings.  For a list, every element is
        summarised recursively and the summaries are merged; an element whose
        summarising raises ``AttributeError`` is merged as-is, and species
        whose value is an ``Exception`` instance are skipped.
    :param verbose: when true (single-mapping input only) the structure is
        printed and every annotation is replaced by the second element of
        ``id_search(annotation, id_type='brute')`` -- so this flag changes the
        returned data, not just the output.
    :return: nested dict as sketched above.

    Notes:
        * A single-mapping input is mutated: its ``'__dict__'`` key, if any,
          is deleted.
        * The recursive calls made for list input use the default
          ``verbose=True`` regardless of the value passed by the caller.
    """
    master_dict = {}
    pat = re.compile(r'\|\[(.*?)\]\|')  # regex for items in annotation
    if isinstance(recblast_out, list):
        # One summary per input element; elements that cannot be summarised
        # (AttributeError) are carried over unchanged.
        master_count = []
        for rc in recblast_out:
            try:
                master_count.append(simple_struct(rc))
            except AttributeError:
                master_count.append(rc)
        for subdict in master_count:
            for species, species_dict in subdict.items():
                if isinstance(species_dict, Exception):
                    continue
                comb_spec_dict = master_dict.setdefault(species, {})
                for query, query_dict in species_dict.items():
                    comb_query_dict = comb_spec_dict.setdefault(query, {})
                    for target_id, annotation_list in query_dict.items():
                        comb_anno_list = comb_query_dict.setdefault(target_id, [])
                        if isinstance(annotation_list, list):
                            comb_anno_list.extend(annotation_list)
                        else:
                            comb_anno_list.append(annotation_list)
        return master_dict

    # assert isinstance(recblast_out, RecBlastContainer), 'Item in recblast_out was not a RecBlastContainer object!'
    try:
        recblast_out.__delitem__('__dict__')
    except KeyError:
        pass
    for species, rc_spec_rec in recblast_out.items():
        species_dict = master_dict.setdefault(species, {})
        for query, rc_rec in rc_spec_rec.items():
            query_dict = species_dict.setdefault(query, {})
            try:
                rc_out = rc_rec['recblast_results']
            except KeyError:
                print('No entries in recblast_results for query {0} in species {1}'.format(query, species))
                continue
            for record in rc_out:
                try:
                    # Exactly two '|-|'-separated fields are required.
                    target_id, annotations = record.description.split('|-|')
                except ValueError:
                    # ``print`` here takes an ``indent`` keyword, so it is
                    # presumably a project-level replacement for the builtin.
                    print(record.description, indent=2)
                    continue
                target_list = query_dict.setdefault(target_id, [])
                id_lst = pat.findall(annotations)
                if id_lst:
                    target_list += id_lst
                else:
                    print('No annotations found for record {0} in species {1}, query {2}'.format(record.name,
                                                                                                 species,
                                                                                                 query))
    if verbose:
        print('*******************************************')
        for species, species_dict in master_dict.items():
            print(species, indent=0)
            for query, query_dict in species_dict.items():
                print(query, indent=1)
                for target_id, annotation_list in query_dict.items():
                    print(target_id, indent=2)
                    # Replace each raw annotation with the item id_search
                    # extracts from it (same handling for every id type).
                    resolved = [id_search(annotation, id_type='brute', verbose=0)[1]
                                for annotation in annotation_list]
                    query_dict[target_id] = resolved
                    for annotation in resolved:
                        print(annotation, indent=3)
        print('*******************************************')
    return master_dict