Exemplo n.º 1
0
def get_islands(species_name):
    """
    Group the genes of one species that have HGT hits into "islands".

    Two genes are put in the same island when they are on the same contig and
    the end of one lies within 5000 bp of the start of the other. A gene with
    no such neighbour becomes an island of its own.

    - `species_name`: value of the `species` field of a document in the
      `hits` collection

    Returns the result of `collapse_lists()` applied to a list of lists of
    `(species, _id)` tuples.
    """
    all_hits = kv.get_collection('hits')
    # The loop below used to read `species_hits` without it ever being
    # defined, which raised NameError on every call.
    # NOTE(review): the `hits` field is used because the sibling functions
    # that consume these islands index `hits` by gene id -- confirm that this
    # is the intended field.
    species_hits = all_hits.find_one({'species': species_name})['hits']

    # Mongo record for every query gene that has at least one hit
    species_hits_list = [
        kv.get_mongo_record(species_name, query_id)
        for query_id in species_hits
        if species_hits[query_id]
    ]

    islands = []
    for entry_1 in species_hits_list:
        entry_recorded = False
        for entry_2 in species_hits_list:
            if entry_1 == entry_2:
                continue
            location_1 = entry_1['location']
            location_2 = entry_2['location']
            if location_1['contig'] != location_2['contig']:
                continue
            if abs(location_1['end'] - location_2['start']) <= 5000:
                entry_recorded = True
                islands.append([
                    (entry_1['species'], str(entry_1['_id'])),
                    (entry_2['species'], str(entry_2['_id']))
                ])
        if not entry_recorded:
            # No neighbour found: the gene is an island by itself
            islands.append([(entry_1['species'], str(entry_1['_id']))])

    return collapse_lists(islands)
Exemplo n.º 2
0
def output_compare_matrix():
    """
    Write pairwise comparison matrices for the species returned by `get_tree()`.

    For every pair of species that belong to different genera and share at
    least one CDS (according to `pair_compare()`), three symmetric matrices
    are filled in:
    - number of shared CDS              -> `cds_matrix.csv`
    - number of shared nucleotides      -> `nt_matrix.csv`
    - number of groups from `get_groups()` that contain both species
                                        -> `groups_matrix.csv`

    Pairs that are skipped keep the initial value of 0.
    """
    # The groups do not depend on the pair being compared, so they are built
    # once here instead of once per pair. `list()` keeps them re-iterable.
    groups = list(get_groups())
    all_species = get_tree()[1]
    print(all_species)

    def empty_matrix():
        return pd.DataFrame(
            data={n: 0 for n in all_species},
            index=all_species,
            columns=all_species
        )

    cds_df = empty_matrix()
    nt_df = empty_matrix()
    groups_df = empty_matrix()

    for species_1, species_2 in combinations(all_species, 2):
        print("====\nComparing {} and {}".format(species_1, species_2))

        # Cheap genus check first: no need to compare hits for these pairs
        if kv.get_genus(species_1) == kv.get_genus(species_2):
            print("Oops! They're the same genus... moving on\n====")
            continue

        shared_cds, shared_nt = pair_compare(species_1, species_2)
        if shared_cds == 0:
            print("Oops! Looks like they don't share anything... moving on\n====")
            continue

        # A group is shared when it has at least one member from each species
        shared_groups = sum(
            1 for group in groups
            if any(x[0] == species_1 for x in group)
            and any(y[0] == species_2 for y in group)
        )

        # `.loc` writes straight into the frame; chained `df[a][b] = v`
        # may write into a temporary copy instead.
        cds_df.loc[species_2, species_1] = cds_df.loc[species_1, species_2] = shared_cds
        nt_df.loc[species_2, species_1] = nt_df.loc[species_1, species_2] = shared_nt
        groups_df.loc[species_2, species_1] = groups_df.loc[species_1, species_2] = shared_groups
        print("shared cds: {}\nshared nt: {}\nshared groups: {}\n====".format(
            shared_cds, shared_nt, shared_groups
        ))

    print(nt_df)

    cds_df.to_csv('cds_matrix.csv')
    nt_df.to_csv('nt_matrix.csv')
    groups_df.to_csv('groups_matrix.csv')
Exemplo n.º 3
0
def core_hgt_blast(perc_identity='99'):
    """
    Run blastn for every species of the `core` collection against the core db.

    - `perc_identity`: value passed to blastn's `-perc_identity` option and
      used in the name of the result file (default = '99')

    For each species, the records of type 'gene' are written to a temporary
    fasta file, blastn is run with XML output (`-outfmt 5`) into
    `blast_results/core/<species>_<perc_identity>_blast.xml`, and the
    temporary fasta file is removed again.
    """
    results_dir = 'blast_results/core/'
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    for species in kv.get_collection('core').distinct('species'):
        query_fasta = 'blast_results/core/{}_tmp.fna'.format(species)

        with open(query_fasta, 'w+') as query_handle:
            for query in kv.get_collection('core').find({'species':species}):
                if query['type'] != 'gene':
                    continue
                fasta_entry = '>{0}|{1}\n{2}\n'.format(
                    query['species'], query['_id'], query['dna_seq']
                )
                query_handle.write(fasta_entry)

        print('Blasting {0}'.format(species))
        blastn_command = [
            'blastn',
            '-query', query_fasta,
            '-db', 'blast_databases/core',
            '-outfmt', '5',
            '-out', 'blast_results/core/{}_{}_blast.xml'.format(species, perc_identity),
            '-perc_identity', perc_identity
        ]
        # communicate() blocks until blastn has finished; its captured
        # stdout is not used, the results are in the `-out` file.
        Popen(blastn_command, stdout=PIPE).communicate()

        os.remove(query_fasta)
Exemplo n.º 4
0
def ssu_fasta():
    """
    Write one 16S entry per species to `16s.fna`.

    For every distinct species in the `16S` collection the first matching
    record is formatted with `kv.make_gene_fasta(..., to_file=False)` and
    written out. Species for which no record is found are printed instead.
    """
    with open('16s.fna', 'w+') as out_handle:
        for species in kv.get_collection('16S').distinct('species'):
            ssu = kv.get_collection('16S').find_one({'species':species})
            if not ssu:
                print(species)
                continue
            out_handle.write(kv.make_gene_fasta(ssu, to_file=False))
Exemplo n.º 5
0
def other_blast():
    """
    BLAST every core HGT group against the `other` database and store results.

    Groups from `core_hgt_groups()` are processed from largest to smallest.
    For each one, `kv.make_id_list_fasta(group, 'core')` is called, then
    `tmp.fna` is blasted against `blast_databases/other` (presumably the file
    written by the previous call -- verify in `kv`). One document
    `{'group': <1-based rank>, 'group_hits': [...]}` is inserted into the
    `hits` collection per group; `group_hits` is empty when there are no
    results.
    """
    groups_list = core_hgt_groups()
    groups_list.sort(key=len, reverse=True)

    for group_no, group in enumerate(groups_list, 1):
        kv.make_id_list_fasta(group, 'core')
        results = blast_vs_db('tmp.fna', 'blast_databases/other')

        hits_collection = kv.get_collection('hits')
        group_hits = list(results) if results else []
        hits_collection.insert_one({'group':group_no, 'group_hits':group_hits})
Exemplo n.º 6
0
def make_indexed_fasta(species):
    """
    Write `fastas/<species>_indexed.fna` from the indexed contigs of a species.

    - `species`: name of the Mongo collection holding the species' records

    The fasta file is only written when it does not exist yet. Each entry has
    the header `><species with spaces replaced by underscores>|<_id>`.
    """
    if not os.path.isdir('fastas/'):
        os.makedirs('fastas/')

    # `kv.index_contigs` is called twice because its result is iterated twice
    # -- presumably it cannot be re-iterated; verify in `kv`.
    first_pass = kv.index_contigs(kv.get_collection(species))
    second_pass = kv.index_contigs(kv.get_collection(species))
    fasta = 'fastas/{}_indexed.fna'.format(species)

    # NOTE(review): these ids are collected but never used afterwards. The
    # iteration is kept in case `index_contigs` does its work lazily.
    id_list = [record['_id'] for record in first_pass]

    if os.path.isfile(fasta):
        return

    with open(fasta, 'w+') as output_handle:
        for record in second_pass:
            species_label = record['species'].replace(' ', '_')
            output_handle.write(
                ">{}|{}\n{}\n".format(species_label, record['_id'], record['dna_seq'])
            )
Exemplo n.º 7
0
def pair_compare(species_1, species_2):
    """
    Count what `species_1` shares with `species_2` according to its hits.

    Reads the `hits` field of the `species_1` document in the `hits`
    collection. Every hit whose first element equals `species_2` counts as
    one shared CDS, and the length (`end - start`) of the hit's record in
    `species_2` is added to the shared nucleotides.

    Returns `(shared_CDS, shared_nt)`.
    """
    hits_by_gene = kv.get_collection('hits').find_one({'species':species_1})['hits']

    shared_CDS = 0
    shared_nt = 0
    for gene in hits_by_gene:
        gene_hits = hits_by_gene[gene]
        if not gene_hits:
            continue
        for hit in gene_hits:
            if hit[0] != species_2:
                continue
            shared_CDS += 1
            hit_loc = kv.get_mongo_record(hit[0], hit[1])['location']
            shared_nt += hit_loc['end'] - hit_loc['start']

    return shared_CDS, shared_nt
Exemplo n.º 8
0
def get_groups():
    """
    Build multi-species groups of genes from the islands of every species.

    For each document in the `hits` collection, every island returned by
    `get_islands()` is extended with the `(species, _id)` pairs of all hits
    of its genes (taken from the document's `hits` field). The extended
    islands are then merged with `collapse_lists()`.

    Returns a list of lists.
    """
    groups_list = []
    for species_doc in kv.get_collection('hits').find():
        # each island holds the `(species, _id)` entries of one species...
        for island in get_islands(species_doc['species']):
            # ...to which the id pairs of all hits of its genes are added
            hit_set = {
                (hit[0], hit[1])
                for gene_id in island
                for hit in species_doc['hits'][gene_id[1]]
            }
            island.update(hit_set)
            groups_list.append(list(island))

    # Islands are built independently per species, so the same genes show up
    # in several of them: merge lists sharing elements and deduplicate.
    return [list(group) for group in collapse_lists(groups_list)]
Exemplo n.º 9
0
def get_gc(some_gbk):
    """
    Get GC content for contigs in genbank file, output file for circos line plot

    - `some_gbk`: path to a genbank file

    For every record, windows centred on positions 500, 1500, 2500, ... are
    taken (`seq[b-500:b+499]`) and their GC content is computed with
    `SeqUtils.GC`. The result is written to `circos/GC/gc_<name>.txt`, where
    `<name>` is the basename of `some_gbk` without its last 13 characters.
    The file starts with three `#` comment lines holding the statistics,
    followed by one line per window:
    `<strain><record name> <start> <end> <gc>`.

    Returns `(min, max, average, standard deviation)` of the window values.
    Raises ValueError when the file yields no window at all (no records, or
    all records of 500 bp or less).
    """
    gc_points = []
    with open(some_gbk, 'r') as file_handle:
        for record in SeqIO.parse(file_handle, 'gb'):
            for b in range(500, len(record), 1000):
                gc_cont = SeqUtils.GC(record.seq[b-500:b+499])
                gc_points.append((record.name, b-500, b+499, gc_cont))

    if not gc_points:
        # Fail with a clear message instead of an opaque error further down
        raise ValueError("No GC windows could be computed from {}".format(some_gbk))

    # The name only depends on the file, so it is parsed once (it used to be
    # re-parsed for every record).
    sp_strain = kv.parse_genbank_name(some_gbk)[2]

    # Create the output directory if needed, like the other circos writers do
    if not os.path.isdir('circos/GC/'):
        os.makedirs('circos/GC/')

    values = [point[3] for point in gc_points]
    stats = (min(values), max(values), np.average(values), np.std(values))

    out_name = 'circos/GC/gc_{}.txt'.format(os.path.basename(some_gbk)[:-13])
    with open(out_name, 'w+') as out_handle:
        out_handle.write("# Min: {}\n# Max: {}\n# Avg, Std: {}, {}\n".format(
            stats[0], stats[1], stats[2], stats[3]
            )
        )
        for point in gc_points:
            out_handle.write('{0}{1} {2} {3} {4}\n'.format(
                sp_strain,
                point[0],
                point[1],
                point[2],
                point[3]
                )
            )
    return stats
Exemplo n.º 10
0
def core_hgt_groups(perc_identity='99'):
    """
    Returns multispecies groups of genes as list of lists.

    - Starts with species islands `get_islands()`
    - Each non-empty island is extended with the `(species, _id)` pairs of
      all hits of its genes, read from the `core_hits_<perc_identity>` field
    - Extended islands sharing elements are merged with `collapse_lists()`
    """
    all_hits = kv.get_collection('hits')
    hits_key = 'core_hits_{}'.format(perc_identity)

    groups_list = []
    for s in all_hits.distinct('species'):
        s_hits = all_hits.find_one({'species':s})
        # NOTE(review): `perc_identity` is not passed on to `get_islands()`,
        # so the islands always come from its default -- confirm intended.
        for island in get_islands(s_hits['species']):
            if not island:
                # many islands are empty, skip those
                continue
            hit_set = {
                (hit[0], hit[1])
                for gene_id in island
                for hit in s_hits[hits_key][gene_id[1]]
            }
            # add the id pairs of the hits to the island, then record it
            island.update(hit_set)
            groups_list.append(list(island))

    # Islands are built independently per species, so there is a lot of
    # redundancy: merge lists sharing elements and deduplicate.
    return [list(group) for group in collapse_lists(groups_list)]
Exemplo n.º 11
0
def get_karyotype(some_gbk):
    """
    Convert Genbank file into Karyotype file for Circos
    - Each contig is a "chromosome"
    - format: 'chr - ID LABEL START END COLOR'

    The output is written to `circos/karyotypes/karyotype_<name>.txt`, where
    `<name>` is the basename of `some_gbk` without its last 13 characters.
    All contigs share one randomly drawn colour. Writing stops at the first
    contig of 1000 bp or less (presumably contigs are expected to be ordered
    from longest to shortest -- confirm).
    """
    with open (some_gbk, 'r') as file_handle:
        sp_name = None
        contigs = []
        for record in SeqIO.parse(file_handle, 'gb'):
            sp_name = kv.parse_genbank_name(some_gbk)
            contigs.append((record.name, len(record)))
        sp_strain = sp_name[2]

        if not os.path.isdir('circos/karyotypes/'):
            os.makedirs('circos/karyotypes/')

        out_name = 'circos/karyotypes/karyotype_{}.txt'.format(os.path.basename(some_gbk)[:-13])
        with open(out_name, 'w+') as karyotype:
            color = [np.random.randint(0,255) for _ in range(3)]
            for contig_name, contig_length in contigs:
                if not contig_length > 1000:
                    break
                line = 'chr - {0}{1} {2} {3} {4} {5},{6},{7}\n'.format(
                    sp_strain, contig_name, contig_name, '1', contig_length, *color
                )
                karyotype.write(line)
Exemplo n.º 12
0
def core_hgt_stats(perc_identity='99'):
    """
    Write per-species HGT statistics to `stats.csv`.

    For every species of the `core` collection one row is written with:
    - `Total_CDS`: number of records of that species in `core`
    - `HGT_CDS`: number of genes with at least one hit in the species'
      `core_hits_<perc_identity>` field
    - `Islands`: number of islands returned by `get_islands()`
    """
    collection = kv.get_collection('core')
    df_index = ['Total_CDS', 'HGT_CDS', 'Islands']
    hits_key = 'core_hits_{}'.format(perc_identity)

    df = pd.DataFrame()
    for species in collection.distinct('species'):
        hits = kv.get_collection('hits').find_one({'species':species})[hits_key]
        total_cds = sum(1 for _ in collection.find({'species':species}))
        hgt_cds = sum(1 for gene in hits if hits[gene])
        # NOTE(review): `perc_identity` is not passed to `get_islands()`, so
        # the island count always uses its default -- confirm intended.
        island_count = len(get_islands(species))

        row = pd.Series([total_cds, hgt_cds, island_count], name=species, index=df_index)
        df = df.append(row)

    df.to_csv('stats.csv',  columns=df_index)
Exemplo n.º 13
0
def get_tree(core=False, newick=False):
    """
    Build a neighbour-joining tree of the species that have a 16S record.

    - `core`: if True only species of the `core` collection are used,
      otherwise species of the `other` collection are included too
    - `newick`: if True the tree is printed as built by
      `tree.nj(dm, result_constructor=str)` and nothing is returned

    The tree is printed as ascii art, followed by name and length of each tip.

    Returns `(tree, tips)` when `newick` is False; `tips` holds the tip names
    with spaces replaced by underscores.
    """
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))

    # `dm` used to be read without ever being defined (NameError). It is
    # built from the 16S distance matrix, restricted to the species that
    # actually have a 16S record.
    ssu_species = [n for n in all_species if kv.db['16S'].find_one({'species':n})]
    dm = DistanceMatrix(get_distance_matrix(core=core, to_file=False), ssu_species)

    t = tree.nj(dm)
    print(t.ascii_art())
    tips = []
    for node in t.tips():
        print('%s %s' % (node.name, node.length))
        tips.append(node.name.replace(' ', '_'))

    if newick:
        print(tree.nj(dm, result_constructor=str))
    else:
        return (t, tips)
Exemplo n.º 14
0
def output_distance_matrix(core=False, to_file=True):
    """
    Build the symmetric matrix of pairwise 16S distances between species.

    - `core`: if True only species of the `core` collection are used,
      otherwise species of the `other` collection are included too
    - `to_file`: if True the matrix is written to `distance_matrix.csv` and
      nothing is returned; otherwise the DataFrame is returned
    """
    # A stray closing parenthesis on this line used to make the whole module
    # fail with a SyntaxError.
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))

    distance_matrix = pd.DataFrame(data={n:0.0 for n in all_species}, index=all_species)

    # Includes each species paired with itself (the diagonal)
    for species_1, species_2 in combinations_with_replacement(all_species, 2):
        distance = get_16S_distance(species_1, species_2)
        # `.loc[row, column]` writes straight into the frame; chained
        # `df[a][b] = v` may write into a temporary copy instead.
        distance_matrix.loc[species_2, species_1] = distance
        distance_matrix.loc[species_1, species_2] = distance

    if to_file:
        distance_matrix.to_csv('distance_matrix.csv')
    else:
        return distance_matrix
Exemplo n.º 15
0
def make_species_fasta(species):
    """
    Write all records of a species to `fastas/<species>.fna`.

    - `species`: name of the Mongo collection holding the species' records

    Nothing is written when the fasta file already exists. Each entry has the
    header `><species with spaces replaced by underscores>|<_id>`.
    """
    if not os.path.isdir('fastas'):
        os.makedirs('fastas')

    fasta = 'fastas/{}.fna'.format(species)
    if os.path.isfile(fasta):
        return

    with open(fasta, 'w+') as output_handle:
        for record in kv.get_collection(species).find():
            species_label = record['species'].replace(' ', '_')
            output_handle.write(
                ">{}|{}\n{}\n".format(species_label, record['_id'], record['dna_seq'])
            )
Exemplo n.º 16
0
def output_all_16S():
    """
    Write every record of the `16S` collection to `<database name>_16S.fna`.

    Each entry uses the record's `species` as header and its `dna_seq` as
    sequence.
    """
    db_name = kv.db.name
    print("Making fasta of all 16S in database {}".format(db_name))

    with open('{}_16S.fna'.format(db_name), 'w+') as output_handle:
        for record in kv.get_collection('16S').find():
            entry = '>{0}\n{1}\n'.format(record['species'], record['dna_seq'])
            output_handle.write(entry)
Exemplo n.º 17
0
def blast_to_db(db='core', perc_identity='99'):
    """
    Parse BLAST XML results and store the hits of each species in MongoDB.

    - `db`: sub-directory of `blast_results/` to read; also the prefix of the
      field that is written (default = 'core')
    - `perc_identity`: only files ending in `<perc_identity>_blast.xml` are
      read; also the suffix of the field that is written (default = '99')

    Queries and hits are expected to be named `<species>|<id>`, with the
    species starting with `<genus>_`. A hit is ignored when it is from the
    same species or the same genus as the query, or when its record is of
    type '16S'.

    For each result file, the field `<db>_hits_<perc_identity>` of the query
    species' document in the `hits` collection is set to a dict
    `{query id: [(hit species, hit id), ...]}` holding only the queries that
    have hits. Files without any BLAST record are skipped.
    """
    blast_dir = 'blast_results/{}/'.format(db)
    result_suffix = '{}_blast.xml'.format(perc_identity)
    hits_field = '{}_hits_{}'.format(db, perc_identity)

    for f in os.listdir(blast_dir):
        if not f.endswith(result_suffix):
            continue

        # Reset per file: the species name used to leak from the previous
        # file (or be undefined) when a file held no records.
        query_name = None
        hits_dict = {}

        with open('blast_results/{}/{}'.format(db, f), 'r') as result_handle:
            for blast_record in NCBIXML.parse(result_handle):
                query_parse = re.search(r'(\w+)\|(\w+)', blast_record.query)
                query_genus = re.match(r'([A-Za-z]+)_', blast_record.query).group(1)
                query_name = query_parse.group(1)
                query_id = query_parse.group(2)

                hits_dict[query_id] = []

                for alignment in blast_record.alignments:
                    hit_parse = re.search(r'(\w+)\|(\w+)', alignment.hit_def)
                    hit_genus = re.match(r'([A-Za-z]+)_', alignment.hit_def).group(1)
                    hit_name = hit_parse.group(1)
                    hit_id = hit_parse.group(2)

                    if query_name == hit_name:
                        continue
                    if query_genus == hit_genus:
                        print("Oops! {} and {} are the same genus, skipping...".format(query_name, hit_name))
                        continue
                    if kv.get_mongo_record(hit_name, hit_id)['type'] == '16S':
                        print('Skipping 16S hit')
                        continue

                    print('=======\nhit for {0} detected:\nspecies: {1}\n======='.format(query_name, hit_name))
                    hits_dict[query_id].append((hit_name, hit_id))

        if query_name is None:
            print('No BLAST records in {}, nothing to update'.format(f))
            continue

        print('Updataing mongoDB with hits')
        hits_collection = kv.get_collection('hits')
        hits_collection.update_one(
            {'species':query_name},
            {'$set':{hits_field:{x:hits_dict[x] for x in hits_dict if hits_dict[x]}}},
            upsert=True
            )
Exemplo n.º 18
0
def get_distance_matrix(core=False, to_file=True):
    """
    Build the symmetric matrix of 16S distances between species with a 16S.

    - `core`: if True only species of the `core` collection are used,
      otherwise species of the `other` collection are included too
    - `to_file`: if True the matrix is written to `distance_matrix.csv` and
      nothing is returned; otherwise the DataFrame is returned

    Only species with a record in the `16S` collection are included. Cells
    for which `get_16S_distance()` returns a falsy value keep the initial 0.0.
    """
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))

    ssu_species = [n for n  in all_species if kv.db['16S'].find_one({'species':n})]
    zeros = {n:0.0 for n in ssu_species}
    distance_matrix = pd.DataFrame(data=zeros, index=ssu_species, columns=ssu_species)

    for species_1, species_2 in combinations_with_replacement(ssu_species, 2):
        distance = get_16S_distance(species_1, species_2)
        if not distance:
            continue
        distance_matrix[species_1][species_2] = distance
        distance_matrix[species_2][species_1] = distance

    if not to_file:
        return distance_matrix
    distance_matrix.to_csv('distance_matrix.csv')
Exemplo n.º 19
0
def get_links(group=None, perc_identity='99'):
    """
    Write a Circos links file connecting each gene to its HGT hits.

    - `group`: optional 1-based rank of a group from `core_hgt_groups()`
      (largest group first); when given, only genes belonging to that group
      are used as link sources
    - `perc_identity`: selects the `core_hits_<perc_identity>` field of the
      documents in the `hits` collection (default = '99')

    Output goes to `circos/links/all_links_<perc_identity>.txt`, or to
    `circos/links/group<group>_links_<perc_identity>.txt` when `group` is
    set. Each line holds contig, start and end of the gene followed by
    contig, start and end of the hit.

    Species without a `core_hits_<perc_identity>` field are skipped. If a
    record lacks an expected key, a message is printed and the remaining
    links of that species are skipped.
    """
    hits_collection = kv.get_collection('hits')
    hits_key = 'core_hits_{}'.format(perc_identity)
    if not os.path.isdir('circos/links/'):
        os.makedirs('circos/links/')

    group_hits = None
    out_name = 'circos/links/all_links_{}.txt'.format(perc_identity)
    if group:
        # Build the groups at the same identity as the links being written;
        # the argument used to be dropped here.
        groups = core_hgt_groups(perc_identity)
        group_hits = sorted(groups, key=len, reverse=True)[group - 1]
        out_name = 'circos/links/group{}_links_{}.txt'.format(group, perc_identity)

    with open(out_name, 'w+') as out_handle:
        for species in hits_collection.find():
            print(species)
            if hits_key not in species:
                # No hits stored for this species at this identity
                continue
            try:
                all_hits = species[hits_key]
                if group:
                    hits_to_write = {
                        gene:all_hits[gene] for gene in all_hits
                        if (species['species'], gene) in group_hits
                    }
                else:
                    hits_to_write = all_hits

                for gene in hits_to_write:
                    if not hits_to_write[gene]:
                        continue
                    s1_record = kv.get_mongo_record(species['species'], gene)
                    s1_strain = kv.parse_species_name(species['species'])
                    for hit in hits_to_write[gene]:
                        s2_record = kv.get_mongo_record(hit[0], hit[1])
                        s2_strain = kv.parse_species_name(hit[0])
                        out_handle.write('{0}kvc_{1} {2} {3} {4}kvc_{5} {6} {7}\n'.format(
                            s1_strain[2],
                            s1_record['location']['contig'],
                            s1_record['location']['start'],
                            s1_record['location']['end'],
                            s2_strain[2],
                            s2_record['location']['contig'],
                            s2_record['location']['start'],
                            s2_record['location']['end'],
                            )
                        )
            except KeyError as err:
                # Still best-effort, but no longer silent
                print('Skipping remaining links for {}: missing key {}'.format(
                    species.get('species'), err
                ))
Exemplo n.º 20
0
def get_islands(species_name, perc_identity='99'):
    """
    For each species, combines HGT hits co-occurring within 5kb of each other

    - `species_name`: value of the `species` field of a document in the
      `hits` collection
    - `perc_identity`: selects the `core_hits_<perc_identity>` field of that
      document (default = '99')

    Two genes are paired when they are on the same contig and the end of the
    first lies within 5000 bp of the start of the second; a gene that is
    never the first of such a pair becomes a single-element list.

    Returns the result of `collapse_lists()` applied to the list of lists of
    `(species, _id)` tuples.
    """
    def as_id(entry):
        return (entry['species'], str(entry['_id']))

    all_hits = kv.get_collection('hits')
    species_hits = all_hits.find_one({'species':species_name})['core_hits_{}'.format(perc_identity)]

    # Mongo record of every gene that has at least one hit
    species_hits_list = [
        kv.get_mongo_record(species_name, query_id)
        for query_id in species_hits
        if species_hits[query_id]
    ]

    islands = []
    for entry_1 in species_hits_list:
        entry_recorded = False
        for entry_2 in species_hits_list:
            if entry_1 == entry_2:
                continue
            if entry_1['location']['contig'] != entry_2['location']['contig']:
                continue
            distance = abs(entry_1['location']['end'] - entry_2['location']['start'])
            if distance <= 5000:
                entry_recorded = True
                islands.append([as_id(entry_1), as_id(entry_2)])
        if not entry_recorded:
            islands.append([as_id(entry_1)])

    return collapse_lists(islands)
Exemplo n.º 21
0
def group_hits(core=False):
    """
    Write `group_hits_other.csv`: per-species totals for every stored group.

    - `core`: if True only species of the `core` collection are used as rows,
      otherwise species of the `other` collection are included too

    One column `group_<n>` is produced for each distinct `group` value in the
    `hits` collection. For the members of the n-th largest group from
    `core_hgt_groups()`, the length of each member's `dna_seq` is added to
    its species. For the stored `group_hits` of that group, entries with
    `hit[2] > 90` and `hit[3] > 100` add `hit[2] * hit[3] / 100` to the
    species parsed from `hit[1]`. Each member or hit id is counted only once
    per group.
    """
    all_species = kv.get_collection('core').distinct('species')
    if not core:
        all_species.extend(kv.get_collection('other').distinct('species'))

    hits_db = kv.get_collection('hits')
    species_index = sorted(all_species)
    print(species_index)
    df = pd.DataFrame()
    core_groups = sorted(core_hgt_groups(), key=len, reverse=True)

    for group in sorted(hits_db.distinct('group')):
        column = 'group_{}'.format(group)
        recorded = []
        totals = dict.fromkeys(species_index, 0.0)

        # Members of the core group itself
        for hit in core_groups[group-1]:
            if hit in recorded:
                continue
            totals[hit[0]] += len(kv.get_mongo_record(*hit)['dna_seq'])
            recorded.append(hit)

        # Stored hits of the group that pass both thresholds
        for hit in hits_db.find_one({'group':group})['group_hits']:
            if not (float(hit[2]) > 90 and float(hit[3]) > 100):
                continue
            if hit[1] in recorded:
                continue
            totals[kv.fasta_id_parse(hit[1])[0]] += float(hit[2])*float(hit[3])/100
            recorded.append(hit[1])

        df[column] = pd.Series(totals, name=column)

    df.to_csv('group_hits_other.csv')

# if __name__ == '__main__':
#     import os
#     kv.mongo_init('pacbio2')
#     os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/')
#     # group_hits(core=True)
#     # output_groups()
#     # core_hgt_stats()
#     output_hits_csv()
Exemplo n.º 22
0
def get_tree(core=False, newick=False):
    """
    Build a neighbour-joining tree of the species that have a 16S record.

    - `core`: if True only species of the `core` collection are used,
      otherwise species of the `other` collection are included too
    - `newick`: if True the tree is printed as built by
      `tree.nj(dm, result_constructor=str)` and nothing is returned

    The tree is printed as ascii art, followed by name and length of each tip.

    Returns `(tree, tips)` when `newick` is False; `tips` holds the tip names
    with spaces replaced by underscores.
    """
    core_collection = kv.get_collection('core')
    all_species = core_collection.distinct('species')
    if not core:
        other_collection = kv.get_collection('other')
        all_species.extend(other_collection.distinct('species'))
    ssu_species = [n for n  in all_species if kv.db['16S'].find_one({'species':n})]

    distances = get_distance_matrix(core=core, to_file=False)
    dm = DistanceMatrix(distances, ssu_species)
    t = tree.nj(dm)
    print(t.ascii_art())

    tips = []
    for node in t.tips():
        print('%s %s' % (node.name, node.length))
        tips.append(node.name.replace(' ', '_'))

    if not newick:
        return (t, tips)
    print(tree.nj(dm, result_constructor=str))
Exemplo n.º 23
0
def all_by_all(species_1, species_2):
    """
    Collect pairs of records from the pairwise BLAST results of two species.

    Reads `pairwise_blast/<species_1>_<species_2>-blast_results.xml`. For
    each query only the first HSP of the first alignment is looked at; the
    pair is kept when that HSP's alignment length is above 100.

    Returns a list of `(query_record, alignment_record)` tuples, or None
    (after printing a message) when the results file cannot be opened.
    """
    results = 'pairwise_blast/{}_{}-blast_results.xml'.format(species_1, species_2)

    # `results` is a path, hence always truthy: testing it could never reach
    # the failure message. Report a missing/unreadable file instead.
    try:
        result_handle = open(results, 'r')
    except IOError:
        print("Blast didn't work for some reason")
        return None

    hits_list = []
    with result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            qsp, qid = kv.fasta_id_parse(blast_record.query)
            query_record = kv.get_mongo_record(qsp, qid)
            for alignment in blast_record.alignments:
                asp, aid = kv.fasta_id_parse(alignment.hit_def)
                alignment_record = kv.get_mongo_record(asp, aid)
                for hsp in alignment.hsps:
                    if hsp.align_length > 100:
                        hits_list.append((query_record, alignment_record))
                    # only the first HSP is considered
                    break
                # only the first alignment is considered
                break
    return hits_list
Exemplo n.º 24
0
def output_groups_by_species(min_group_size=2):
    """
    Write `groups_by_species.csv`, a presence table of species per group.

    - `min_group_size`: groups with fewer members are ignored (default = 2)

    Columns are the names from `kv.get_species_collections()`, rows are
    labelled '1' .. '<number of groups>'. Groups from `get_groups()` are
    taken from largest to smallest; for each group that is large enough, the
    cell of every member species is set to 1 in the next row.
    """
    all_species = kv.get_species_collections()
    groups_list = get_groups()
    groups_list.sort(key=len, reverse=True)

    row_labels = [str(x+1) for x in range(0, len(groups_list))]
    groups_df = pd.DataFrame(data={n:0 for n in all_species}, index=row_labels)

    group_no = 0
    for group in groups_list:
        if len(group) < min_group_size:
            continue
        group_no += 1
        species_in_group = [member[0] for member in group]
        for species in species_in_group:
            groups_df[species][group_no-1] = 1

    groups_df.to_csv('groups_by_species.csv')
Exemplo n.º 25
0
def output_groups(output_file='default', min_group_size=2):
    """
    Write a .csv file with one row for each CDS in an HGT group.

    - `output_file`: path of the file to write; 'default' writes `groups.csv`
    - `min_group_size`: groups with fewer members are ignored (default = 2)

    Groups come from `get_groups()` and are numbered from largest to
    smallest (zero-padded to 3 digits). Commas are removed from the
    annotation.
    """
    # The whole body used to be nested under this test, so nothing at all was
    # written when a custom `output_file` was passed.
    if output_file == 'default':
        output_file = 'groups.csv'

    df_index = [
        'groups',
        'species',
        'kvtag',
        'contig',
        'start',
        'end',
        'strand',
        'annotation',
        'dna_seq',
    ]
    df = pd.DataFrame()
    group_no = 0
    groups_list = get_groups()
    groups_list.sort(key=len, reverse=True)

    for group in groups_list:
        if len(group) < min_group_size:
            continue
        group_no += 1
        # Entry is `(species, id)`
        for entry in group:
            db_handle = kv.get_mongo_record(*entry)

            annotation = db_handle['annotation'].replace(',','')
            series = pd.Series(
                [str(group_no).zfill(3),
                db_handle['species'],
                db_handle['kvtag'],
                db_handle['location']['contig'],
                db_handle['location']['start'],
                db_handle['location']['end'],
                db_handle['location']['strand'],
                annotation,
                db_handle['dna_seq']
                ],
                index=df_index,
                name=db_handle['kvtag']
            )
            df = df.append(series)

    df.to_csv(output_file, columns=df_index)
Exemplo n.º 26
0
def add_ssu(supp_file):
    """
    Add 16S records from a CSV file to the `16S` collection.

    - `supp_file`: path of a CSV file readable by `pd.read_csv`; its columns
      `strain` and `16S` are used

    For every row with a non-null `16S` value, all existing 16S records of
    that strain are deleted and a new record is inserted with the strain as
    `species` and the value as `dna_seq`. The stored record is printed
    before and after the insert.
    """
    ssu_df = pd.read_csv(supp_file)
    for strain, ssu_seq in zip(ssu_df['strain'], ssu_df['16S']):
        print('%s %s' % (strain, ssu_seq))
        if pd.isnull(ssu_seq):
            continue

        gene_record = {
            'species':strain,
            'location':{
                'contig':None,
                'start':None,
                'end':None,
                'strand':None,
            },
            'annotation':'Small subunit ribosomal RNA',
            'dna_seq':ssu_seq,
            'kvtag':None,
            'type':'16S'
            }

        print("adding 16S gene!")
        ssu_collection = kv.get_collection('16S')
        # `delete_many` replaces the deprecated `Collection.remove`
        ssu_collection.delete_many({'species':strain})
        print(ssu_collection.find_one({'species':strain}))
        ssu_collection.insert_one(gene_record)
        print(ssu_collection.find_one({'species':strain}))
Exemplo n.º 27
0
def output_groups(min_group_size=2):
    """
    Writes a .csv file with information for each CDS in an HGT group

    - Optional: set minimum number of CDS to be considered a group

    The file is named `<database name>_groups.csv`. Groups come from
    `core_hgt_groups()` and are numbered from largest to smallest
    (zero-padded to 3 digits); within a group, entries are sorted by species.
    Rows are labelled with the species of the record.
    """
    output_file = '{}_groups.csv'.format(kv.db.name)
    df_index = ['group','kvtag','contig','start','end','strand','annotation','dna_seq']
    df = pd.DataFrame()
    groups_list = core_hgt_groups()
    groups_list.sort(key=len, reverse=True)

    group_no = 0
    for group in groups_list:
        if len(group) < min_group_size:
            continue
        group.sort(key=lambda entry:entry[0])
        group_no += 1

        for entry in group: # Entry is `(species, id)`
            record = kv.get_mongo_record(*entry)
            # commas in the annotation would break the CSV columns
            annotation = record['annotation'].replace(',','')
            row = [str(group_no).zfill(3), record['kvtag']]
            row.extend(record['location'][key] for key in ('contig', 'start', 'end', 'strand'))
            row.extend([annotation, record['dna_seq']])

            df = df.append(pd.Series(row, index=df_index, name=record['species']))

    df.to_csv(output_file, columns=df_index)
Exemplo n.º 28
0
    
    for i in range(len(groups_list)):
        group_hits = []
        kv.make_id_list_fasta(groups_list[i], 'core')
        results = blast_vs_db('tmp.fna', 'blast_databases/other')

        hits_collection = kv.get_collection('hits')
        if results:
            for j in range(len(results)):
                group_hits.append(results[j])
        hits_collection.insert_one({'group':(i+1), 'group_hits':group_hits})


if __name__ == '__main__':
    # Script entry point: rebuild the BLAST databases, reset the stored hits,
    # then run the blast / import steps at 90, 95 and 99 percent identity and
    # finally blast the groups against the `other` database.
    import sys
    kv.mongo_init('pacbio2')
    os.chdir('/Users/KBLaptop/computation/kvasir/data/output/pacbio2/')
    # kv.mongo_init(sys.argv[1])
    # os.chdir('output/{}/'.format(sys.argv[1]))
    for db_source in ('core', 'other'):
        make_blast_db(db_source)
    hits_reset()

    identities = ('90', '95', '99')
    # All BLAST runs are done before any result is imported
    for identity in identities:
        hgt_blast(perc_identity=identity)
    for identity in identities:
        blast_to_db(perc_identity=identity)

    other_blast()
Exemplo n.º 29
0
def make_blast_db(source, name=None, remove_source=True):
    """
    Produces BLAST database from `source`
    Optional - provide name (defaults to `source`)
    Set remove_source=False to keep fasta file (if created)

    Source types:
    - fasta file (use path, must end with `.fna`)
    - Mongo collection (use name of collection)
    - list of dicts containing at least keys `species`, `_id`, `dna_seq`
    - Mongo cursor eg. `collection.find({'key':value})`

    A fasta file given as `source` is never removed. For the other source
    types the sequences are first written to `<database name>_all.fasta`.
    If no name can be derived it is asked for interactively.

    An existing file that does not end with `.fna` is reported and nothing
    is built. Raises TypeError for any other unsupported `source`.
    """
    # If there's no directory for blast db's, create one
    if not os.path.isdir('blast_databases/'):
        os.makedirs('blast_databases/')

    # Only text can be a path or a collection name. `os.path.isfile` raises
    # TypeError for lists and cursors, which made those source types unusable.
    source_is_text = isinstance(source, (str, type(u'')))

    if source_is_text and os.path.isfile(source):
        # Input is fasta file?
        if not source.endswith('.fna'):
            print("Not a valid file type, use .fna")
            # Stop here: there is no fasta to hand to makeblastdb
            return
        output_fasta = source
        if not name:
            name = os.path.basename(source)[:-4]
        remove_source = False
    else:
        # Work out where the genes come from before creating any file
        if source_is_text and source in kv.get_collections():
            genes = kv.get_collection(source).find()
            if not name:
                name = source
        elif isinstance(source, (list, Cursor)):
            genes = source
        else:
            raise TypeError(
                "source must be a .fna file, a collection name, "
                "a list of records or a Mongo cursor"
            )

        output_fasta = '{0}_all.fasta'.format(kv.db.name)
        with open(output_fasta, 'w+') as output_handle:
            for gene in genes:
                output_handle.write('>{0}|{1}\n{2}\n'.format(
                    gene['species'],
                    gene['_id'],
                    gene['dna_seq'],
                    )
                )

    while not name:
        name = str(raw_input("enter name for BLAST database: "))

    # calls makeblastdb from shell
    print("making a database!")
    Popen(
        ['makeblastdb',
        '-in', output_fasta,
        '-dbtype', 'nucl',
        '-out', 'blast_databases/{0}'.format(name),
        '-title', name,
        ]
    ).wait() # waits for this operation to terminate before moving on

    if remove_source:
        os.remove(output_fasta)
Exemplo n.º 30
0
def hits_reset():
    """
    Remove the `hits` collection by calling `kv.remove_collection`.
    """
    collection_name = 'hits'
    kv.remove_collection(collection_name)