def blast_dico2taxonomy_info(blast_dico):
    """Attach a taxonomic classification to every hit of every BLAST query.

    For each query accession, the ``subject_taxid`` field of the first
    record of every hit is split on ``";"``, all resulting taxon ids are
    classified with a single call to
    ``sequence_id2scientific_classification.taxon_id2scientific_classification``
    and the classifications are put back next to the hit they came from.
    Progress (query index, accession and collected taxon ids) is printed.

    Args:
        blast_dico: nested mapping
            ``{query_accession: {hit_accession: [record, ...]}}`` where
            ``record['subject_taxid']`` is a ``";"``-separated string of
            taxon ids. Only the first record of each hit is read.

    Returns:
        dict: ``{query_accession: {hit_accession: [classification, ...]}}``
        with one entry per taxon id of the hit, in the order they appear
        in ``subject_taxid``; ``None`` stands for a taxon id missing from
        the classification lookup.
    """
    import sequence_id2scientific_classification

    accession2hits_classification = {}
    for n, accession in enumerate(blast_dico):
        print(n, accession)

        # Split each hit's taxon ids once and reuse them for both the
        # lookup request and the per-hit result.
        hit2taxon_ids = {
            hit_accession: records[0]['subject_taxid'].split(";")
            for hit_accession, records in blast_dico[accession].items()
        }
        all_taxon_ids = [
            taxon
            for taxon_ids in hit2taxon_ids.values()
            for taxon in taxon_ids
        ]

        print("ids:", all_taxon_ids)

        # One lookup per query accession, covering all of its hits.
        taxon_id2classification = sequence_id2scientific_classification.taxon_id2scientific_classification(all_taxon_ids)

        accession2hits_classification[accession] = {
            hit_accession: [taxon_id2classification.get(taxon) for taxon in taxon_ids]
            for hit_accession, taxon_ids in hit2taxon_ids.items()
        }

    return accession2hits_classification
示例#2
0
def accession2full_taxonomic_path(accession_list, database="nucleotide"):
    """Classify the taxa associated with a list of accessions.

    The accessions are resolved with ``accession2taxon_id`` and the
    values of its result (presumably taxon ids keyed by accession --
    verify against that helper) are handed to
    ``sequence_id2scientific_classification.taxon_id2scientific_classification``.

    Args:
        accession_list: accessions forwarded to ``accession2taxon_id``.
        database: database name forwarded to ``accession2taxon_id``
            (default ``"nucleotide"``).

    Returns:
        Whatever ``taxon_id2scientific_classification`` returns for the
        resolved taxon ids.
    """
    import sequence_id2scientific_classification as classifier

    accession2taxid = accession2taxon_id(accession_list, database)
    return classifier.taxon_id2scientific_classification(
        accession2taxid.values())
def accession2taxon_rank(accession, rank='phylum'):
    """Return the value of one taxonomic rank for a nucleotide accession.

    The accession is searched in the NCBI ``nuccore`` database (Entrez
    esearch), the first id found is linked to the ``taxonomy`` database
    (Entrez elink) and the first linked taxon id is classified with
    ``sequence_id2scientific_classification.taxon_id2scientific_classification``.

    Args:
        accession: search term passed to esearch.
        rank: key read from the classification of the taxon
            (default ``'phylum'``).

    Returns:
        The value stored under ``rank`` for the taxon, or ``False`` when
        the classification has no entry for the taxon or for the rank
        (the accession, NCBI id and available classification are
        printed in that case).

    Raises:
        IndexError: if esearch returns no id or elink returns no link;
            this is not handled here.
    """
    from Bio import Entrez
    import sequence_id2scientific_classification

    Entrez.email = "*****@*****.**"

    search_handle = Entrez.esearch(db="nuccore", term=accession)
    try:
        search_record = Entrez.read(search_handle)
    finally:
        search_handle.close()

    ncbi_id = search_record['IdList'][0]

    link_handle = Entrez.elink(dbfrom="nuccore", db="taxonomy", id=ncbi_id)
    try:
        link_record = Entrez.read(link_handle)
    finally:
        link_handle.close()

    taxon_id = link_record[0]['LinkSetDb'][0]['Link'][0]['Id']

    taxo_data = sequence_id2scientific_classification.taxon_id2scientific_classification(
        [taxon_id])
    try:
        return taxo_data[taxon_id][rank]
    except (KeyError, TypeError):
        # .get(): the taxon id itself may be the missing key, and
        # indexing it again here would raise from inside the handler.
        print(accession, ncbi_id, taxo_data.get(taxon_id))
        return False
示例#4
0
def _update_taxon_ranks(cursor, conn, taxon_id, rank2value):
    """Write every rank of one taxon into its ``blastnr_taxonomy`` row.

    Each rank is stored in the column named after it (spaces replaced by
    underscores) with one UPDATE + commit per rank. A failing update is
    reported and skipped so the remaining ranks are still written.

    Returns:
        bool: True when every update succeeded, False otherwise.
    """
    import MySQLdb

    all_ok = True
    for rank, value in rank2value.items():
        # Column names cannot be bound as parameters. Backticks are
        # doubled so the name cannot escape its quoting, and '%' is
        # doubled because the statement is %-formatted with the bound
        # parameters by the driver.
        column = rank.replace(' ', '_').replace('`', '``').replace('%', '%%')
        sql = 'UPDATE blastnr_taxonomy SET `' + column + '`=%s where taxon_id=%s'
        try:
            cursor.execute(sql, (value, taxon_id))
            conn.commit()
        except MySQLdb.Error:
            all_ok = False
            print('could not insert rank', sql, (value, taxon_id))
    return all_ok


def insert_taxons_into_sqldb(taxon_id_list,
                             chunk_size=300,
                             mysql_host='localhost',
                             mysql_user='******',
                             mysql_pwd='wnkonwn',
                             mysql_db='blastnr'):
    """Insert taxa and their classification into the ``blastnr_taxonomy`` table.

    The taxon ids are classified in chunks of ``chunk_size`` with
    ``sequence_id2scientific_classification.taxon_id2scientific_classification``
    (with a 60 second pause every 100 chunks, presumably to spare the
    remote service -- confirm). Then, for each taxon id, a row is
    inserted and one column per rank is filled in. Taxa already present
    in the table (IntegrityError on insert) are left untouched, ids equal
    to ``'N/A'`` are skipped, and a taxon missing from the chunked lookup
    is looked up once more on its own before giving up.

    Values are passed to the driver as bound parameters rather than
    being interpolated into the SQL text, so quotes inside taxon names
    cannot break the statement.

    Args:
        taxon_id_list: sequence of taxon ids (``len()`` is taken of it).
        chunk_size: number of taxon ids per classification request.
        mysql_host: MySQL server host.
        mysql_user: MySQL user name.
        mysql_pwd: MySQL password.
            NOTE(review): a password default is hard-coded in the
            signature; prefer passing credentials explicitly.
        mysql_db: name of the database holding ``blastnr_taxonomy``.

    Returns:
        None. Progress and failures are printed.
    """
    import time
    import MySQLdb
    import sequence_id2scientific_classification

    classify = sequence_id2scientific_classification.taxon_id2scientific_classification

    conn = MySQLdb.connect(
        host=mysql_host,  # your host, usually localhost
        user=mysql_user,  # your username
        passwd=mysql_pwd,  # your password
        db=mysql_db)  # name of the data base
    try:
        cursor = conn.cursor()

        taxid2classification = {}

        # Materialised so len() below also works if _chunks is a generator.
        id_lists = list(_chunks(taxon_id_list, chunk_size))
        for i, one_list in enumerate(id_lists):
            print(i, "/", len(id_lists))
            if i % 100 == 0 and i != 0:
                time.sleep(60)
            taxid2classification.update(classify(one_list))

        print('Number of taxon id retrieved:', len(taxid2classification))

        print('Updating blastnr_taxonomy table with %s new taxons' %
              str(len(taxon_id_list)))
        for taxon_id in taxon_id_list:
            if taxon_id == 'N/A':
                continue

            try:
                cursor.execute(
                    'INSERT INTO blastnr_taxonomy(taxon_id) values (%s)',
                    (taxon_id,))
                conn.commit()
            except MySQLdb.IntegrityError:
                print('Taxon %s already in database' % str(taxon_id))
                continue

            try:
                rank2value = taxid2classification[taxon_id]
            except KeyError:
                print(
                    'Could not add the following taxon: %s, trying again to get the data...'
                    % taxon_id)
                temp_dico = classify([taxon_id])
                try:
                    rank2value = temp_dico[taxon_id]
                except KeyError:
                    print('Could not add %s' % taxon_id)
                    print("temp_dico", temp_dico)
                    continue
                if _update_taxon_ranks(cursor, conn, taxon_id, rank2value):
                    print('sucess!')
            else:
                _update_taxon_ranks(cursor, conn, taxon_id, rank2value)
    finally:
        # The connection is local to this call; release it even on error.
        conn.close()
示例#5
0
    import argparse
    import sequence_id2scientific_classification
    parser = argparse.ArgumentParser()
    parser.add_argument("-t",
                        '--taxon_id',
                        type=str,
                        help="taxon ncbi id",
                        nargs='+')
    parser.add_argument("-r",
                        '--rank',
                        action="store_true",
                        help="get taxon rank ")

    args = parser.parse_args()

    classif = sequence_id2scientific_classification.taxon_id2scientific_classification(
        args.taxon_id)
    #print classif
    if not args.rank:

        for accession in classif:
            data = classif[accession]
            try:
                superkingdom = data["superkingdom"]
            except:
                superkingdom = "-"
            try:
                phylum = data["phylum"]
            except:
                phylum = "-"
            try:
                class_ = data["class"]