Пример #1
0
def fetch_refseq(path, strain_lst, species_to_search='Mycoplasma genitalium'):
    """Download RefSeq GenBank files for one species and rename them by accession.

    This is Python 2 code (it relies on ``csv_reader.next()`` and, in the
    disabled branch, on the ``print`` statement). Downloading, unpacking and
    renaming are all delegated to shell commands (``wget``, ``gunzip``,
    ``mv``) run through ``os.system``; their exit codes are not checked.

    Parameters:
        path: prefix that is concatenated directly with file and directory
            names, so it has to end with a path separator already.
        strain_lst: not read anywhere in this function.
        species_to_search: substring searched for in field index 7 of every
            assembly_summary row.

    Returns:
        None. The results are the files written to ``<path>downloadlink.txt``
        and ``<path>input_GenBank/``.
    """
    import os, sys, time, glob, csv
    from Bio import GenBank
    from sf_miscellaneous import write_pickle
    # species_to_search
    # Fetch the newest RefSeq assembly_summary file.
    # NOTE(review): the '>' redirect captures wget's standard output into
    # <path>assembly_summary.txt, while the open() below reads
    # 'assembly_summary.txt' relative to the current working directory --
    # confirm that both are intended.
    os.system('wget -c ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt > %sassembly_summary.txt'%path)
    with open('assembly_summary.txt','rb') as csvfile:
        outfile='downloadlink.txt'
        with open(path+outfile,'wb') as output:
            csv_reader = csv.reader(csvfile, delimiter='\t')
            # Consume the first row; `headers` is not used afterwards.
            headers = csv_reader.next()
            for icsv_line in csv_reader:
                # Keep rows whose field 7 contains the species name and whose
                # field 11 contains 'Complete' (presumably organism name and
                # assembly level -- confirm against the assembly_summary format).
                if species_to_search in icsv_line[7] and 'Complete' in icsv_line[11]:
                    #os.system('wget -c %s/%s'%(icsv_line[19],'*_genomic.gbff.gz -P ./Refseq/Mt'))
                    # Field 19 is used as the base URL; one wildcard download
                    # link is written per matching row.
                    output.write('%s/%s\n'%(icsv_line[19],'*_genomic.gbff.gz'))

    # Download every link from the list into <path>input_GenBank/ and unpack.
    gbk_path='%sinput_GenBank/'%path
    command_download='wget -c --input %sdownloadlink.txt -P %s'%(path,gbk_path)
    os.system(command_download)
    command_gunzip='gunzip %s*.gz'%gbk_path
    os.system(command_gunzip)
    # Rename each unpacked file to <first accession of its first record>.gbk.
    for each_gbk_path in glob.iglob('%s*gbff*'%gbk_path):
        with open(each_gbk_path) as gbk_file:
            for record in GenBank.parse(gbk_file):
                print(each_gbk_path,record.accession[0])
                # Only the first record is needed to name the file.
                break
            # NOTE(review): `record` is read after the loop; for a file without
            # any record it is unbound (first file) or left over from the
            # previous file.
            os.system('mv %s %s%s.gbk'%(each_gbk_path, gbk_path, record.accession[0]))

    # Disabled legacy workflow: the condition is the constant 0, so nothing
    # below ever runs.
    if 0:
        os.chdir(path)
        species=glob.glob('*txt')[0].split('_list.')[0]
        os.system('rm *txt; rm *sh')
        os.system('gunzip *')
        # Poll until no .gz file is left in the directory.
        while len(glob.glob('*.gz'))!=0:
            time.sleep(5)
        # Rename each GenBank file to the first accession of its first record.
        for each_gbk_path in glob.iglob('*gbff*'):
            with open(each_gbk_path) as handle:
                print handle
                for record in GenBank.parse(handle):
                    print(each_gbk_path,record.accession[0])
                    break
            os.system('mv %s %s'%(each_gbk_path, record.accession[0]))
        # Append the .gbk extension to every entry in the directory.
        for each_gbk_path in glob.iglob('*'):
            os.system('mv %s %s.gbk'%(each_gbk_path, each_gbk_path))
            #os.system('mv %s %s'%(each_gbk_path, each_gbk_path.split('.')[0]))
        # Write the list of accessions (file names without .gbk) to
        # <species>-RefSeq.txt and print line/file counts.
        os.system('ls *gbk > %s-RefSeq.txt; sed -i -- "s/.gbk//g" *txt'%species)
        os.system('wc -l *txt ; ls *gbk |wc -l')
        # `path` is rebound here to the pan-genome-analysis checkout.
        path='../../pan-genome-analysis/'
        os.system('cp %srun-TestSet-template.sh %srun-%s.sh; sed -i -- "s/TestSet/%s/g" %srun-%s.sh'%(path,path,species,species,path,species))
        os.system('mv ../%s/ %sdata/'%(species,path))
Пример #2
0
def main():
    """Group mined sentence tokens by tag and append a summary to summary.txt.

    Literals are taken from two sources: RDF files found in ``shb_records``
    (only triples whose predicate is in ``predicate_whitelist``) and GenBank
    records found in ``gbk_records`` (converted to a graph through the
    database handler's "genbank" mapping). Every literal is passed to
    ``mine_sentences``; each returned item contributes its element 0 to the
    list stored under its element 1.

    For every group the de-duplicated values are printed and appended to
    ``summary.txt`` as ``<key> - <value>-<value>-...``. The order of the
    values within a line is not defined because they pass through a ``set``.
    """
    sorted_pos_tagged = {}
    db_handler = DatabaseHandler()

    def collect(literal):
        # Shared by both sources; the original repeated this loop verbatim and
        # reused the name `p`, shadowing the predicate of the enclosing triple.
        for tagged in mine_sentences(literal):
            sorted_pos_tagged.setdefault(tagged[1], []).append(tagged[0])

    for entry in os.scandir(shb_records):
        graph = rdflib.Graph()
        graph.load(entry.path)
        for _subject, predicate, obj in graph:
            if predicate in predicate_whitelist and isinstance(obj, rdflib.Literal):
                collect(obj)

    for entry in os.scandir(gbk_records):
        with open(entry) as handle:
            for record in GenBank.parse(handle):
                graph = db_handler._db_util.db_mapping_calls[
                    "genbank"].generalise_get_results(record)
                for _subject, _predicate, obj in graph:
                    if isinstance(obj, rdflib.Literal):
                        collect(obj)

    # The context manager guarantees the summary is flushed and closed; the
    # original left the handle open.
    with open("summary.txt", "a+") as summary:
        for key, values in sorted_pos_tagged.items():
            unique_values = list(set(values))
            print(key, unique_values)
            print("\n")
            summary.write(f'{key} - {"-".join(unique_values)}\n')
Пример #3
0
def main():
	"""Write one FASTA file per PubMed ID found in the GenBank records of INFILE.

	Each record is grouped under the ``pubmed_id`` of its first reference and
	every group is handed to ``qfas`` together with the output path
	``OUTDIR/<pubmed_id>.fasta``. Records that have no reference, or whose
	first reference carries an empty PubMed ID, are left out.
	"""
	try:
		os.mkdir(OUTDIR)
	except OSError:
		# Parenthesised single-argument form prints the same text on
		# Python 2 and Python 3.
		print('Using existing directory %s' % OUTDIR)
	with open(INFILE) as handle:
		records = list(GenBank.parse(handle))
	# Group in a single pass; the original rescanned every record once per
	# publication and raised IndexError on a record without references.
	by_pubmed = {}
	for rec in records:
		if not rec.references:
			continue
		pubmed_id = rec.references[0].pubmed_id
		if not pubmed_id:
			continue
		by_pubmed.setdefault(pubmed_id, []).append(rec)
	for pub, pub_records in by_pubmed.items():
		qfas(pub_records, OUTDIR+'/'+pub+'.fasta')
Пример #4
0
def load_samples(sequences):
    """Read data/genbank_sequences.gb and add every usable CDS translation to *sequences*.

    Records whose definition contains 'partial' are skipped. For the other
    records a sample id is built from (in this order, when available) a
    truncated country name, the normalised collection date and the first
    accession, joined with '-'.

    For each CDS feature the gene name is taken from the first word of
    '/product=' (upper-cased, mapped through ``geneAlias``); if that is not in
    ``validList`` the '/gene=' value is tried instead. Features whose name is
    still not in ``validList`` are ignored.

    Parameters:
        sequences: mapping from gene name to a list; ``'<id>|<translation>'``
            strings are appended to ``sequences[gene_name]``. ORF1AB entries
            are passed to ``loadOrf1AB(sequences, id, translation)`` instead.

    Returns:
        None; *sequences* is modified in place.
    """
    with open("data/genbank_sequences.gb") as handle:
        # Use Biopython to parse the GenBank records.
        for record in GenBank.parse(handle):
            # Skip partial sequence records.
            if 'partial' in record.definition:
                continue

            # For now the id of a sample includes a truncated version of the
            # country of origin, the collection date and the accession number.
            accession = record.accession[0]
            # Reset per record: without this a record lacking a source feature
            # raised NameError or silently reused the previous record's values.
            country = None
            col_date = None
            source = findFeature(record, 'source')
            if source is not None:
                country = findItem(source, '/country=')
                col_date = findItem(source, '/collection_date=')
            sample_id = accession
            if col_date is not None:
                # Time formatting is not consistent between records.
                dt = dateutil.parser.parse(col_date)
                norm_date = dt.strftime(r'%Y-%m-%d')
                sample_id = norm_date + '-' + sample_id
            if country is not None:
                country = country.replace(':', ' ')
                sample_id = country.split()[0][:7].strip() + '-' + sample_id

            # For each CDS feature.
            for gene in findAllFeature(record, 'CDS'):
                # First figure out the gene / protein name. Try both the
                # product tag and the gene tag.
                # NOTE: We are not interested in the post translation non
                #       structural protein products. We just process the orf1ab
                #       gene that has all of them embedded in it.
                # Reset per feature: a feature without '/product=' used to
                # inherit the name resolved for the previous feature.
                gene_name = None
                product = findItem(gene, '/product=')
                if product is not None:
                    gene_name = product.split()[0].upper()
                    # A few proteins have aliases, map them to the standard form.
                    if gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]

                if gene_name is None or gene_name not in validList:
                    gene_name = findItem(gene, '/gene=')
                    if gene_name is not None and gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]

                if gene_name not in validList:
                    continue

                sequence = findItem(gene, '/translation=')
                if sample_id is not None and sequence is not None:
                    if gene_name == 'ORF1AB':
                        loadOrf1AB(sequences, sample_id, sequence)
                    else:
                        sequences[gene_name].append(sample_id + '|' + sequence)
Пример #5
0
 def _get_organella(self, gb_file):
     """
     Retrieve the organelle of each record in a GenBank file, using the
     specific GenBank object, because SeqIO does not support this field.

     Only the qualifiers of the first feature of each record are inspected.
     Records without any feature, or without an "/organelle=" qualifier on
     that first feature, do not appear in the result.

     :param gb_file: path of the GenBank file to read
     :return: dict mapping ``record.version`` to the organelle value with
         double quotes removed
     """
     organella = {}
     with open(gb_file, "r") as gbh:
         for record in GenBank.parse(gbh):
             if not record.features:
                 # Nothing to inspect; indexing features[0] would raise.
                 continue
             for q in record.features[0].qualifiers:
                 if q.key == "/organelle=":
                     organella[record.version] = q.value.replace('"', '')
     return organella
Пример #6
0
 def parse_genebank_file(self):
     """
     Parse ``self.gene_bank_file`` and store the first accession of each
     record in ``self.Acc`` (the value from the last record wins).

     For every record the features, accession list and GI are printed.
     ``self.Acc`` and ``self.GI`` are set to "NA" when they are None or empty.

     NOTE(review): ``self.GI`` is only normalised here, never filled from
     ``record.gi`` -- confirm whether that assignment is missing.
     """
     # The "U" flag was removed in Python 3.11; plain text mode already
     # translates newlines on Python 3.
     with open(self.gene_bank_file, "r") as input_handle:
         for record in GenBank.parse(input_handle):
             print(record.features)
             print(record.accession)
             print("----")
             print(record.gi)
             print("----")
             # Indexing an empty accession list raised IndexError before the
             # "NA" fallback below could apply.
             self.Acc = record.accession[0] if record.accession else "NA"

             if self.GI is None or len(self.GI) == 0:
                 self.GI = "NA"
             if self.Acc is None or len(self.Acc) == 0:
                 self.Acc = "NA"
Пример #7
0
def get_record_list():
    """
    Build one dictionary per GenBank record in INPUT_PATH and format the result.

    Each dictionary holds the record's locus under "locus" and a fresh list of
    its features under "features". The collected list is handed to
    format_record_list, whose return value is returned unchanged.
    """
    with open(INPUT_PATH) as source:
        collected = [
            {"locus": entry.locus, "features": list(entry.features)}
            for entry in GenBank.parse(source)
        ]

    return format_record_list(collected)
    def readGeneBankFile(file):
        """
            Author: Marco Roelfes
            readGeneBankFile(file)
            file: path of a GenBank file
            Reads a GenBank file into a dictionary with one entry per record
            (key: title built from definition + accession + locus + source,
            value: sequence as parsed by SeqIO).
            Returns the dictionary; it is empty when the file holds no record.
        """
        sequences = {}
        # The file is parsed twice in lockstep: SeqIO supplies the Seq object,
        # the GenBank parser supplies the header fields. Both walk the records
        # in file order, so zip pairs each sequence with its own header.
        # The original stored a single entry after both loops had finished,
        # which kept only the last record and failed on an empty file.
        with open(file, "r") as seq_handle, open(file) as header_handle:
            paired = zip(SeqIO.parse(seq_handle, "genbank"),
                         GenBank.parse(header_handle))
            for gb_record, record in paired:
                title = ">" + record.definition + "accesion: " + record.accession[0] + " " + record.locus + " " + record.source
                sequences[title] = gb_record.seq
        return sequences
Пример #9
0
def main():
    """Write the neighbouring genes of every eCIS locus to the output file.

    Command-line arguments come from ``get_args``; a missing value falls back
    to 'ecis-screen_summary.txt' (input), 'cis_locus_genes.txt' (output) and
    'genomes' (GenBank directory).

    For each assembly in the eCIS-screen summary the file
    ``<gbk_dir>/<assembly>/<assembly>_genomic.gbff.gz`` is read. For every
    record whose first accession is listed for that assembly, the lines
    returned by ``get_neighbors`` are written, followed by a blank line.
    """
    # Get command-line arguments and fill in defaults for missing ones.
    args = get_args()
    in_file = 'ecis-screen_summary.txt' if args.in_file is None else args.in_file
    out_file = 'cis_locus_genes.txt' if args.out_file is None else args.out_file
    gbk_dir = 'genomes' if args.gbk_dir is None else args.gbk_dir
    # Read eCIS-screen output file.
    cis_data = read_cis_summary(in_file)
    with open(out_file, 'w') as outfile:
        for assembly in cis_data:
            # Make GenBank file path.
            gbk_file = os.path.join(gbk_dir, assembly,
                                    assembly + '_genomic.gbff.gz')
            print('Now reading', gbk_file)
            # Close every archive as soon as it has been read; the original
            # left one open gzip handle behind per assembly.
            with gzip.open(gbk_file, 'rt') as gbk_handle:
                for seq_record in GenBank.parse(gbk_handle):
                    accession = seq_record.accession[0]
                    if accession not in cis_data[assembly]:
                        continue
                    print('Sequence found', accession)
                    # Get list of genes.
                    neighbors = get_neighbors(assembly, seq_record,
                                              cis_data[assembly][accession])
                    # Write list of genes to the output file.
                    outfile.write('\n'.join(neighbors) + '\n\n')
Пример #10
0
    # Write logs to STDOUT
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)

    dat = []

    # Download viral genomes from NCBI
    for url in [
            "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.1.protein.gpff.gz",
            "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.2.protein.gpff.gz",
    ]:
        run_cmds(["wget", url])
        fp = url.split("/")[-1]
        with gzip.open(fp, "rt") as handle:
            for ix, record in enumerate(GenBank.parse(handle)):
                taxid = None
                product = None
                locus_tag = None
                coded_by = None
                genome = None
                genome_range = None

                for feature in record.features:
                    for qualifier in feature.qualifiers:
                        if feature.key == "source":
                            if qualifier.key == "/db_xref=":
                                taxid = qualifier.value.strip('"')
                                taxid = taxid.replace("taxon:", "")
                        elif feature.key == "Protein":
                            if qualifier.key == "/product=":
Пример #11
0
        idnum = (record['IdList'][1])
    else:
        idnum = (record['IdList'][0])
    handle.close()

    # Retrieve the record from the Entrez protein database
    # using the ID number saved in the previous step;
    # the result is requested in GenBank text format
    handle2 = Entrez.efetch(db="protein",
                            id=idnum,
                            rettype="gb",
                            retmode="text")

    # Parse the GenBank record to find the data needed for the gene_info table
    # (version, length and full name of the gene) and store the result
    # in a new file so the rows can be inserted easily
    for record in GenBank.parse(handle2):
        version = record.version + "\t"
        length = record.size + "\t"
        fullinfo = (record._definition_line()).strip()
        fullinfo = fullinfo.replace("DEFINITION ", "").strip()
        fullinfo = re.sub("\[Chlamydia.*\n*.*].", "", fullinfo).strip()
        insert = locus + version + symbol + length + fullinfo + '\n'
        outFile.write(insert)
    handle2.close()

# Close the cursor, connection and outFile
cursor.close()
conn.close()
outFile.close()
def _read_file(fn):
    """Parse the GenBank file at *fn* and return all of its records as a list."""
    with open(fn) as source:
        return list(GenBank.parse(source))
Пример #13
0
#!/usr/bin/python3
import sys
import Bio
from Bio import GenBank

# Data for the Restriction_Enzyme table: enzyme name -> one-element set
# holding a DNA string.
Enz_name = {'EcoRI':{"GAATTC"}, 'BamHI':{"GGATCC"},'BsuMI':{"CTCGAG"}}

# Create a record parser and parse the file "chrom_CDS_8" with it.
# NOTE(review): the name `record` is rebound by the loop below, and the file
# object opened here is never closed explicitly.
parser = GenBank.RecordParser()
record = parser.parse(open("chrom_CDS_8"))
# Empty list meant to collect all extracted information.
List = []

# A single dictionary object, filled anew for each record of the loop.
Insert = {}
for record in GenBank.parse(open("chrom_CDS_8")):
	"""
	To extract each piece of information and insert values from each gene one at a time empty dictionary 
	"""
	# Obtain the accession list and the DNA sequence of the record.
	Insert['accession'] = record.accession
	Insert['seq'] = record.sequence
	# Flags, initialised to 0, for pieces of information an entry may lack.
	has_gene=0
	has_loc=0
	has_pp=0
	has_ex_strt=0
	has_ex_end = 0
	has_aa = 0 
	# Empty list so that multiple exon entries can be collected.
	exon_start_list = []
Пример #14
0
def process_genbank(gb_filename):
    """
    Extract submission metadata and the sequence from the first record of a
    GenBank file.

    :param gb_filename: GenBank filename
    :return: tuple ``(info, seq_record)``; ``info`` is a dict of metadata
        fields (blank where the record has no value) and ``seq_record`` is a
        SeqRecord built from the record's sequence with the id
        ``<version>|<virus name>``
    :raises StopIteration: if the file contains no GenBank record
    :raises AssertionError: if the first feature is not the 'source' feature
    """
    info = {'File': None,
            'Virus name': None,
            'Accession ID': None,
            'Type': '',
            'Passage details/history': '',
            'Collection date': '',
            'Location': '',
            'Host': '',
            'Additional location information': '',
            'Gender': '',
            'Patient age' : '',
            'Patient status': '',
            'Specimen source': '',
            'Additional host information': '',
            'Outbreak': '',
            'Last vaccinated': '',
            'Treatment': '',
            'Sequencing technology': '',
            'Assembly method': '',
            'Coverage': '',
            'Comment': '',
            'Originating lab': '',
            'Address': '',
            'Sample ID given by the sample provider': '',
            'Submitting lab': '',
            'Address 1': '',
            'Sample ID given by the submitting laboratory': '',
            'Authors': '',
            'Submitter': '',
            'Submission Date': '',
            'Address 2': '',
            'Continent': '',
            'Country': ''}

    # Only the first record is used. Reading it inside a context manager
    # closes the file, which the original left open.
    with open(gb_filename) as handle:
        r = next(GenBank.parse(handle))

    # Example of the first feature and its qualifiers:
    #   r.features[0] -> Feature(key='source', location='1..29688')
    #   r.features[0].qualifiers ->
    #     [Qualifier(key='/organism=', value='"Severe acute respiratory syndrome coronavirus 2"'),
    #      Qualifier(key='/mol_type=', value='"genomic RNA"'),
    #      Qualifier(key='/isolate=', value='"SARS-CoV-2/WA-UW297/human/2020/USA"'),
    #      Qualifier(key='/host=', value='"Homo sapiens"'),
    #      Qualifier(key='/db_xref=', value='"taxon:2697049"'),
    #      Qualifier(key='/country=', value='"USA: WA"'),
    #      Qualifier(key='/collection_date=', value='"2020-03-15"')]

    info['File'] = gb_filename
    info['Accession ID'] = r.version
    info['Virus name'] = r.locus  # rewritten below if "/isolate=" is available
    info['Submission Date'] = r.date

    assert r.features[0].key=='source'
    q = dict((x.key, x.value) for x in r.features[0].qualifiers)
    if '/collection_date=' in q:
        info['Collection date'] = q['/collection_date=']
    if '/host=' in q:
        host, gender, age = process_host(q['/host='])
        info['Host'] = host
        info['Gender'] = gender
        # The template field is 'Patient age'; the original only wrote the
        # extra key 'Age', which is kept for callers that read it.
        info['Patient age'] = age
        info['Age'] = age
    if '/country=' in q:
        location, continent, country = process_country(q['/country='])
        info['Location'] = location
        info['Continent'] = continent
        info['Country'] = country
    if '/isolation_source=' in q:
        info['Specimen source'] = q['/isolation_source=']
    if '/isolate=' in q:
        info['Virus name'] = q['/isolate=']

    # Example of print(r.references[0]):
    #   REFERENCE   1  (bases 1 to 29838)
    #     AUTHORS   Chan,J.F.-W., Yuan,S., Kok,K.H., To,K.K.-W., Chu,H., Yang,J.,
    #               ...
    #     TITLE     A familial cluster of pneumonia associated with the 2019 novel
    #               coronavirus indicating person-to-person transmission: a study of a
    #               family cluster
    #     JOURNAL   Lancet (2020) In press
    #     REMARK    Publication Status: Available-Online prior to print
    # A record without references keeps the blank defaults instead of raising
    # IndexError.
    if r.references:
        info['Authors'] = r.references[0].authors
        info['Address'] = r.references[0].journal
    if len(r.references)>1:
        info['Submitter'] = r.references[1].authors
        info['Address 2'] = r.references[1].journal

    # Example of r.structured_comment:
    #   OrderedDict([('Assembly-Data',
    #                 OrderedDict([('Assembly Method', 'minimap2 v. 14 Jan 2020'),
    #                              ('Sequencing Technology', 'Nanopore')]))])
    # Best effort: the attribute may be absent or have an unexpected shape.
    # Only those failures are tolerated, where the original swallowed every
    # exception.
    try:
        if 'Assembly-Data' in r.structured_comment:
            x = r.structured_comment['Assembly-Data']
            if 'Sequencing Technology' in x:
                info['Sequencing technology'] = x['Sequencing Technology']
            if 'Assembly Method' in x:
                info['Assembly method'] = x['Assembly Method']
    except (AttributeError, KeyError, TypeError):
        pass

    # Example of r.taxonomy:
    #   ['Viruses', 'Riboviria', 'Nidovirales', 'Cornidovirineae',
    #    'Coronaviridae', 'Orthocoronavirinae', 'Betacoronavirus',
    #    'Sarbecovirus']
    # The second-to-last level is used as the type; shorter lineages leave
    # the field blank instead of raising IndexError.
    if len(r.taxonomy) >= 2:
        info['Type'] = r.taxonomy[-2]

    _id = r.version
    if info['Virus name']!='':
        _id += '|' + info['Virus name']
    else:
        _id += '|' + r.definition
    return info, SeqRecord(r.sequence, id=_id)
Пример #15
0
def get_ncbi_genomes(args):
    '''
    Main function. Downloads genomes from NCBI and adds information to the
    metadata table: sequence length, number of N's, whether the record was
    removed from NCBI, completeness, and taxonomy columns from ICTV.

    Steps, in order:
      1. create ``<directory>database_<YYYY-MM-DD>`` and copy the accession,
         metadata and ICTV files into it;
      2. fetch the genomes as FASTA through Entrez.efetch and append them to
         the output file;
      3. derive length, N count and completeness from the FASTA headers and
         sequences;
      4. request the NCBI nuccore page of every accession and count the
         occurrences of 'Record removed';
      5. for genomes marked 'partial', download the GenBank record and look
         for 'full length' in its comment;
      6. merge the ICTV table on RefSeq and GenBank accession, collapse the
         duplicated columns and rows;
      7. drop sequences/rows at or below 3000 and save the metadata as CSV.

    Parameters:
        args: object with the attributes input, metadata, directory, output,
            email, ictv and genbank (read below).

    Returns:
        None; results are written into the new database directory.
    '''
    ####### Name variables for ease of use ######
    accessions = args.input
    metadata = args.metadata
    directory = args.directory
    filename = args.output
    check(accessions, metadata)
    Entrez.email = args.email
    ictv = args.ictv
    genbank = args.genbank

    ##############################################

    list_of_lists = acc_list(args.input)  #Load split accession items
    # NOTE(review): os.path.join receives one pre-concatenated string, so
    # `directory` has to end with a path separator for the folder to be
    # created inside it.
    folder = os.path.join(
        directory + 'database_' + datetime.now().strftime('%Y-%m-%d')
    )  #create PATH to new folder in path with all databases
    os.makedirs(folder)  #create this folder
    old_directory = os.getcwd()  #check your current directory
    new_directory = folder
    # NOTE(review): the loop is driven by the extensions of the files in the
    # current directory, but always copies the files named by the arguments.
    # destination_csv and destination_ictv (used further down) are only bound
    # when a matching .csv / .xlsx file exists here.
    for file in os.listdir(old_directory):  #copy files to new directory
        if file.endswith(".acc"):
            source_acc = os.path.join(old_directory, accessions)
            destination_acc = os.path.join(new_directory, accessions)
            shutil.copyfile(source_acc, destination_acc)
        elif file.endswith(".csv"):
            source_csv = os.path.join(old_directory, metadata)
            destination_csv = os.path.join(new_directory, metadata)
            shutil.copyfile(source_csv, destination_csv)
        elif file.endswith(".xlsx"):
            source_ictv = os.path.join(old_directory, ictv)
            destination_ictv = os.path.join(new_directory, ictv)
            shutil.copyfile(source_ictv, destination_ictv)
        else:
            continue
    print(
        '************************\nMoved files to new directory\n************************'
    )

    for elem in list_of_lists:  #for every 7000 genomes
        net_handle = Entrez.efetch(db="nucleotide",
                                   id=elem,
                                   rettype="fasta",
                                   retmode="text")
        out_handle = open(os.path.join(new_directory, filename),
                          "a+")  #open new file
        out_handle.write(net_handle.read())
        out_handle.close()
        net_handle.close()
        print("Part of genomes saved")
    print('All genomes saved\n************************')

    print('Starting processing \n************************')

    metadata_df = pd.read_csv(os.path.join(
        new_directory, metadata))  #read dataframe with metadata

    metadata_accession = metadata_df['Accession'].tolist(
    )  #save all rows from metadata column "Accession" to list

    headers, seqs = fastaParser(os.path.join(
        new_directory,
        filename))  #read headers and sequences from multigenome fasta file
    flat_seqs = [item for sublist in seqs for item in sublist]
    genomes = list(zip(headers, flat_seqs))
    ncount = [seq.count('N') for header, seq in genomes
              ]  #count how many "N"'s in every sequence
    # NOTE(review): the accession lists below are extracted from the string
    # form of the whole header list and later zipped with per-genome lists;
    # this assumes exactly one match per header, in order -- confirm.
    accession = re.findall(r'[A-Z]+\w?\d{5,8}\.\d',
                           str(headers))  #find accessions ids in headers
    complete = [item[0] for item in genomes if 'complete' in item[0]
                ]  #make list with headers if "complete" is in element
    accession_complete = re.findall(
        r'[A-Z]+\w?\d{5,8}\.\d',
        str(complete))  #find accessions ids in headers
    partial = [item[0] for item in genomes if 'partial' in item[0]
               ]  #create list with headers if "partial" is in element
    accession_partial = re.findall(
        r'[A-Z]+\w?\d{5,8}\.\d', str(partial))  #find accessions ids in headers
    length = [len(seq) for header, seq in genomes
              ]  #make a list with length of sequences

    #### DICTIONARIES -> They'll be add to metadata dataframe
    partial_d = {k: 'partial' for k in accession_partial}
    complete_d = {k: 'complete' for k in accession_complete}
    length_d = dict(zip(accession, length))
    ncount_d = dict(zip(accession, ncount))
    # 'partial' is applied last, so it wins for an accession found in both.
    updated_completness = complete_d.copy()
    updated_completness.update(partial_d)

    print('************************\nChecking all removed genomes')

    url_list = []  #empty list to contain urls

    for elem in metadata_accession:  #for every accession record
        url = "https://www.ncbi.nlm.nih.gov/nuccore/" + elem  #create path to website
        url_list.append(url)  #add url path to list

    the_word = 'Record removed'
    count_list = [
    ]  #empty list to contain information if record was removed from the NCBI site

    # One HTTP request per accession; the page content is lower-cased before
    # the (lower-cased) search text is counted.
    for url in url_list:
        r = requests.get(url, allow_redirects=False)
        soup = BeautifulSoup(r.content.lower(), 'lxml')
        words = soup.find_all(text=lambda text: text and the_word.lower() in
                              text)  #find string from the_word value
        count = len(
            words)  #how many strings from the_word was on the NCBI site
        count_list.append(count)  #add count value to list
        print('\nUrl: {}\ncontains {} of words: {}'.format(
            url, count, the_word))
    #removed_df =  pd.DataFrame(list(zip(metadata_accession, count_list)), columns =['Accession', 'Removed'])
    #####REMOVED DICTIONARY
    removed_d = (
        dict(zip(metadata_accession, count_list))
    )  #dictionary with accessions and information if record was removed

    print('************************\nAdding information to metadata')
    ############ ADD ALL DICTIONARIES TO METADATA
    metadata_df['Nuc_length'] = metadata_df['Accession'].map(length_d)
    metadata_df['Ncount'] = metadata_df['Accession'].map(ncount_d)
    metadata_df['Removed'] = metadata_df['Accession'].map(removed_d)
    metadata_df['Completness'] = metadata_df['Accession'].map(
        updated_completness)

    partial_df = metadata_df.loc[
        metadata_df['Completness'] ==
        'partial']  #check if there is any partial genome in data
    partial_list = partial_df['Accession'].tolist()

    print(
        '************************\nDownload genbank file and change information about completness'
    )

    list_of_lists = partial_list

    ##### IF THERE IS ANY "PARTIAL" GENOME DOWNLOAD GENBANK FILE AND CHECK IF IT'S GOT "full length" in comment section.
    for elem in list_of_lists:
        if len(elem) > 0:
            net_handle = Entrez.efetch(db="nucleotide",
                                       id=elem,
                                       rettype="gb",
                                       retmode="text")
            out_handle = open(os.path.join(new_directory, genbank), "a+")
            out_handle.write(net_handle.read())
            out_handle.close()
            net_handle.close()
            print(elem, "Saved")

            # The GenBank file is appended to above, so every pass re-parses
            # all records downloaded so far.
            with open(os.path.join(new_directory, genbank)) as handle:
                for record in GenBank.parse(handle):
                    if 'full length' in record.comment:
                        print(record.accession, "OK")
                        # NOTE(review): this replaces every 'partial' value in
                        # the column, not only the row of this record, and the
                        # loop over the columns repeats the same statement --
                        # confirm the intent.
                        for col in metadata_df:
                            metadata_df['Completness'] = metadata_df[
                                'Completness'].replace('partial', 'complete')
        else:
            continue

    print("Adding ICTV to metadata")

    metadata_df['accession_underscore'] = [
        x.split('.')[0] for x in metadata_df['Accession']
    ]  #make a list of accession without version
    metadata_df['accession_underscore'] = metadata_df[
        'accession_underscore'].astype(str)
    taxa = pd.ExcelFile(destination_ictv)  #read Excel file from the ICTV site
    df = taxa.parse("VMRb36")
    new_df = df.loc[
        df['Host Source'] ==
        'bacteria']  #create dataframe only with data where Host_source is bacteria
    ictv_df = new_df[[
        'Realm', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus',
        'Species', 'Genome composition', 'Host Source',
        'Virus GENBANK accession', 'Virus REFSEQ accession'
    ]].copy()  #this columns will be add to metadata
    ictv_df = ictv_df.rename(
        columns={
            'Species': 'Species_ictv',
            'Family': 'Family_ictv',
            'Genus': 'Genus_ictv',
            'Realm': 'Realm_ictv',
            'Kingdom': 'Kingdom_ictv',
            'Phylum': 'Phylum_ictv',
            'Class': 'Class_ictv',
            'Order': 'Order_ictv',
            'Genome composition': 'Genome_composition_ictv',
            'Host Source': 'Host_source_ictv',
            'Virus REFSEQ accession': 'Virus_REFSEQ_accession',
            'Virus GENBANK accession': 'Virus_GENBANK_accession'
        })  #rename columns to be sure later if it's ictv data
    #### merge dataframes to add ictv dataframe
    # The ICTV table is merged twice (by RefSeq, then by GenBank accession);
    # the second merge gives the ICTV columns the suffixes _x and _y.
    df1 = metadata_df.merge(ictv_df,
                            left_on='accession_underscore',
                            right_on='Virus_REFSEQ_accession',
                            how='left')
    metadata_df_all = df1.merge(ictv_df,
                                left_on='accession_underscore',
                                right_on='Virus_GENBANK_accession',
                                how='left')
    ##############################################################################################################
    ############## extract information from columns to have all ICTV data in one
    # For each ICTV column keep the _x value and fill its gaps from _y.
    metadata_df_all.Kingdom_ictv_x = metadata_df_all.Kingdom_ictv_x.combine_first(
        metadata_df_all.Kingdom_ictv_y)
    metadata_df_all.Phylum_ictv_x = metadata_df_all.Phylum_ictv_x.combine_first(
        metadata_df_all.Phylum_ictv_y)
    metadata_df_all.Class_ictv_x = metadata_df_all.Class_ictv_x.combine_first(
        metadata_df_all.Class_ictv_y)
    metadata_df_all.Order_ictv_x = metadata_df_all.Order_ictv_x.combine_first(
        metadata_df_all.Order_ictv_y)
    metadata_df_all.Family_ictv_x = metadata_df_all.Family_ictv_x.combine_first(
        metadata_df_all.Family_ictv_y)
    metadata_df_all['Genome_composition_ictv_x'] = metadata_df_all[
        'Genome_composition_ictv_x'].combine_first(
            metadata_df_all['Genome_composition_ictv_y'])
    metadata_df_all['Host_source_ictv_x'] = metadata_df_all[
        'Host_source_ictv_x'].combine_first(
            metadata_df_all['Host_source_ictv_y'])
    metadata_df_all['Virus_GENBANK_accession_x'] = metadata_df_all[
        'Virus_GENBANK_accession_x'].combine_first(
            metadata_df_all['Virus_GENBANK_accession_y'])
    metadata_df_all['Virus_REFSEQ_accession_x'] = metadata_df_all[
        'Virus_REFSEQ_accession_x'].combine_first(
            metadata_df_all['Virus_REFSEQ_accession_y'])
    metadata_df_all.Realm_ictv_x = metadata_df_all.Realm_ictv_x.combine_first(
        metadata_df_all.Realm_ictv_y)
    metadata_df_all.Genus_ictv_x = metadata_df_all.Genus_ictv_x.combine_first(
        metadata_df_all.Genus_ictv_y)
    metadata_df_all.Species_ictv_x = metadata_df_all.Species_ictv_x.combine_first(
        metadata_df_all.Species_ictv_y)
    metadata_df_all = metadata_df_all.rename(columns={
        'Species': 'Species_ncbi',
        'Family': 'Family_ncbi',
        'Genus': 'Genus_ncbi'
    })

    metadata_df_all.drop([
        'Realm_ictv_y', 'Kingdom_ictv_y', 'Phylum_ictv_y', 'Class_ictv_y',
        'Order_ictv_y', 'Genus_ictv_y', 'Genome_composition_ictv_y',
        'Host_source_ictv_y', 'Virus_GENBANK_accession_y',
        'Virus_REFSEQ_accession_y', 'Family_ictv_y', 'Species_ictv_y'
    ],
                         axis=1,
                         inplace=True)  #drop columns with "_y" suffixes
    # NOTE(review): rstrip('_x') removes any trailing run of the characters
    # '_' and 'x', not the literal suffix "_x"; other column names ending in
    # 'x' or '_' are shortened as well.
    metadata_df_all.columns = metadata_df_all.columns.str.rstrip(
        '_x')  #drop suffix "_x"
    ####### IN ICTV THERE ARE SOME DUPLICATES
    df4 = metadata_df_all.groupby(['Accession'])  #group metadata dataframe
    dictio = {}  #empty dictionary to contain information about duplicates
    for x, y in df4:
        if len(y['Genus_ictv'].unique()
               ) > 1:  #some duplicates have different Genus_ictv values
            y['Genus_ictv'] = np.where(
                (len(y['Genus_ictv'].unique()) > 1),
                y['Genus_ictv'].unique()[0] + '/' +
                y['Genus_ictv'].unique()[1], y['Genus_ictv']
            )  #change this duplicates with adding both information into one row
            accession = list(y['Accession'])
            genus = list(y['Genus_ictv'])
            dictio = dict(zip(accession, genus))
        # NOTE(review): this loop runs for every group and reuses the
        # `dictio` left by the most recent group with conflicting genera; it
        # passes dictio.values() (a view, not a scalar) to np.where -- confirm
        # the result is the intended combined genus string.
        for key in dictio.keys():
            metadata_df_all["Genus_ictv"] = np.where(
                (metadata_df_all["Accession"] == key), dictio.values(),
                metadata_df_all["Genus_ictv"]
            )  #change duplicated Genus_ictv value in metadata dataframe
    metadata_df_all = metadata_df_all.drop_duplicates(
        subset='Accession',
        keep='first')  #drop duplicates and save first values
    ############### DELETE SHORT SEQUENCES FROM MULTIGENOME FASTA FILE AND FROM METADATA DATAFRAME
    long_sequences = []  # Setup an empty list
    for record in SeqIO.parse(os.path.join(new_directory, filename), "fasta"):
        if len(record.seq) > 3000:
            # Add this record to our list
            long_sequences.append(record)
    # The FASTA file is overwritten with the records collected above.
    SeqIO.write(long_sequences, os.path.join(new_directory, filename), "fasta")
    # NOTE(review): the filter reads a 'Length' column that this function does
    # not create (it adds 'Nuc_length'); presumably it comes from the input
    # metadata CSV -- confirm.
    metadata_df_all = metadata_df_all.drop(
        metadata_df_all.loc[metadata_df_all['Length'] < 3000].index,
        inplace=False)
    metadata_df_all = metadata_df_all.reset_index()

    metadata_df_all.to_csv(destination_csv)  #save metadata
Пример #16
0
def fetch_refseq(path, strain_lst, species_to_search='Mycoplasma genitalium'):
    """Download complete RefSeq GenBank files for one species from NCBI.

    Steps performed:
      1. download the bacterial RefSeq ``assembly_summary.txt`` into ``path``;
      2. for every summary row whose column 7 contains ``species_to_search``
         and whose column 11 contains ``'Complete'``, write
         ``<column 19>/*_genomic.gbff.gz`` to ``<path>downloadlink.txt``;
      3. download those links into ``<path>input_GenBank/`` and gunzip them;
      4. rename each ``*gbff*`` file to ``<first accession of its first
         record>.gbk``.

    The external commands ``wget`` and ``gunzip`` are run through the shell.

    Parameters:
        path: directory prefix. It is joined by plain string concatenation,
            so it has to end with a path separator (an empty string means
            the current directory).
        strain_lst: not used by this function; kept so existing callers
            keep working.
        species_to_search: substring matched against column 7 of the
            summary table (the species name, per the original comment).

    Returns:
        None. Results are the files written below ``path``.
    """
    import os, sys, time, glob, csv
    from Bio import GenBank
    from sf_miscellaneous import write_pickle

    ## fetch the newest refseq assembly_summary file
    # Store the summary under `path` and read it back from the same place.
    # Previously only wget's (empty) stdout was redirected into `path`, while
    # the real download landed in, and was read from, the current directory.
    summary_path = '%sassembly_summary.txt' % path
    os.system(
        'wget -c ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt -P %s'
        % (path or '.'))

    # Text mode keeps the csv module and the str writes working on Python 3.
    with open(summary_path, 'r') as csvfile, \
            open(path + 'downloadlink.txt', 'w') as output:
        csv_reader = csv.reader(csvfile, delimiter='\t')
        next(csv_reader, None)  # skip the first line; tolerate an empty file
        for icsv_line in csv_reader:
            # species name and complete
            if species_to_search in icsv_line[7] and 'Complete' in icsv_line[11]:
                output.write('%s/%s\n' % (icsv_line[19], '*_genomic.gbff.gz'))

    gbk_path = '%sinput_GenBank/' % path
    os.system('wget -c --input %sdownloadlink.txt -P %s' % (path, gbk_path))
    os.system('gunzip %s*.gz' % gbk_path)

    # glob.glob takes a snapshot of the directory, so renaming files while
    # looping cannot disturb the iteration.
    for each_gbk_path in glob.glob('%s*gbff*' % gbk_path):
        with open(each_gbk_path) as gbk_file:
            # Only the first record is needed to name the file.
            record = next(iter(GenBank.parse(gbk_file)), None)
        if record is None:
            # Nothing parsed: leave the file alone instead of failing on an
            # unbound `record`.
            print('No GenBank record found in %s' % each_gbk_path)
            continue
        print('%s %s' % (each_gbk_path, record.accession[0]))
        # os.rename avoids a shell round-trip and unquoted file names.
        os.rename(each_gbk_path,
                  '%s%s.gbk' % (gbk_path, record.accession[0]))

    # Legacy post-processing, deliberately disabled with `if 0`; kept for
    # reference only and never executed.
    if 0:
        os.chdir(path)
        species = glob.glob('*txt')[0].split('_list.')[0]
        os.system('rm *txt; rm *sh')
        os.system('gunzip *')
        while len(glob.glob('*.gz')) != 0:
            time.sleep(5)
        # rename gbk file
        for each_gbk_path in glob.iglob('*gbff*'):
            with open(each_gbk_path) as handle:
                print(handle)
                for record in GenBank.parse(handle):
                    print(each_gbk_path, record.accession[0])
                    break
            os.system('mv %s %s' % (each_gbk_path, record.accession[0]))
        for each_gbk_path in glob.iglob('*'):
            os.system('mv %s %s.gbk' % (each_gbk_path, each_gbk_path))
            #os.system('mv %s %s'%(each_gbk_path, each_gbk_path.split('.')[0]))
        os.system('ls *gbk > %s-RefSeq.txt; sed -i -- "s/.gbk//g" *txt' %
                  species)
        os.system('wc -l *txt ; ls *gbk |wc -l')
        path = '../../pan-genome-analysis/'
        os.system(
            'cp %srun-TestSet-template.sh %srun-%s.sh; sed -i -- "s/TestSet/%s/g" %srun-%s.sh'
            % (path, path, species, species, path, species))
        os.system('mv ../%s/ %sdata/' % (species, path))
Пример #17
0
def get_utr_coord(input_file):
    '''
    Finds the coordinates of the 5'NTR, the coding region and the 3'NTR for
    every record of a file with nucleotide sequences in GenBank format.

    The coding region runs from the start of the first CDS feature (shifted
    by its ``codon_start`` qualifier, when present) to the end of the last
    CDS feature. Everything before it is reported as 5'NTR, everything after
    it up to the sequence length as 3'NTR. A region that does not exist is
    reported as ``0,0``. Coordinates are the numbers found in the GenBank
    location strings.

    Input:
        input_file - path to file with nt seqs in GenBank format
    Output:
        writes the coordinates as CSV (header
        ``acc,st_5utr,e_5utr,st_cds,e_cds,st_3utr,e_3utr``) to
        ``<input_file without extension>_coord.txt``; records without a CDS
        feature are skipped and reported on stdout. Returns None.
    '''
    out_file_name = os.path.splitext(input_file)[0] + '_coord.txt'

    # A "start..end" segment. The optional '<' / '>' partial markers are part
    # of the match so that partial segments inside join(...) are not skipped.
    REG_JOIN = re.compile(r'[<>]?\d+\.\.[<>]?\d+')
    COD_START = re.compile(r'codon_start=[123]')

    # Both files are closed even if parsing fails half-way through.
    with open(out_file_name, 'w') as out_file, open(input_file) as handle:
        out_file.write('acc,st_5utr,e_5utr,st_cds,e_cds,st_3utr,e_3utr\n')

        for seq in GenBank.parse(handle):
            # accession number of seq
            acc = seq.accession[0]
            # length of sequence
            seq_length = int(seq.size)

            # TODO: also take explicit 5'UTR / 3'UTR features into account
            cds_features = [
                feature for feature in seq.features if feature.key == 'CDS'
            ]
            if not cds_features:
                print('No CDS for {}'.format(acc))
                continue

            first_cds = cds_features[0]
            last_cds = cds_features[-1]

            # codon_start is 1..3 in the text form of the feature; convert
            # it to a 0..2 offset. A missing qualifier means no offset.
            cs = COD_START.search(str(first_cds))
            codon_start = int(cs.group()[-1]) - 1 if cs else 0

            # Start of the coding region: first segment of the first CDS.
            if first_cds.location.startswith('join'):
                first_range = REG_JOIN.findall(first_cds.location)[0]
            else:
                first_range = first_cds.location
            st1, _ = first_range.split('..')
            st1 = int(st1.strip('<').strip('>'))

            # End of the coding region: LAST segment of the last CDS. The
            # index must be -1, not 1, so joins with any number of segments
            # are handled.
            if last_cds.location.startswith('join'):
                last_range = REG_JOIN.findall(last_cds.location)[-1]
            else:
                last_range = last_cds.location
            _, e2 = last_range.split('..')
            e2 = int(e2.strip('<').strip('>'))

            start_c = st1 + codon_start
            end_c = e2

            start_5utr = 0
            end_5utr = 0
            start_3utr = 0
            end_3utr = 0

            # NOTE(review): codon_start is already 0-based here, so
            # `codon_start > 1` is only true for codon_start=3 in the file.
            # `> 0` may have been intended; behavior kept as is - confirm.
            if st1 > 1 or codon_start > 1:
                start_5utr = 1
                end_5utr = st1 - 1 + codon_start
            if e2 < seq_length:
                start_3utr = e2 + 1
                end_3utr = seq_length

            l_coord = [str(x) for x in (acc,
                                        start_5utr,
                                        end_5utr,
                                        start_c,
                                        end_c,
                                        start_3utr,
                                        end_3utr)]
            out_file.write(','.join(l_coord) + '\n')