def fetch_refseq(path, strain_lst, species_to_search='Mycoplasma genitalium'):
    """ download NCBI refseq GenBank file from strain list """
    import os, sys, time, glob, csv
    from Bio import GenBank
    from sf_miscellaneous import write_pickle

    ## fetch the newest refseq assembly_summary file
    os.system('wget -c ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt -O %sassembly_summary.txt' % path)
    ## collect download links for complete genome assemblies of the requested species
    with open('%sassembly_summary.txt' % path, 'r') as csvfile:
        outfile = 'downloadlink.txt'
        with open(path + outfile, 'w') as output:
            csv_reader = csv.reader(csvfile, delimiter='\t')
            headers = next(csv_reader)
            for icsv_line in csv_reader:
                # species name and complete genome assembly
                if species_to_search in icsv_line[7] and 'Complete' in icsv_line[11]:
                    #os.system('wget -c %s/%s'%(icsv_line[19],'*_genomic.gbff.gz -P ./Refseq/Mt'))
                    output.write('%s/%s\n' % (icsv_line[19], '*_genomic.gbff.gz'))

    ## download and unpack the GenBank files
    gbk_path = '%sinput_GenBank/' % path
    command_download = 'wget -c --input %sdownloadlink.txt -P %s' % (path, gbk_path)
    os.system(command_download)
    command_gunzip = 'gunzip %s*.gz' % gbk_path
    os.system(command_gunzip)

    ## rename each downloaded file after the accession of its first record
    for each_gbk_path in glob.iglob('%s*gbff*' % gbk_path):
        with open(each_gbk_path) as gbk_file:
            for record in GenBank.parse(gbk_file):
                print(each_gbk_path, record.accession[0])
                break
        os.system('mv %s %s%s.gbk' % (each_gbk_path, gbk_path, record.accession[0]))

    if 0:  # disabled legacy post-processing
        os.chdir(path)
        species = glob.glob('*txt')[0].split('_list.')[0]
        os.system('rm *txt; rm *sh')
        os.system('gunzip *')
        while len(glob.glob('*.gz')) != 0:
            time.sleep(5)
        # rename gbk file
        for each_gbk_path in glob.iglob('*gbff*'):
            with open(each_gbk_path) as handle:
                print(handle)
                for record in GenBank.parse(handle):
                    print(each_gbk_path, record.accession[0])
                    break
            os.system('mv %s %s' % (each_gbk_path, record.accession[0]))
        for each_gbk_path in glob.iglob('*'):
            os.system('mv %s %s.gbk' % (each_gbk_path, each_gbk_path))
            #os.system('mv %s %s'%(each_gbk_path, each_gbk_path.split('.')[0]))
        os.system('ls *gbk > %s-RefSeq.txt; sed -i -- "s/.gbk//g" *txt' % species)
        os.system('wc -l *txt ; ls *gbk |wc -l')
        path = '../../pan-genome-analysis/'
        os.system('cp %srun-TestSet-template.sh %srun-%s.sh; sed -i -- "s/TestSet/%s/g" %srun-%s.sh'
                  % (path, path, species, species, path, species))
        os.system('mv ../%s/ %sdata/' % (species, path))
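# A minimal standalone sketch (not part of fetch_refseq above) of the same
# assembly_summary.txt filtering step, using urllib + csv instead of shelling out
# to wget. It assumes the standard RefSeq summary layout also used above
# (column 8 = organism_name, column 12 = assembly_level, column 20 = ftp_path);
# the https mirror URL, file names and species are illustrative assumptions.
import csv
import urllib.request

SUMMARY_URL = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt'
SPECIES = 'Mycoplasma genitalium'

urllib.request.urlretrieve(SUMMARY_URL, 'assembly_summary.txt')
with open('assembly_summary.txt') as summary, open('downloadlink.txt', 'w') as links:
    # skip the leading '##' comment line, then consume the column-header line
    rows = csv.reader((line for line in summary if not line.startswith('##')),
                      delimiter='\t')
    next(rows)
    for row in rows:
        if SPECIES in row[7] and 'Complete' in row[11]:
            # same wildcard pattern as the wget input file written above
            links.write('%s/%s\n' % (row[19], '*_genomic.gbff.gz'))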
def main():
    sorted_pos_tagged = {}
    db_handler = DatabaseHandler()
    # POS-tag the literals attached to whitelisted predicates in the SBH records
    for entry in os.scandir(shb_records):
        graph = rdflib.Graph()
        graph.load(entry.path)
        for s, p, o in graph:
            if p in predicate_whitelist and isinstance(o, rdflib.Literal):
                pos_tagged = mine_sentences(o)
                # group tokens by their POS tag
                for token, tag in pos_tagged:
                    sorted_pos_tagged.setdefault(tag, []).append(token)
    # Do the same for the GenBank records, converting each record to a graph first
    for entry in os.scandir(gbk_records):
        with open(entry.path) as handle:
            for record in GenBank.parse(handle):
                graph = db_handler._db_util.db_mapping_calls["genbank"].generalise_get_results(record)
                for s, p, o in graph:
                    if isinstance(o, rdflib.Literal):
                        pos_tagged = mine_sentences(o)
                        for token, tag in pos_tagged:
                            sorted_pos_tagged.setdefault(tag, []).append(token)
    # Write the de-duplicated tokens for each tag to the summary file
    with open("summary.txt", "a+") as f:
        for k, v in sorted_pos_tagged.items():
            v = list(set(v))
            print(k, v)
            print("\n")
            f.write(f'{k} - {"-".join(v)}\n')
def main():
    try:
        os.mkdir(OUTDIR)
    except OSError:
        print('Using existing directory %s' % OUTDIR)

    with open(INFILE) as handle:
        records = [r for r in GenBank.parse(handle)]

    # count records per PubMed ID, dropping records without one
    c = Counter([r.references[0].pubmed_id for r in records])
    c.pop('', None)
    pubs = c.keys()

    # group records by the PubMed ID of their first reference and write one FASTA per publication
    seqdict = {k: [r for r in records if r.references[0].pubmed_id == k] for k in pubs}
    for pub in pubs:
        qfas(seqdict[pub], OUTDIR + '/' + pub + '.fasta')
def load_samples(sequences):
    with open("data/genbank_sequences.gb") as handle:
        # Use biopython to parse the GenBank records
        for record in GenBank.parse(handle):
            # skip partial sequence records
            if 'partial' in record.definition:
                continue
            # For now the id for a sample will include a truncated version of the country
            # of origin, the date collected and the accession number of the record.
            accession = record.accession[0]
            id = accession
            source = findFeature(record, 'source')
            if source is not None:
                country = findItem(source, '/country=')
                col_date = findItem(source, '/collection_date=')
                if col_date is not None:
                    dt = dateutil.parser.parse(col_date)  # Time formatting is not consistent
                    norm_date = dt.strftime(r'%Y-%m-%d')
                    id = norm_date + '-' + id
                if country is not None:
                    country = country.replace(':', ' ')
                    id = country.split()[0][:7].strip() + '-' + id
            # For each CDS feature
            genes = findAllFeature(record, 'CDS')
            for gene in genes:
                # First figure out the gene / protein name. Try both the product tag and gene tag.
                # NOTE: We are not interested in the post-translation non-structural protein products.
                #       We just process the orf1ab gene that has all of them embedded in it.
                gene_name = None  # default when no /product= qualifier is present
                product = findItem(gene, '/product=')
                if product is not None:
                    gene_name = product.split()[0].upper()
                    # A few proteins have aliases, map them to the standard form.
                    if gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]
                if gene_name is None or gene_name not in validList:
                    gene_name = findItem(gene, '/gene=')
                    if gene_name is not None and gene_name in geneAlias:
                        gene_name = geneAlias[gene_name]
                if gene_name not in validList:
                    continue
                sequence = findItem(gene, '/translation=')
                if id is not None and sequence is not None:
                    if gene_name == 'ORF1AB':
                        loadOrf1AB(sequences, id, sequence)
                    else:
                        sequences[gene_name].append(id + '|' + sequence)
def _get_organella(self, gb_file):
    """Retrieve the organelle from the GenBank file, using the specific
    GenBank object, because SeqIO does not support this field
    """
    organella = {}
    with open(gb_file, "r") as gbh:
        for record in GenBank.parse(gbh):
            accession = record.version
            for q in record.features[0].qualifiers:
                if q.key == "/organelle=":
                    organelle = q.value.replace('"', '')
                    organella[accession] = organelle
    return organella
def parse_genebank_file(self):
    with open(self.gene_bank_file, "r") as input_handle:
        for record in GenBank.parse(input_handle):
            # print("Name: %s, %i" % (record.name, len(record.features)))
            print(record.features)
            print(record.accession)
            print("----")
            print(record.gi)
            print("----")
            self.Acc = record.accession[0]
            if self.GI is None or len(self.GI) == 0:
                self.GI = "NA"
            if self.Acc is None or len(self.Acc) == 0:
                self.Acc = "NA"
def get_record_list():
    """
    Parses through and generates a list of dictionaries containing all required
    information about records. Makes calls to other functions to format location
    and record data
    """
    with open(INPUT_PATH) as handle:
        record_list = []
        for gbk_record in GenBank.parse(handle):
            record_list.append({
                "locus": gbk_record.locus,
                "features": [feature for feature in gbk_record.features]
            })
    return format_record_list(record_list)
def readGeneBankFile(file):
    """
    Author: Marco Roelfes
    readGeneBankFile(file)
    file: GenBank file
    readGeneBankFile reads GenBank files into a dictionary
    (key: title(definition+accession+locus+source), value: sequence)
    returns dictionary with key: title and value: sequence
    """
    sequences = {}
    # parse the file twice in parallel: SeqIO yields the sequences as Seq objects,
    # GenBank yields the header fields; records are paired up by position
    with open(file) as seq_handle, open(file) as gb_handle:
        for gb_record, record in zip(SeqIO.parse(seq_handle, "genbank"),
                                     GenBank.parse(gb_handle)):
            # sequence of this record
            seq = gb_record.seq
            # build the FASTA-style title from the header fields
            title = (">" + record.definition + " accession: " + record.accession[0]
                     + " " + record.locus + " " + record.source)
            # map title to sequence
            sequences[title] = seq
    return sequences
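# A short usage sketch for readGeneBankFile above, assuming "example.gb" is a
# hypothetical multi-record GenBank file on disk; it prints each title and the
# length of the corresponding sequence.
if __name__ == "__main__":
    for title, seq in readGeneBankFile("example.gb").items():
        print(title)
        print(len(seq))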
def main(): """ Main function""" # Get command-line arguments args = get_args() # If any of the arguments is missing, fill in with default values if args.in_file is None: in_file = 'ecis-screen_summary.txt' else: in_file = args.in_file if args.out_file is None: out_file = 'cis_locus_genes.txt' else: out_file = args.out_file if args.gbk_dir is None: gbk_dir = 'genomes' else: gbk_dir = args.gbk_dir # read eCIS-screen output file cis_data = read_cis_summary(in_file) candidate_proteins = [] with open(out_file, 'w') as outfile: for assembly in cis_data: # Make GenBank file path gbk_file = os.path.join(gbk_dir, assembly, assembly + '_genomic.gbff.gz') print('Now reading', gbk_file) for seq_record in GenBank.parse(gzip.open(gbk_file, 'rt')): if seq_record.accession[0] not in cis_data[assembly]: continue print('Sequence found', seq_record.accession[0]) # Get list of genes neighbors = get_neighbors( assembly, seq_record, cis_data[assembly][seq_record.accession[0]]) # Write list of genes to the output file outfile.write('\n'.join(neighbors) + '\n\n')
# Write logs to STDOUT
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
rootLogger.addHandler(consoleHandler)

dat = []

# Download viral genomes from NCBI
for url in [
    "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.1.protein.gpff.gz",
    "ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/viral.2.protein.gpff.gz",
]:
    run_cmds(["wget", url])
    fp = url.split("/")[-1]
    with gzip.open(fp, "rt") as handle:
        for ix, record in enumerate(GenBank.parse(handle)):
            taxid = None
            product = None
            locus_tag = None
            coded_by = None
            genome = None
            genome_range = None
            for feature in record.features:
                for qualifier in feature.qualifiers:
                    if feature.key == "source":
                        if qualifier.key == "/db_xref=":
                            taxid = qualifier.value.strip('"')
                            taxid = taxid.replace("taxon:", "")
                    elif feature.key == "Protein":
                        if qualifier.key == "/product=":
            idnum = record['IdList'][1]
        else:
            idnum = record['IdList'][0]
        handle.close()

        # Retrieve the record from the Entrez protein database
        # using the ID number saved from the previous step
        # and save the result as a GenBank file
        handle2 = Entrez.efetch(db="protein", id=idnum, rettype="gb", retmode="text")

        # Parse through the GenBank file to find the data needed for the gene_info table
        # (version, length and full name of the gene) and store the result
        # in a new file so it can be inserted easily
        for record in GenBank.parse(handle2):
            version = record.version + "\t"
            length = record.size + "\t"
            fullinfo = (record._definition_line()).strip()
            fullinfo = fullinfo.replace("DEFINITION ", "").strip()
            fullinfo = re.sub(r"\[Chlamydia.*\n*.*].", "", fullinfo).strip()
            insert = locus + version + symbol + length + fullinfo + '\n'
            outFile.write(insert)
        handle2.close()

# Close the cursor, connection and outFile
cursor.close()
conn.close()
outFile.close()
def _read_file(fn):
    records = []
    with open(fn) as handle:
        for record in GenBank.parse(handle):
            records.append(record)
    return records
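# A short usage sketch for _read_file above; "plasmid.gb" is a hypothetical
# GenBank file name used only for illustration.
if __name__ == "__main__":
    for rec in _read_file("plasmid.gb"):
        print(rec.locus, rec.size)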
#!/usr/bin/python3

import sys
import Bio
from Bio import GenBank

# for the Restriction_Enzyme table
Enz_name = {'EcoRI': {"GAATTC"}, 'BamHI': {"GGATCC"}, 'BsuMI': {"CTCGAG"}}

# create parser
parser = GenBank.RecordParser()
record = parser.parse(open("chrom_CDS_8"))

# create empty list to put all information into
List = []
Insert = {}

for record in GenBank.parse(open("chrom_CDS_8")):
    """
    To extract each piece of information and insert values
    from each gene one at a time
    empty dictionary
    """
    # obtain accession and DNA sequence
    Insert['accession'] = record.accession
    Insert['seq'] = record.sequence

    # create flags for genes without information in entry
    has_gene = 0
    has_loc = 0
    has_pp = 0
    has_ex_strt = 0
    has_ex_end = 0
    has_aa = 0

    # create empty lists to allow for multiple entries of exons
    exon_start_list = []
def process_genbank(gb_filename):
    """
    Process GenBank file name

    :param gb_filename: GenBank filename
    :return: dict of metadata, SeqRecord
    """
    info = {'File': None, 'Virus name': None, 'Accession ID': None, 'Type': '',
            'Passage details/history': '', 'Collection date': '', 'Location': '',
            'Host': '', 'Additional location information': '', 'Gender': '',
            'Patient age': '', 'Patient status': '', 'Specimen source': '',
            'Additional host information': '', 'Outbreak': '', 'Last vaccinated': '',
            'Treatment': '', 'Sequencing technology': '', 'Assembly method': '',
            'Coverage': '', 'Comment': '', 'Originating lab': '', 'Address': '',
            'Sample ID given by the sample provider': '', 'Submitting lab': '',
            'Address 1': '', 'Sample ID given by the submitting laboratory': '',
            'Authors': '', 'Submitter': '', 'Submission Date': '', 'Address 2': '',
            'Continent': '', 'Country': ''}

    with open(gb_filename) as handle:
        r = next(GenBank.parse(handle))

    # In [55]: r.features[0]
    # Out[55]: Feature(key='source', location='1..29688')
    # In [56]: r.features[0].qualifiers
    # Out[56]:
    # [Qualifier(key='/organism=', value='"Severe acute respiratory syndrome coronavirus 2"'),
    #  Qualifier(key='/mol_type=', value='"genomic RNA"'),
    #  Qualifier(key='/isolate=', value='"SARS-CoV-2/WA-UW297/human/2020/USA"'),
    #  Qualifier(key='/host=', value='"Homo sapiens"'),
    #  Qualifier(key='/db_xref=', value='"taxon:2697049"'),
    #  Qualifier(key='/country=', value='"USA: WA"'),
    #  Qualifier(key='/collection_date=', value='"2020-03-15"')]
    info['File'] = gb_filename
    info['Accession ID'] = r.version
    info['Virus name'] = r.locus  # later rewritten if "/isolate=" is available
    info['Submission Date'] = r.date

    assert r.features[0].key == 'source'
    q = dict((x.key, x.value) for x in r.features[0].qualifiers)
    if '/collection_date=' in q:
        info['Collection date'] = q['/collection_date=']
    if '/host=' in q:
        host, gender, age = process_host(q['/host='])
        info['Host'] = host
        info['Gender'] = gender
        info['Patient age'] = age
    if '/country=' in q:
        location, continent, country = process_country(q['/country='])
        info['Location'] = location
        info['Continent'] = continent
        info['Country'] = country
    if '/isolation_source=' in q:
        info['Specimen source'] = q['/isolation_source=']
    if '/isolate=' in q:
        info['Virus name'] = q['/isolate=']

    # In [94]: print(r.references[0])
    # REFERENCE   1  (bases 1 to 29838)
    #   AUTHORS   Chan,J.F.-W., Yuan,S., Kok,K.H., To,K.K.-W., Chu,H., Yang,J.,
    #             Xing,F., Liu,J., Yip,C.C.-Y., Poon,R.W.-S., Tsai,H.W., Lo,S.K.-F.,
    #             Chan,K.H., Poon,V.K.-M., Chan,W.M., Ip,J.D., Cai,J.P.,
    #             Cheng,V.C.-C., Chen,H., Hui,C.K.-M. and Yuen,K.Y.
    #   TITLE     A familial cluster of pneumonia associated with the 2019 novel
    #             coronavirus indicating person-to-person transmission: a study of a
    #             family cluster
    #   JOURNAL   Lancet (2020) In press
    #   REMARK    Publication Status: Available-Online prior to print
    info['Authors'] = r.references[0].authors
    info['Address'] = r.references[0].journal
    if len(r.references) > 1:
        info['Submitter'] = r.references[1].authors
        info['Address 2'] = r.references[1].journal

    # In [113]: r.structured_comment
    # Out[113]:
    # OrderedDict([('Assembly-Data',
    #               OrderedDict([('Assembly Method', 'minimap2 v. 14 Jan 2020'),
    #                            ('Sequencing Technology', 'Nanopore')]))])
    try:
        if 'Assembly-Data' in r.structured_comment:
            x = r.structured_comment['Assembly-Data']
            if 'Sequencing Technology' in x:
                info['Sequencing technology'] = x['Sequencing Technology']
            if 'Assembly Method' in x:
                info['Assembly method'] = x['Assembly Method']
    except:
        pass

    # In [16]: r.taxonomy
    # Out[16]:
    # ['Viruses', 'Riboviria', 'Nidovirales', 'Cornidovirineae', 'Coronaviridae',
    #  'Orthocoronavirinae', 'Betacoronavirus', 'Sarbecovirus']
    info['Type'] = r.taxonomy[-2]

    _id = r.version
    if info['Virus name'] != '':
        _id += '|' + info['Virus name']
    else:
        _id += '|' + r.definition
    return info, SeqRecord(r.sequence, id=_id)
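# A minimal usage sketch for process_genbank above; "MT12345.gb" is a hypothetical
# single-record GenBank file used only for illustration.
if __name__ == '__main__':
    info, seq_record = process_genbank('MT12345.gb')
    print(info['Accession ID'], info['Collection date'], info['Country'])
    print(seq_record.id, len(seq_record))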
def get_ncbi_genomes(args):
    '''
    Main function. It takes genomes and adds information to the metadata, such as
    sequence length, completeness, and information from ICTV.
    '''
    ####### Name variables for ease of use ######
    accessions = args.input
    metadata = args.metadata
    directory = args.directory
    filename = args.output
    check(accessions, metadata)
    Entrez.email = args.email
    ictv = args.ictv
    genbank = args.genbank
    ##############################################
    list_of_lists = acc_list(args.input)    # Load split accession items

    # create PATH to a new folder that will hold all databases, and create it
    folder = os.path.join(directory + 'database_' + datetime.now().strftime('%Y-%m-%d'))
    os.makedirs(folder)
    old_directory = os.getcwd()    # current directory
    new_directory = folder
    for file in os.listdir(old_directory):    # copy input files to the new directory
        if file.endswith(".acc"):
            source_acc = os.path.join(old_directory, accessions)
            destination_acc = os.path.join(new_directory, accessions)
            shutil.copyfile(source_acc, destination_acc)
        elif file.endswith(".csv"):
            source_csv = os.path.join(old_directory, metadata)
            destination_csv = os.path.join(new_directory, metadata)
            shutil.copyfile(source_csv, destination_csv)
        elif file.endswith(".xlsx"):
            source_ictv = os.path.join(old_directory, ictv)
            destination_ictv = os.path.join(new_directory, ictv)
            shutil.copyfile(source_ictv, destination_ictv)
        else:
            continue
    print('************************\nMoved files to new directory\n************************')

    for elem in list_of_lists:    # for every chunk of up to 7000 accessions
        net_handle = Entrez.efetch(db="nucleotide", id=elem, rettype="fasta", retmode="text")
        out_handle = open(os.path.join(new_directory, filename), "a+")    # append to the multi-genome FASTA
        out_handle.write(net_handle.read())
        out_handle.close()
        net_handle.close()
        print("Part of genomes saved")
    print('All genomes saved\n************************')
    print('Starting processing \n************************')

    metadata_df = pd.read_csv(os.path.join(new_directory, metadata))    # read dataframe with metadata
    metadata_accession = metadata_df['Accession'].tolist()    # all values of the metadata column "Accession"
    headers, seqs = fastaParser(os.path.join(new_directory, filename))    # headers and sequences from the multi-genome FASTA file
    flat_seqs = [item for sublist in seqs for item in sublist]
    genomes = list(zip(headers, flat_seqs))
    ncount = [seq.count('N') for header, seq in genomes]    # how many "N"s in every sequence
    accession = re.findall(r'[A-Z]+\w?\d{5,8}\.\d', str(headers))    # accession ids in headers
    complete = [item[0] for item in genomes if 'complete' in item[0]]    # headers containing "complete"
    accession_complete = re.findall(r'[A-Z]+\w?\d{5,8}\.\d', str(complete))    # accession ids in those headers
    partial = [item[0] for item in genomes if 'partial' in item[0]]    # headers containing "partial"
    accession_partial = re.findall(r'[A-Z]+\w?\d{5,8}\.\d', str(partial))    # accession ids in those headers
    length = [len(seq) for header, seq in genomes]    # length of every sequence

    #### DICTIONARIES -> they will be added to the metadata dataframe
    partial_d = {k: 'partial' for k in accession_partial}
    complete_d = {k: 'complete' for k in accession_complete}
    length_d = dict(zip(accession, length))
    ncount_d = dict(zip(accession, ncount))
    updated_completness = complete_d.copy()
    updated_completness.update(partial_d)

    print('************************\nChecking all removed genomes')
    url_list = []    # urls of the NCBI record pages
    for elem in metadata_accession:    # for every accession record
        url = "https://www.ncbi.nlm.nih.gov/nuccore/" + elem
        url_list.append(url)
    the_word = 'Record removed'
    count_list = []    # how often the_word appears on each record page
    for url in url_list:
        r = requests.get(url, allow_redirects=False)
        soup = BeautifulSoup(r.content.lower(), 'lxml')
        words = soup.find_all(text=lambda text: text and the_word.lower() in text)
        count = len(words)
        count_list.append(count)
        print('\nUrl: {}\ncontains {} of words: {}'.format(url, count, the_word))
    #removed_df = pd.DataFrame(list(zip(metadata_accession, count_list)), columns=['Accession', 'Removed'])
    ##### REMOVED DICTIONARY
    removed_d = dict(zip(metadata_accession, count_list))    # accession -> number of "Record removed" hits

    print('************************\nAdding information to metadata')
    ############ ADD ALL DICTIONARIES TO METADATA
    metadata_df['Nuc_length'] = metadata_df['Accession'].map(length_d)
    metadata_df['Ncount'] = metadata_df['Accession'].map(ncount_d)
    metadata_df['Removed'] = metadata_df['Accession'].map(removed_d)
    metadata_df['Completness'] = metadata_df['Accession'].map(updated_completness)
    partial_df = metadata_df.loc[metadata_df['Completness'] == 'partial']    # any partial genomes in the data?
    partial_list = partial_df['Accession'].tolist()

    print('************************\nDownload genbank file and change information about completness')
    list_of_lists = partial_list
    ##### IF THERE IS ANY "PARTIAL" GENOME, DOWNLOAD ITS GENBANK FILE AND CHECK WHETHER IT HAS "full length" IN THE COMMENT SECTION
    for elem in list_of_lists:
        if len(elem) > 0:
            net_handle = Entrez.efetch(db="nucleotide", id=elem, rettype="gb", retmode="text")
            out_handle = open(os.path.join(new_directory, genbank), "a+")
            out_handle.write(net_handle.read())
            out_handle.close()
            net_handle.close()
            print(elem, "Saved")
            with open(os.path.join(new_directory, genbank)) as handle:
                for record in GenBank.parse(handle):
                    if 'full length' in record.comment:
                        print(record.accession, "OK")
                        for col in metadata_df:
                            metadata_df['Completness'] = metadata_df['Completness'].replace('partial', 'complete')
        else:
            continue

    print("Adding ICTV to metadata")
    metadata_df['accession_underscore'] = [x.split('.')[0] for x in metadata_df['Accession']]    # accessions without version
    metadata_df['accession_underscore'] = metadata_df['accession_underscore'].astype(str)
    taxa = pd.ExcelFile(destination_ictv)    # read the Excel file from the ICTV site
    df = taxa.parse("VMRb36")
    new_df = df.loc[df['Host Source'] == 'bacteria']    # keep only rows where Host Source is bacteria
    ictv_df = new_df[[
        'Realm', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus',
        'Species', 'Genome composition', 'Host Source',
        'Virus GENBANK accession', 'Virus REFSEQ accession'
    ]].copy()    # these columns will be added to the metadata
    ictv_df = ictv_df.rename(columns={
        'Species': 'Species_ictv',
        'Family': 'Family_ictv',
        'Genus': 'Genus_ictv',
        'Realm': 'Realm_ictv',
        'Kingdom': 'Kingdom_ictv',
        'Phylum': 'Phylum_ictv',
        'Class': 'Class_ictv',
        'Order': 'Order_ictv',
        'Genome composition': 'Genome_composition_ictv',
        'Host Source': 'Host_source_ictv',
        'Virus REFSEQ accession': 'Virus_REFSEQ_accession',
        'Virus GENBANK accession': 'Virus_GENBANK_accession'
    })    # rename columns so it is clear later that they come from ICTV

    #### merge dataframes to add the ictv dataframe
    df1 = metadata_df.merge(ictv_df, left_on='accession_underscore',
                            right_on='Virus_REFSEQ_accession', how='left')
    metadata_df_all = df1.merge(ictv_df, left_on='accession_underscore',
                                right_on='Virus_GENBANK_accession', how='left')
    ##############################################################################################################
    ############## extract information from columns to have all ICTV data in one
    metadata_df_all.Kingdom_ictv_x = metadata_df_all.Kingdom_ictv_x.combine_first(metadata_df_all.Kingdom_ictv_y)
    metadata_df_all.Phylum_ictv_x = metadata_df_all.Phylum_ictv_x.combine_first(metadata_df_all.Phylum_ictv_y)
    metadata_df_all.Class_ictv_x = metadata_df_all.Class_ictv_x.combine_first(metadata_df_all.Class_ictv_y)
    metadata_df_all.Order_ictv_x = metadata_df_all.Order_ictv_x.combine_first(metadata_df_all.Order_ictv_y)
    metadata_df_all.Family_ictv_x = metadata_df_all.Family_ictv_x.combine_first(metadata_df_all.Family_ictv_y)
    metadata_df_all['Genome_composition_ictv_x'] = metadata_df_all['Genome_composition_ictv_x'].combine_first(metadata_df_all['Genome_composition_ictv_y'])
    metadata_df_all['Host_source_ictv_x'] = metadata_df_all['Host_source_ictv_x'].combine_first(metadata_df_all['Host_source_ictv_y'])
    metadata_df_all['Virus_GENBANK_accession_x'] = metadata_df_all['Virus_GENBANK_accession_x'].combine_first(metadata_df_all['Virus_GENBANK_accession_y'])
    metadata_df_all['Virus_REFSEQ_accession_x'] = metadata_df_all['Virus_REFSEQ_accession_x'].combine_first(metadata_df_all['Virus_REFSEQ_accession_y'])
    metadata_df_all.Realm_ictv_x = metadata_df_all.Realm_ictv_x.combine_first(metadata_df_all.Realm_ictv_y)
    metadata_df_all.Genus_ictv_x = metadata_df_all.Genus_ictv_x.combine_first(metadata_df_all.Genus_ictv_y)
    metadata_df_all.Species_ictv_x = metadata_df_all.Species_ictv_x.combine_first(metadata_df_all.Species_ictv_y)
    metadata_df_all = metadata_df_all.rename(columns={
        'Species': 'Species_ncbi',
        'Family': 'Family_ncbi',
        'Genus': 'Genus_ncbi'
    })
    metadata_df_all.drop([
        'Realm_ictv_y', 'Kingdom_ictv_y', 'Phylum_ictv_y', 'Class_ictv_y',
        'Order_ictv_y', 'Genus_ictv_y', 'Genome_composition_ictv_y',
        'Host_source_ictv_y', 'Virus_GENBANK_accession_y',
        'Virus_REFSEQ_accession_y', 'Family_ictv_y', 'Species_ictv_y'
    ], axis=1, inplace=True)    # drop columns with "_y" suffixes
    metadata_df_all.columns = metadata_df_all.columns.str.rstrip('_x')    # drop suffix "_x"

    ####### IN ICTV THERE ARE SOME DUPLICATES
    df4 = metadata_df_all.groupby(['Accession'])    # group metadata dataframe by accession
    dictio = {}    # duplicated accessions and their merged Genus_ictv values
    for x, y in df4:
        if len(y['Genus_ictv'].unique()) > 1:    # some duplicates have different Genus_ictv values
            # merge both values into a single "genus1/genus2" entry
            y['Genus_ictv'] = np.where(
                (len(y['Genus_ictv'].unique()) > 1),
                y['Genus_ictv'].unique()[0] + '/' + y['Genus_ictv'].unique()[1],
                y['Genus_ictv'])
            accession = list(y['Accession'])
            genus = list(y['Genus_ictv'])
            dictio = dict(zip(accession, genus))
            for key in dictio.keys():
                # overwrite the duplicated Genus_ictv value in the metadata dataframe
                metadata_df_all["Genus_ictv"] = np.where(
                    (metadata_df_all["Accession"] == key),
                    dictio[key],
                    metadata_df_all["Genus_ictv"])
    metadata_df_all = metadata_df_all.drop_duplicates(subset='Accession', keep='first')    # drop duplicates, keep first values

    ############### DELETE SHORT SEQUENCES FROM THE MULTI-GENOME FASTA FILE AND FROM THE METADATA DATAFRAME
    long_sequences = []    # Setup an empty list
    for record in SeqIO.parse(os.path.join(new_directory, filename), "fasta"):
        if len(record.seq) > 3000:
            # Add this record to our list
            long_sequences.append(record)
    SeqIO.write(long_sequences, os.path.join(new_directory, filename), "fasta")
    metadata_df_all = metadata_df_all.drop(
        metadata_df_all.loc[metadata_df_all['Length'] < 3000].index, inplace=False)
    metadata_df_all = metadata_df_all.reset_index()
    metadata_df_all.to_csv(destination_csv)    # save metadata
def get_utr_coord(input_file):
    '''
    Finds the coordinates of 5'NTR and 3'NTR in a file with nucleotide sequences in GenBank format

    Input:
        input_file - path to file with nt seqs in GenBank format
    Output:
        writes coordinates to a new file in the same directory as input_file
    '''
    out_file_name = os.path.splitext(input_file)[0] + '_coord.txt'
    out_file = open(out_file_name, 'w')
    out_file.write('acc,st_5utr,e_5utr,st_cds,e_cds,st_3utr,e_3utr\n')
    REG_JOIN = re.compile(r'[\d]+\.\.[\d]+')
    COD_START = re.compile(r'codon_start=[123]')
    with open(input_file) as handle:
        for seq in GenBank.parse(handle):
            # string that will be written to file
            st = ''
            # accession number of seq
            acc = seq.accession[0]
            # length of sequence
            seq_length = int(seq.size)
            start_5utr = 0
            end_5utr = 0
            start_c = 0
            end_c = 0
            start_3utr = 0
            end_3utr = 0
            cds_features = list()
            for feature in seq.features:
                # TODO: add 5'UTR, 3'UTR features
                if feature.key == 'CDS':
                    cds_features.append(feature)
            if cds_features:
                cds_0_str = str(cds_features[0])
                cs = COD_START.search(cds_0_str)
                if cs:
                    codon_start = int(cs.group()[-1]) - 1
                else:
                    codon_start = 0
                # start coordinate of the first CDS
                if cds_features[0].location.startswith('join'):
                    loc = REG_JOIN.findall(cds_features[0].location)
                    st1, _ = loc[0].split('..')
                else:
                    st1, _ = cds_features[0].location.split('..')
                st1 = int(st1.strip('<').strip('>'))
                # end coordinate of the last CDS
                if cds_features[-1].location.startswith('join'):
                    loc = REG_JOIN.findall(cds_features[-1].location)
                    _, e2 = loc[1].split('..')
                else:
                    _, e2 = cds_features[-1].location.split('..')
                e2 = int(e2.strip('<').strip('>'))
                start_c = st1 + codon_start
                end_c = e2
                if st1 > 1:
                    start_5utr = 1
                    end_5utr = st1 - 1 + codon_start
                elif codon_start > 1:
                    start_5utr = 1
                    end_5utr = st1 - 1 + codon_start
                if e2 < seq_length:
                    start_3utr = e2 + 1
                    end_3utr = seq_length
                l_coord = [str(x) for x in [acc, start_5utr, end_5utr, start_c, end_c, start_3utr, end_3utr]]
                st = ','.join(l_coord) + '\n'
                out_file.write(st)
            else:
                print('No CDS for {}'.format(acc))
    out_file.close()
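# A tiny worked illustration (hypothetical numbers) of the coordinate logic in
# get_utr_coord above: for a 1000-nt record whose only CDS spans 101..700 with
# codon_start=1, the written row would be
#   acc,1,100,101,700,701,1000
# i.e. 5'UTR = 1..100, CDS = 101..700, 3'UTR = 701..1000.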