def test_acba_simple_with_gbk_without_promoter(self):
    # Runs integron_finder with only the --gbk option on the ACBA.007.P01_13
    # replicon and compares the produced GenBank record and .integrons file
    # with the reference files stored in the ".wo_promoter" results directory.
    replicon_filename = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(os.path.join('Replicons', '{}.fst'.format(replicon_filename)))
    # Build argv as a list rather than formatting a command string and
    # splitting it on whitespace: splitting corrupts the arguments as soon
    # as the output directory or the data path contains a space.
    args = ['--outdir', str(self.out_dir), '--gbk', str(replicon_path)]
    with self.catch_io(out=True, err=True):
        main(args, loglevel='WARNING')

    output_dirname = 'Results_Integron_Finder_{}'.format(replicon_filename)
    test_result_dir = os.path.join(self.out_dir, output_dirname)
    expected_dirname = output_dirname + ".wo_promoter"

    # GenBank output: compare parsed records, not raw text.
    gbk = '{}.gbk'.format(replicon_id)
    expected_gbk = SeqIO.read(self.find_data(os.path.join(expected_dirname, gbk)), 'gb')
    gbk_test = SeqIO.read(os.path.join(test_result_dir, gbk), 'gb')
    self.assertSeqRecordEqual(expected_gbk, gbk_test)

    # Tabular integron report.
    output_filename = '{}.integrons'.format(replicon_filename)
    expected_result_path = self.find_data(os.path.join(expected_dirname, output_filename))
    test_result_path = os.path.join(test_result_dir, output_filename)
    self.assertIntegronResultEqual(expected_result_path, test_result_path)
def test_acba_annot(self):
    # Runs integron_finder with functional annotation, promoter/attI search,
    # GenBank output and kept temporary files, then compares the GenBank
    # record, the .integrons report and the Resfams HMM table with the
    # reference files of the ".annot" results directory.
    replicon_filename = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    replicon_path = self.find_data(os.path.join('Replicons', '{}.fst'.format(replicon_filename)))
    # Build argv as a list rather than formatting a command string and
    # splitting it on whitespace: splitting corrupts the arguments as soon
    # as one of the paths contains a space.
    args = [
        '--outdir', str(self.out_dir),
        '--func-annot',
        '--path-func-annot', str(self.resfams_dir),
        '--promoter-attI',
        '--gbk',
        '--keep-tmp',
        str(replicon_path),
    ]
    with self.catch_io(out=True, err=False):
        main(args, loglevel='WARNING')

    result_dir = os.path.join(self.out_dir, 'Results_Integron_Finder_{}'.format(replicon_filename))
    expected_dirname = 'Results_Integron_Finder_{}.annot'.format(replicon_filename)

    # GenBank output: compare parsed records, not raw text.
    gbk = '{}.gbk'.format(replicon_id)
    expected_gbk = SeqIO.read(self.find_data(os.path.join(expected_dirname, gbk)), 'gb')
    gbk_test = SeqIO.read(os.path.join(result_dir, gbk), 'gb')
    self.assertSeqRecordEqual(expected_gbk, gbk_test)

    # Tabular integron report.
    output_filename = '{}.integrons'.format(replicon_filename)
    expected_result_path = self.find_data(os.path.join(expected_dirname, output_filename))
    test_result_path = os.path.join(result_dir, output_filename)
    self.assertIntegronResultEqual(expected_result_path, test_result_path)

    # HMM search table kept in the temporary directory (--keep-tmp).
    output_filename = os.path.join('tmp_{}'.format(replicon_id), replicon_id + '_Resfams_fa_table.res')
    expected_result_path = self.find_data(os.path.join(expected_dirname, output_filename))
    test_result_path = os.path.join(result_dir, output_filename)
    self.assertHmmEqual(expected_result_path, test_result_path)
def test_longer_locus_line(self):
    """Check that we can read and write files with longer locus lines."""
    # Take an existing GenBank file and swap in an over-long LOCUS line.
    with open(path.join("GenBank", "DS830848.gb"), 'r') as source:
        lines = source.readlines()
    lines[0] = "LOCUS AZZZAA021234567891234 2147483647 bp DNA linear PRI 15-OCT-2018\n"

    # Keep the modified file in memory.
    modified = StringIO()
    modified.writelines(lines)

    # Silence any warning raised while parsing the modified file.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        modified.seek(0)
        parsed = SeqIO.read(modified, 'genbank')

    # Write the record to a second in-memory file ...
    written = StringIO()
    SeqIO.write(parsed, written, 'genbank')

    # ... and check that this output can be read back in.
    written.seek(0)
    reread = SeqIO.read(written, 'genbank')
    self.assertEqual(reread.id, "DS830848.1")
    self.assertEqual(reread.name, "AZZZAA021234567891234")
    self.assertEqual(len(reread.seq), 2147483647)
def compress(self,filename,cd,pos):
    """Compress the sequence described by *filename* and save the result.

    The parent-level file is read, cut into consecutive windows of
    ``self.compopt['compstep']`` letters, and for every window that appears
    in the oligonucleotide list the letter at index ``int(pos[-1])`` is
    kept. The compressed sequence is written in the same format (GenBank
    for ``.gb``, FASTA otherwise) and returned.

    :param filename: file descriptor object exposing ``compdeep``,
        ``comptype``, ``ext`` and ``get_name()``; it is left pointing at
        level *cd* / *pos*.
    :param cd: compression depth of the file to produce.
    :param pos: compression type of the file to produce; its last element
        selects the letter kept from each window.
    :return: the compressed ``Seq``.
    """
    fmt = "genbank" if filename.ext == '.gb' else "fasta"

    # Point the descriptor at the parent level first; presumably get_name()
    # derives the path from compdeep/comptype, so this selects the input.
    filename.compdeep = cd - 1
    filename.comptype = pos[:-1]
    rec = SeqIO.read(filename.get_name(), fmt)
    ln = len(rec.seq)

    # Now switch to the level being produced.
    filename.compdeep = cd
    filename.comptype = pos
    numpos = int(pos[-1])
    compstep = self.compopt['compstep']

    resseq = Seq('', rec.seq.alphabet)
    oligolist = []
    self.complete_oligolist(oligolist, '', compstep)
    # Only whole windows are considered; a trailing partial window is dropped.
    for i in xrange(0, ln - ln % compstep, compstep):
        window = rec.seq[i:i + compstep]
        if str(window).lower() in oligolist:
            resseq += window[numpos]
    rec.seq = resseq

    # Open the output only once the result exists, and always close it:
    # a failure above no longer leaves a truncated, unclosed file behind.
    with open(filename.get_name(), 'w') as res:
        SeqIO.write(rec, res, fmt)
    return resseq
def collect_proteomes_and_annotaitons(input_dir):
    """Sort the regular files of *input_dir* into FASTA and GenBank files.

    A file with a FASTA extension is accepted as a proteome when its first
    record parses as FASTA; a file with a GenBank extension is accepted as
    an annotation when it parses as a single GenBank record. Files that
    fail to parse are skipped (GenBank failures are logged at debug level).

    :param input_dir: directory to scan (not recursive).
    :return: tuple ``(proteomes, annotations)`` of file paths.
    """
    proteomes = []
    annotations = []

    files = listdir(input_dir)
    if not files:
        interrupt('Directory contains no files.')

    for f in (join(input_dir, f) for f in files if isfile(join(input_dir, f))):
        ext = splitext(f)[1]

        if ext in ['.fasta', '.faa', '.fa', '.fsa']:
            try:
                log.debug(' Checking if %s is fasta.' % f)
                next(SeqIO.parse(f, 'fasta'))
            except (ValueError, StopIteration):
                # StopIteration: the file holds no record at all (e.g. it is
                # empty); treat it like any other non-FASTA file.
                pass
            else:
                proteomes.append(f)
                continue

        if ext in ['.gb', '.genbank', '.gbk']:
            try:
                log.debug(' Checking if %s is genbank.' % f)
                SeqIO.read(f, 'genbank')
            except Exception as e:
                log.debug(str(e) + ', ' + f)
            else:
                annotations.append(f)

    # The collected lists were previously discarded; hand them to the caller.
    return proteomes, annotations
def testRemovalOfSuffix(self):
    """
    A sequence that is a suffix of another is removed.
    """
    longer = SeqIO.read(StringIO('>s1\nagtcagtcagtc'), 'fasta')
    suffix = SeqIO.read(StringIO('>s2\ncagtc'), 'fasta')
    # Only the longer record is expected to survive.
    survivors = list(dePrefixAndSuffixFasta([longer, suffix]))
    self.assertEqual(survivors, [longer])
def main():
    """Command-line entry point: analyse two EMBL files and optionally merge them.

    Reads the EMBL records given with ``-a`` and ``-b``, passes the first
    record to ``stat()`` and, when ``--merge`` is set, writes the record
    returned by ``transfer()`` to ``merged.embl``.
    """
    usage = "usage: %prog [Options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-a", metavar="EMBL-a", help="First EMBL file", action="store", type="string", dest="first_embl")
    parser.add_option("-b", metavar="EMBL-b", help="Second EMBL file to compare", action="store", dest="second_embl")
    parser.add_option("--merge", help="To transfer /product of identical annotations into a merged file", action="store_true", dest="merge")
    options, _ = parser.parse_args()

    # Print help if no argument given
    if util.printHelp(options):
        parser.print_help()
        sys.exit()

    # Context managers guarantee the input handles are closed after parsing.
    with open(options.first_embl) as first_handle:
        first_record = SeqIO.read(first_handle, "embl")
    with open(options.second_embl) as second_handle:
        second_record = SeqIO.read(second_handle, "embl")

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Analysis of EMBL features A from %s" % options.first_embl)
    print("Analysis of EMBL features B from %s" % options.second_embl)
    stat(first_record)

    if options.merge:
        merged_record = transfer(first_record, second_record)
        # Write out the merged EMBL file; closing the handle flushes it.
        with open("merged.embl", "w") as merged_handle:
            SeqIO.write([merged_record], merged_handle, "embl")
def test_overlapping_clip(self):
    # Sets clip annotations whose left and right ends overlap, writes the
    # record as SFF, and checks what comes back with and without trimming.
    with open("Roche/greek.sff", "rb") as sff_handle:
        record = next(SeqIO.parse(sff_handle, "sff"))
    self.assertEqual(len(record), 395)
    lowered = str(record.seq.lower())

    # Apply overlapping clipping
    clip_values = (
        ('clip_qual_left', 51),
        ('clip_qual_right', 44),
        ('clip_adapter_left', 50),
        ('clip_adapter_right', 75),
    )
    for key, value in clip_values:
        record.annotations[key] = value
    self.assertEqual(len(record), 395)
    self.assertEqual(len(record.seq), 395)

    # Save the clipped record...
    buf = BytesIO()
    SeqIO.write(record, buf, "sff")

    # Now reload it; exactly one parser warning is expected.
    buf.seek(0)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", BiopythonParserWarning)
        record = SeqIO.read(buf, "sff")
        self.assertEqual(len(caught), 1, caught)
        for key, value in clip_values:
            self.assertEqual(record.annotations[key], value)
        self.assertEqual(len(record), 395)
        self.assertEqual(lowered, str(record.seq.lower()))

    # And check with trimming applied: nothing of the read is left.
    buf.seek(0)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always", BiopythonParserWarning)
        record = SeqIO.read(buf, "sff-trim")
        self.assertEqual(len(caught), 1, caught)
        self.assertEqual(len(record), 0)
def test_structured_comment_parsing(self):
    """Structured comment parsing."""
    # One row per GenBank file that carries a structured comment:
    # (file stem, comment section, key checked, expected value, section size).
    cases = [
        ('HM138502', 'GISAID_EpiFlu(TM)Data', 'Lineage', 'swl', 3),
        ('EU851978', 'FluData', 'LabID', '2008704957', 5),
        ('KF527485', 'Assembly-Data', 'Assembly Method', 'Lasergene v. 10', 2),
    ]
    for stem, section, key, value, size in cases:
        record = SeqIO.read(path.join('GenBank', stem + '.gbk'), 'genbank')
        if stem == 'HM138502':
            # HM138502.gbk has both 'comment' and 'structured_comment'.
            self.assertEqual(record.annotations['comment'],
                             "Swine influenza A (H1N1) virus isolated during human swine flu\noutbreak of 2009.")
        self.assertEqual(record.annotations['structured_comment'][section][key], value)
        self.assertEqual(len(record.annotations['structured_comment'][section]), size)
        # The formatted record must match the stored expected output.
        with open(path.join('GenBank', stem + '_output.gbk'), "r") as ifile:
            self.assertEqual(record.format("gb"), ifile.read())

    # No structured comment in NC_000932.gb, just a regular comment
    record = SeqIO.read(path.join('GenBank', 'NC_000932.gb'), 'genbank')
    self.assertFalse("structured_comment" in record.annotations)
    self.assertEqual(record.annotations['comment'],
                     'REVIEWED REFSEQ: This record has been curated by NCBI staff. The\n'
                     'reference sequence was derived from AP000423.\n'
                     'COMPLETENESS: full length.')
def load_file(filename, file_format="fasta"):
    """
    Read a single sequence from a file and return it as a Bio.Seq object.

    The file is parsed as ambiguous DNA first and, if that raises a
    ValueError, as ambiguous RNA. Each failure is printed; when both
    attempts fail the process exits with status 1.

    :param filename: String; Path and filename of input sequence file
    :param file_format: String; Format to be used. Refer to Biopython docs
                        for available formats. Defaults to 'fasta'
    :return: the record's sequence, or None if no record was obtained
    """
    record = None
    alphabets = (IUPAC.ambiguous_dna, IUPAC.ambiguous_rna)
    last_attempt = len(alphabets) - 1
    for attempt, alphabet in enumerate(alphabets):
        try:
            record = SeqIO.read(filename, file_format, alphabet)
        except ValueError as error:
            print('ERROR: {}'.format(error))
            if attempt == last_attempt:
                # neither alphabet worked: give up with error code 1
                exit(1)
        else:
            break

    if not record:
        return None
    return record.seq
def generateReads(refGene):
    """Generate reads for every gene of *refGene* and write them to ``options.output``.

    *refGene* is a sequence of rows in which index 2 is the chromosome
    name, 3 the strand, 8 the exon count and 9/10 the comma-separated exon
    starts/ends. Rows are processed in order; the chromosome FASTA file
    ``<options.chromosomes>/<chrom>.fa`` is (re)loaded whenever the
    chromosome changes. Only mode ``'au'`` produces reads (via ``adjUni``).

    :param refGene: non-empty sequence of gene rows as described above.
    """
    def load_chromosome(name):
        # Read one chromosome FASTA file and announce it.
        record = SeqIO.read('%s/%s.fa' % (options.chromosomes, name), 'fasta')
        print("Generating reads for chromosome " + record.description)
        return record

    currentChr = refGene[0][2]
    sequence = load_chromosome(currentChr)
    countChr = 0

    # The context manager closes the output even if read generation fails.
    with open(options.output, 'w') as f:
        for gene in refGene:
            if gene[2] != currentChr:
                print('%d reads generated for chromosome %s' % (countChr, currentChr))
                currentChr = gene[2]
                sequence = load_chromosome(currentChr)
                countChr = 0

            strand = gene[3]
            numExons = int(gene[8])
            exonStarts = gene[9].split(',')
            exonEnds = gene[10].split(',')
            # Index by numExons rather than zipping: the split lists may
            # carry extra (e.g. empty trailing) elements.
            exons = [[int(exonStarts[i]), int(exonEnds[i])] for i in range(numExons)]
            exons.sort(key=lambda e: (e[0], e[1]))

            if options.mode == 'au':
                countChr += adjUni(exons, strand, sequence, f)
            # Modes 'an', 'cu' and 'cn' are accepted but generate nothing.

        # Report the final chromosome too; it was previously never printed.
        print('%d reads generated for chromosome %s' % (countChr, currentChr))
def test_genbank_date_list(self):
    """Check if date lists are handled correctly"""
    sequence_object = Seq("ATGC", generic_dna)

    def round_trip(dates):
        # Write a record carrying *dates* as GenBank and parse it back.
        record = SeqRecord(sequence_object, id='123456789', name='UnitTest',
                           description='Test case for date parsing')
        record.annotations["date"] = dates
        buf = StringIO()
        SeqIO.write(record, buf, 'genbank')
        buf.seek(0)
        return SeqIO.read(buf, "gb")

    # A one-element list is written as that date.
    single = round_trip(["24-DEC-2015"])
    self.assertEqual(single.annotations["date"], "24-DEC-2015")

    # A list with several dates comes back as "01-JAN-1980".
    several = round_trip(["24-DEC-2015", "25-JAN-2016"])
    self.assertEqual(several.annotations["date"], "01-JAN-1980")
def process(filename,compopt={'compdeep':3,'compstep':3,'comptype':[0,1,2]},oligox=[1,2,3]):
    """Build the '<name>_semantix' report tree for one sequence file.

    The input (``.gb`` or ``.fas``) is copied into ``<name>_semantix/report``,
    an HTML page is written per oligonucleotide length in *oligox* for the
    uncompressed sequence, and then again for every compression position of
    every level ``1..compopt['compdeep']``. Finally ``form_report`` is called.

    :param filename: path of the input file; its directory part is split on '/'.
    :param compopt: ``{'compdeep': ..., 'compstep': ..., 'comptype': [...]}``.
        NOTE(review): the default is a shared mutable dict; it is not
        modified here, but it is passed on to helpers -- confirm they do not
        mutate it.
    :param oligox: oligonucleotide lengths to analyse (same caveat as above).
    :raises ValueError: if *filename* is neither a ``.gb`` nor a ``.fas`` file.
    """
    # Same test and precedence as before: '.gb' is checked before '.fas'.
    for ext in ('.gb', '.fas'):
        if filename.rpartition(ext)[0] != '':
            break
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError("unsupported sequence file (expected .gb or .fas): %r" % (filename,))

    directory, _, basename = filename.rpartition('/')
    stem = basename.rpartition(ext)[0]
    path = directory + '/' + stem + '_semantix'
    reppath = path + '/report'
    inputfile = xfname(reppath + '/' + stem + ext, compopt['compstep'], 0, '')

    # Create the output directories; an already existing directory is fine,
    # any other failure is reported instead of being silently swallowed.
    for wanted in (path, reppath):
        try:
            os.mkdir(wanted)
        except OSError:
            if not os.path.isdir(wanted):
                raise

    linklist = []

    def render(data):
        # Register the current file and write one HTML page per oligo length.
        form_linklist(linklist, inputfile)
        for o in oligox:
            Tez, Tez_rev = get_Tez(data, o)
            ramka, _signs = get_matrix(Tez, Tez_rev)
            infotab = [['oligonucleotide:', str(o) + '-plet'],
                       ['compress level:', inputfile.get_level()]]
            html_write(infotab, ramka, o, inputfile, linklist)

    # Copy the source file into the report directory unless it is already there.
    if inputfile.get_name().rpartition('/')[2] not in os.listdir(reppath):
        shutil.copy2(filename, inputfile.get_name())

    fmt = "genbank" if inputfile.ext == '.gb' else "fasta"
    render(SeqIO.read(inputfile.get_name(), fmt).seq)

    for cd in xrange(1, compopt['compdeep'] + 1):
        poslist = []
        get_pos_on_level(poslist, '', cd, compopt)
        for pos in poslist:
            render(compress(inputfile, cd, pos, compopt))

    form_report(path, linklist, oligox, compopt)
def test_genbank_to_fasta(self):
    """Conversion of GenBank to FASTA."""
    filename = "GenBank/NC_005816.gb"
    # Reference: the record parsed locally from the GenBank file.
    reference = SeqIO.read(filename, "gb")
    # Candidate: the same file converted to FASTA by TogoWS.
    with open(filename) as handle:
        converted = TogoWS.convert(handle, "genbank", "fasta")
        candidate = SeqIO.read(converted, "fasta")
    self.assertEqual(str(reference.seq), str(candidate.seq))
def __init__(self, fname = 'data/H3N2_gisaid_epiflu_sequence.fasta', out_specs={'data_dir':'data/', 'prefix':'H3N2_', 'qualifier':''}, **kwargs):
    """Load the outgroup and the sequence set and prepare them for analysis.

    :param fname: FASTA file with the sequences to process.
    :param out_specs: dict with at least a ``'prefix'`` key; the prefix
        selects the default outgroup file
        ``source_data/<prefix>outgroup.gb``.
        NOTE(review): the default dict is shared between instances because
        it is stored on ``self`` as is -- confirm nothing mutates it.
    :param kwargs: stored on ``self.kwargs``; ``outgroup`` overrides the
        outgroup GenBank file.
    """
    super(flu_process, self).__init__()
    self.fname = fname
    self.kwargs = kwargs
    self.out_specs = out_specs

    if 'outgroup' in kwargs:
        outgroup_file = kwargs['outgroup']
    else:
        outgroup_file = 'source_data/'+out_specs['prefix']+'outgroup.gb'

    # Parse the outgroup once; it was previously read from disk twice.
    outgroup_record = SeqIO.read(outgroup_file, 'genbank')
    # The strain name is taken from the first feature of the record.
    self.outgroup = outgroup_record.features[0].qualifiers['strain'][0]
    # Locations of the SigPep/HA1/HA2 genes, forced onto the forward strand.
    self.proteins = {
        f.qualifiers['gene'][0]: FeatureLocation(start=f.location.start, end=f.location.end, strand=1)
        for f in outgroup_record.features
        if 'gene' in f.qualifiers and f.qualifiers['gene'][0] in ['SigPep', 'HA1', 'HA2']
    }

    self.time_interval = [datetime.strptime('2008-01-01', "%Y-%m-%d").date(),
                          datetime.strptime('2016-01-01', "%Y-%m-%d").date()]
    self.frequencies = defaultdict(dict)
    self.pivots = np.linspace(num_date(self.time_interval[0]),
                              num_date(self.time_interval[1]), 40)

    self.seqs = sequence_set(self.fname, reference=self.outgroup)
    self.seqs.ungap()
    # Map positions of the FASTA header fields to attribute names.
    self.seqs.parse({0:'strain', 1:'isolate_id', 3:'passage', 5:'date', 7:'lab', 8:"accession"}, strip='_')
    self.fix_strain_names()
    # Use the GenBank outgroup sequence in place of the FASTA one.
    self.seqs.raw_seqs[self.outgroup].seq = outgroup_record.seq
    self.seqs.raw_seqs['A/Beijing/32/1992'].attributes['date'] = '1992-01-01'
    self.seqs.reference = self.seqs.raw_seqs[self.outgroup]
    self.seqs.parse_date(["%Y-%m-%d"], prune=True)
    self.geo_parse()
    self.filenames()
def get_genome_file(self,especie=0,idn=0):
    """Download one nucleotide record as FASTA and return the file name.

    Creates the file and returns its name so that the caller can extract
    the sequence from it.

    * ``idn == 0``: search the nucleotide database for ``especie[ORGN]``,
      fetch the first hit and save it as ``genome_<especie>.fasta``.
    * ``especie == 0`` (and ``idn`` given): fetch record ``idn`` and save it
      as ``genome_<idn>.fasta``.
    * both given: nothing is downloaded and ``""`` is returned.

    :param especie: organism name used as the search term.
    :param idn: identifier of the record to fetch directly.
    :return: name of the FASTA file written, or ``""``.
    """
    if idn == 0:
        # NOTE(review): "retype" is passed through exactly as before; it
        # looks like a misspelling of "rettype" -- confirm before changing.
        search_handle = Entrez.esearch(db='nucleotide', term=especie+"[ORGN]", retmax=100, retype="gb", retmode="text")
        try:
            results = Entrez.read(search_handle)
        finally:
            # The search handle was previously never closed.
            search_handle.close()
        record_id = results["IdList"][0]  # first element of the result list
        print(record_id)
        label = especie
    elif especie == 0:
        record_id = idn
        label = idn
    else:
        return ""

    handle = Entrez.efetch(db='nucleotide', rettype="fasta", retmode="text", id=record_id)
    try:
        record = SeqIO.read(handle, "fasta")
    finally:
        # Close the download handle even when parsing fails.
        handle.close()

    name = "genome_" + str(label).strip(" ") + ".fasta"
    SeqIO.write(record, name, "fasta")
    return name
def get_raw_check(self, filename, format, alphabet):
    """Check ``get_raw`` of an indexed sequence file against the file itself.

    For every record of *filename* the raw bytes returned by the index must
    be non-blank, occur verbatim in the file, and parse back into a record
    that ``compare_record`` accepts as equal to the indexed record.
    """
    # Whole file as bytes, used for the "raw in raw_file" containment check.
    handle = open(filename, "rb")
    raw_file = handle.read()
    handle.close()
    #Also checking the key_function here
    id_list = [rec.id.lower() for rec in \
               SeqIO.parse(filename, format, alphabet)]
    rec_dict = SeqIO.index(filename, format, alphabet, key_function = lambda x : x.lower())
    # The index must expose exactly the lower-cased record ids.
    self.assertEqual(set(id_list), set(rec_dict.keys()))
    self.assertEqual(len(id_list), len(rec_dict))
    for key in id_list:
        self.assertTrue(key in rec_dict)
        self.assertEqual(key, rec_dict[key].id.lower())
        self.assertEqual(key, rec_dict.get(key).id.lower())
        raw = rec_dict.get_raw(key)
        self.assertTrue(raw.strip())
        self.assertTrue(raw in raw_file)
        rec1 = rec_dict[key]
        #Following isn't very elegant, but it lets me test the
        #__getitem__ SFF code is working.
        # Wrap the raw bytes in a handle of the kind the parser expects.
        if format in SeqIO._BinaryFormats:
            handle = BytesIO(raw)
        else:
            handle = StringIO(_bytes_to_string(raw))
        if format == "sff":
            # SFF records need header values; they are taken from the
            # index's private proxy object.
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=False)
        elif format == "sff-trim":
            rec2 = SeqIO.SffIO._sff_read_seq_record(handle, rec_dict._proxy._flows_per_read, rec_dict._proxy._flow_chars, rec_dict._proxy._key_sequence, rec_dict._proxy._alphabet, trim=True)
        elif format == "uniprot-xml":
            self.assertTrue(raw.startswith(_as_bytes("<entry ")))
            self.assertTrue(raw.endswith(_as_bytes("</entry>")))
            #Currently the __getitem__ method uses this
            #trick too, but we hope to fix that later
            # A lone <entry> is wrapped in a <uniprot> root element before parsing.
            raw = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % _bytes_to_string(raw)
            handle = StringIO(raw)
            rec2 = SeqIO.read(handle, format, alphabet)
        else:
            rec2 = SeqIO.read(handle, format, alphabet)
        self.assertEqual(True, compare_record(rec1, rec2))
    # Close the index's underlying file handle explicitly (private attribute).
    rec_dict._proxy._handle.close()
    #TODO - Better solution
    del rec_dict
def genebank_extract_exon(p_genbank, p_genome = None, p_output = None):
    '''
    Extract the CDS features of a genbank file into a fasta file.

    One fasta entry is written per CDS feature. CDS features that have no
    ``product`` qualifier and whose ``note`` mentions "pseudogene" are
    skipped. Missing ``gene``, ``locus_tag`` or ``note`` qualifiers are
    treated as empty instead of aborting the extraction.

    :param: p_genbank path to the genbank file
    :param: p_genome optional path to a fasta file providing the sequence
            to extract from (default: the sequence of the genbank record)
    :param: p_output path to the fasta output file
            (default: basename of p_genbank + '.exon' in the current directory)
    :return: None
    '''
    if p_output is None:
        p_output = os.path.basename(p_genbank)+'.exon'

    genome = SeqIO.read(p_genbank, 'genbank')
    if p_genome is None:
        full_seq = genome.seq
    else:
        full_seq = SeqIO.read(p_genome, 'fasta').seq

    fasta_format = '>{type}|{genome}|position={start}-{stop}:{strand}|locus={locus}|gene={gene}|product={product}\n{seq}\n'
    nb_sequence = 0

    # The context manager closes the output even if a feature cannot be written.
    with open(p_output, 'w') as fout:
        for gene in genome.features:
            if gene.type not in ['CDS']:
                continue
            qualifiers = gene.qualifiers

            # some gene, like pseudo gene, transposon do not have product
            if 'product' in qualifiers:
                product = ','.join(qualifiers['product'])
            elif 'pseudogene' in ''.join(qualifiers.get('note', [])):
                continue  # we do not take pseudogene
            else:
                product = ''

            d_info = {
                'type': gene.type,
                'genome': genome.id,
                'seq': gene.extract(full_seq),
                # int() yields the plain coordinate of the position object.
                'start': int(gene.location.start),
                'stop': int(gene.location.end),
                'strand': gene.location.strand,
                'product': product,
                'gene': ','.join(qualifiers.get('gene', [])),
                'locus': ','.join(qualifiers.get('locus_tag', [])),
            }
            fout.write(fasta_format.format(**d_info))
            nb_sequence += 1

    print('%d exons have been extracted in %s'%(nb_sequence ,p_output))
    return None
def load_HXB2(cropped=False, fragment=None, trim_primers=False):
    '''Load HXB2 reference sequence (whole, or a single fragment if given)'''
    if fragment is not None:
        source = get_HXB2_fragmented(fragment, trim_primers=trim_primers)
    else:
        source = get_HXB2_entire(cropped=cropped)
    return SeqIO.read(source, 'fasta')
def load_NL43(fragment=None, trim_primers=False):
    '''Load NL4-3 reference sequence (whole, or a single fragment if given)'''
    if fragment is not None:
        source = get_NL43_fragmented(fragment, trim_primers=trim_primers)
    else:
        source = get_NL43_entire()
    return SeqIO.read(source, 'fasta')
def load_F10(fragment=None, trim_primers=False):
    '''Load F10 reference sequence (whole, or a single fragment if given)

    Parameters:
       fragment: fragment to load; None loads the entire reference
       trim_primers: forwarded to get_F10_fragmented. The function body
           already referenced this name without defining it, so every
           fragment load failed with a NameError; it is now a parameter.
    '''
    if fragment is None:
        return SeqIO.read(get_F10_entire(), 'fasta')
    else:
        return SeqIO.read(get_F10_fragmented(fragment, trim_primers=trim_primers), 'fasta')
def testOrderIndependent(self):
    """
    A sequence that is a prefix of another is removed when it appears
    first.
    """
    prefix = SeqIO.read(StringIO('>s1\nagtcag'), 'fasta')
    longer = SeqIO.read(StringIO('>s2\nagtcagtcagtc'), 'fasta')
    # The prefix comes first in the input; only the longer record remains.
    survivors = list(dePrefixAndSuffixFasta([prefix, longer]))
    self.assertEqual(survivors, [longer])
def testRemovalOfIdenticalSequences(self):
    """
    A list with 2 copies of the same seq is de-duped to have 1 copy.
    """
    seq = '>hey\nagtcagtcagtc'
    # Two independent records parsed from the same FASTA text.
    first, second = [SeqIO.read(StringIO(seq), 'fasta') for _ in range(2)]
    survivors = list(dePrefixAndSuffixFasta([first, second]))
    self.assertEqual(survivors, [first])
def test_view(self):
    # Walks through the free multipartite assembly views: the empty form,
    # the step-by-step part selection, the GenBank download, and a form
    # that contains more than one part of the same type.
    browser = Client()

    # Initial form lists the available vectors.
    page = browser.get(reverse('multipartite_view_free'))
    assert "pDGB2_alpha1R" in str(page)

    # Step 1: vector plus first part.
    step_url = reverse('multipartite_view_free', kwargs={'form_num': '1'})
    page = browser.post(step_url, {'vector': 'pDGB2_alpha1R',
                                   'part_1': 'pP2A11'})
    assert "An11" in str(page)

    # Step 2: an unknown part name is rejected ...
    step_url = reverse('multipartite_view_free', kwargs={'form_num': '2'})
    page = browser.post(step_url, {'vector': 'pDGB2_alpha1R',
                                   'part_1': 'pP2A11',
                                   'part_2': 'pLuciferas'})
    assert 'feature does not exist' in str(page)

    # ... while the correct name moves on to the next part choice.
    page = browser.post(step_url, {'vector': 'pDGB2_alpha1R',
                                   'part_1': 'pP2A11',
                                   'part_2': 'pLuciferase'})
    assert "pT35S" in str(page)

    # All three parts given: the assembly is reported.
    page = browser.post(step_url, {'vector': 'pDGB2_alpha1R',
                                   'part_1': 'pP2A11',
                                   'part_2': 'pLuciferase',
                                   'part_3': 'pT35S'})
    assert "<p>You have assembled in the GoldenBraid" in str(page)

    # reverse vector
    genbank_url = reverse('multipartite_view_free_genbank')
    page = browser.post(genbank_url, {'part_1': 'pP2A11',
                                      'part_2': 'pMYB12',
                                      'part_3': 'pTerm2A11',
                                      'vector': 'pDGB1_alpha1R'})
    assert page.status_code == 200

    downloaded = SeqIO.read(StringIO(str(page)), 'gb')
    assert downloaded.name == 'GB_UA_E'
    downloaded_seq = str(downloaded.seq)

    # The stored reference is the same sequence rotated by four bases.
    reference = SeqIO.read(os.path.join(TEST_DATA, 'pEGBMybrev_uniq.gb'), 'gb')
    reference_seq = str(reference.seq)[4:] + str(reference.seq)[:4]
    assert downloaded_seq == reference_seq

    # with more than one part of the same type
    step_url = reverse('multipartite_view_free', kwargs={'form_num': '5'})
    page = browser.post(step_url, {'part_1': 'pP2A11',
                                   'part_2': 'GB0365',
                                   'part_3': 'GB0653',
                                   'part_4': 'GB0655',
                                   'part_5': 'pT35S',
                                   'vector': 'pDGB1_alpha1'})
    assert "<p>Other.2:<a href='/feature/GB0655'>GB0655</a></p>" in str(page)
def __init__(self, seq_id=None, seq_type=None):
    """Fetch the record *seq_id* and store it as ``self.seq_record``.

    :param seq_id: identifier of the sequence to download.
    :param seq_type: ``'uniprot'`` (fetched through ExPASy, parsed as
        "swiss") or ``'genbank'`` (fetched from the Entrez protein
        database, parsed as "genbank"). For any other value nothing is
        fetched and ``seq_record`` is not set.
    """
    # Compare strings with ==, not "is": identity of equal string objects
    # is an interpreter detail, so the original test could silently fail
    # for a perfectly valid seq_type.
    if seq_type == 'uniprot':
        handle = ExPASy.get_sprot_raw(seq_id)
        record_format = "swiss"
    elif seq_type == 'genbank':
        handle = Entrez.efetch(db='protein', rettype='genbank', id=seq_id)
        record_format = "genbank"
    else:
        return

    try:
        self.seq_record = SeqIO.read(handle, record_format)
    finally:
        # Release the connection even when parsing fails.
        handle.close()
def test_001_negative_location_warning(self):
    # Reading a record with a negative feature location must raise the
    # parser warning (escalated to an error here) with the exact message.
    with warnings.catch_warnings():
        warnings.simplefilter("error", BiopythonParserWarning)
        raised = None
        try:
            SeqIO.read(path.join("GenBank", "negative_location.gb"), "genbank")
        except BiopythonParserWarning as e:
            raised = e
        if raised is None:
            self.assertTrue(False, "Expected specified BiopythonParserWarning here.")
        else:
            self.assertEqual(str(raised), "Couldn't parse feature location: '-2..492'")
def read_sequences(self, source, destination):
    """
    Load the two single-record FASTA files: *source* is stored as the DNA
    record ``src_nt`` and *destination* as the protein record ``des_aa``.
    """
    nucleotide_record = SeqIO.read(source, 'fasta', alphabet=generic_dna)
    self.src_nt = nucleotide_record
    protein_record = SeqIO.read(destination, 'fasta', alphabet=generic_protein)
    self.des_aa = protein_record
def testRemovalOfPrefixSuffixAndDuplicate(self):
    """
    Prefixes, suffixes, and duplicates should collectively all be removed.
    """
    fasta_texts = [
        '>s1\nagtcagtcagtc',  # kept
        '>s2\nagtcagtcagtc',  # duplicate of s1
        '>s3\nagtcagt',       # prefix of s1
        '>s4\ntcagtc',        # suffix of s1
    ]
    records = [SeqIO.read(StringIO(text), 'fasta') for text in fasta_texts]
    survivors = list(dePrefixAndSuffixFasta(records))
    self.assertEqual(survivors, [records[0]])
def test_Genome(self):
    # Checking GenBank sequence vs FASTA fna file (and vs the EMBL file
    # when the test case defines one).
    # Each file is opened in a context manager so its handle is closed as
    # soon as the record has been read (previously the handles leaked).
    with open(self.gb_filename) as gb_handle:
        gb_record = SeqIO.read(gb_handle, "genbank")
    with open(self.fna_filename) as fna_handle:
        fa_record = SeqIO.read(fna_handle, "fasta")
    compare_record(gb_record, fa_record)
    if self.emblname is None:
        return
    with open(self.embl_filename) as embl_handle:
        embl_record = SeqIO.read(embl_handle, "embl")
    compare_record(gb_record, embl_record, expect_minor_diffs=True)
def draw_pairwise(args, dwg, n1, n2, pos1, pos2, y_start):
    """Draw the pairwise comparison of sequences *n1* and *n2* into *dwg*.

    A group is added to the drawing that holds one rectangle per run of
    equal indel values and, inside runs whose value is 0, a pair of coloured
    line segments at every position where the two sequences differ.

    Relies on the module-level names ``lines``, ``diff``, ``it``,
    ``seq_colors`` and ``delta_y``, none of which are defined in this block.

    :param args: options object; ``margin``, ``patser_directory``, ``fasta``,
        ``x_scale`` and ``y_sep`` are read here.
    :param dwg: drawing object providing ``g()``, ``line()``, ``rect()`` and
        ``add()`` (presumably an svgwrite Drawing -- TODO confirm).
    :param n1: name of the first sequence (also its FASTA file stem).
    :param n2: name of the second sequence (also its FASTA file stem).
    :param pos1: positions into the first sequence, one per drawn column
        (assumed to support ``diff`` and subtraction -- TODO confirm).
    :param pos2: positions into the second sequence, aligned with *pos1*.
    :param y_start: vertical coordinate of the baseline.
    :return: *y_start* increased by ``0.5 * delta_y``.
    """
    g = dwg.g()
    x_start = args.margin
    fasta1 = path.join(args.patser_directory, n1 + '.fasta')
    fasta2 = path.join(args.patser_directory, n2+'.fasta')
    # Sequence lookup order: per-sequence FASTA files on disk, then the
    # preloaded args.fasta mapping, otherwise a stand-in that answers 'N'
    # for every position.
    if path.exists(fasta1) and path.exists(fasta2):
        seq1 = SeqIO.read(fasta1, 'fasta').seq
        seq2 = SeqIO.read(fasta2, 'fasta').seq
    elif args.fasta:
        if n1 in args.fasta and n2 in args.fasta:
            seq1 = args.fasta[n1].seq
            seq2 = args.fasta[n2].seq
        else:
            print("Can't find both sequences: {} and {}".format(n1, n2))
            seq1 = defaultdict(lambda : 'N')
            seq2 = defaultdict(lambda : 'N')
    else:
        seq1 = defaultdict(lambda : 'N')
        seq2 = defaultdict(lambda : 'N')
    # Baseline spanning all columns; note it goes to ``lines``, not to ``g``.
    lines.append(dwg.line( (x_start, y_start), (x_start + args.x_scale * len(pos1), y_start)))
    # Per-step difference between how far pos2 and pos1 advance.
    indels = diff(pos2) - diff(pos1)
    id_start = 0
    # Draw Indels
    for val, group in it.groupby(indels):
        # Exhaust the group to count its members: i ends as (length - 1).
        for i, _ in enumerate(group):
            pass
        i += 1
        # Rectangle height is 0 when val == 0; for negative val the
        # rectangle starts above the baseline.
        g.add(dwg.rect( (x_start + args.x_scale * id_start, y_start - .1 * args.y_sep * (val < 0)), (args.x_scale * i, .1 * args.y_sep * (val != 0)), fill="grey" ))
        # SNPs
        if val == 0:
            for j in range(id_start, id_start + i):
                if str(seq1[pos1[j]]) != str(seq2[pos2[j]]):
                    # Upper tick, coloured by the base of the first sequence.
                    g.add(dwg.line( (x_start + args.x_scale * j, y_start - .3 * args.y_sep), (x_start + args.x_scale * j, y_start), style="stroke-width:1; stroke:{};".format(seq_colors[str(seq1[pos1[j]])]), ))
                    # Lower tick, coloured by the base of the second sequence;
                    # its id records both positions and both bases.
                    g.add(dwg.line( (x_start + args.x_scale * j, y_start), (x_start + args.x_scale * j, y_start + .3 * args.y_sep), id='{}:{}--{}>{}'.format(pos1[j], pos2[j], seq1[pos1[j]], seq2[pos2[j]],), style="stroke-width:1; stroke:{};".format(seq_colors[str(seq2[pos2[j]])]), ))
        id_start += i
    y_start += 0.5 * delta_y
    dwg.add(g)
    return y_start
type=str, required=True, help="reference sequence") parser.add_argument("--metadata", type=str, required=True, help="metadata") parser.add_argument("--focal-alignment", type=str, required=True, help="focal smaple of sequences") parser.add_argument("--output", type=str, required=True, help="FASTA file of output alignment") args = parser.parse_args() # load entire alignment and the alignment of focal sequences (upper case -- probably not necessary) ref = sequence_to_int_array(SeqIO.read(args.reference, 'genbank').seq) context_seqs_dict = calculate_snp_matrix(args.alignment, consensus=ref) focal_seqs_dict = calculate_snp_matrix(args.focal_alignment, consensus=ref) alignment_length = len(ref) print("Done reading the alignments.") # calculate number of masked sites in either set mask_count_focal = np.array( [len(x) for x in focal_seqs_dict['filled_positions']]) mask_count_context = { s: len(x) for s, x in zip(context_seqs_dict['names'], context_seqs_dict['filled_positions']) } # for each context sequence, calculate minimal distance to focal set, weigh with number of N/- to pick best sequence
#!/usr/bin/env python
# Record the MseI restriction sites of one chromosome FASTA file as a BED
# file named "MseI_sites_<record id>.bed" in the current directory.
import argparse

from Bio import SeqIO, Restriction
from Bio.Alphabet import IUPAC

parser = argparse.ArgumentParser(
    description='record MseI sites for given chromosome FASTA in bed file format')
parser.add_argument('-f',
                    help='chromsome FASTA file (ex. chr21.fa)',
                    type=str,
                    dest='f',
                    required=True)
args = parser.parse_args()

seq_record = SeqIO.read(args.f, "fasta", IUPAC.ambiguous_dna)
coords = Restriction.MseI.search(seq_record.seq)
chrom = seq_record.id

# The context manager closes (and flushes) the BED file even if writing fails.
with open('MseI_sites_' + chrom + '.bed', 'w') as OUT:
    for start in coords:
        # Note: compensate for search function finding first base after the
        # position the enzyme will cut.
        OUT.write('\t'.join([chrom, str(start - 2), str((start - 2) + 4)]) + '\n')
from __future__ import print_function try: from cStringIO import StringIO except ImportError: from io import StringIO from Bio import SeqIO import requests """ Get all families for human """ url = "https://dfam.org/api/families" params = { "format": "summary", "clade": "9606", "clade_relatives": "both", } response = requests.get(url, params=params) results = response.json()["results"] records = [] for r in results: if r['repeat_type_name'] == 'LTR': nurl = url + '/' + r['accession'] + '/sequence' response2 = requests.get(nurl, params={'format': 'embl'}) rec = SeqIO.read(StringIO(response2.text.encode('ascii', 'ignore')), 'embl') records.append(rec) SeqIO.write(records, 'ERV_human.dfam.gb', 'genbank') SeqIO.write(records, 'ERV_human.dfam.fasta', 'fasta')
# Report what was removed, then download every remaining reference record
# and write its sequence to outputFile in FASTA form.
for removal in genomeRemoved:
    print("Removed: " + removal + ' --incomplete genome')
print()
for removal in taxonRemoved:
    print("Removed: " + removal + ' --not a virus')

# The context manager closes the output file (it was previously left open).
with open(outputFile, 'w') as outf:
    for refID in blastInfos:
        # Retry the download until it succeeds, waiting 10 s between tries.
        # Only the fetch is retried, and only for ordinary exceptions:
        # Ctrl-C still interrupts the script, and a failure while writing is
        # reported instead of being retried forever.
        while True:
            try:
                handle = Entrez.efetch(db="nucleotide", id=refID, rettype="gb", retmode="text")
                try:
                    record = SeqIO.read(handle, "genbank")
                finally:
                    handle.close()
            except Exception:
                time.sleep(10)
                continue
            break
        outf.write('>' + refID + '\n')
        outf.write(str(record.seq) + '\n')

print('sequences written to: ' + outputFile)
os.remove(flagfileName)
def load_default_plastid():
    """Read the single GenBank record stored in Plastids/Arabidopsis_thaliana.gb."""
    genbank_path = "Plastids/Arabidopsis_thaliana.gb"
    return SeqIO.read(genbank_path, 'gb')
#!/usr/bin/env python
# Identifying open reading frames, after the Biopython tutorial cookbook:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html (20.1.13)
# https://biopython.readthedocs.io/en/latest/Tutorial/chapter_cookbook.html
from Bio import SeqIO

record = SeqIO.read("NC_005816.fna", "fasta")
table = 11
min_pro_len = 100

# Forward strand first, then the reverse complement.
strands = [(+1, record.seq), (-1, record.seq.reverse_complement())]
for strand, nuc in strands:
    for frame in range(3):
        # Largest multiple of three that fits after skipping `frame` bases.
        length = 3 * ((len(record) - frame) // 3)
        translated = nuc[frame:frame + length].translate(table)
        for pro in translated.split("*"):
            if len(pro) < min_pro_len:
                continue
            print("%s...%s - length %i, strand %i, frame %i"
                  % (pro[:30], pro[-3:], len(pro), strand, frame))
#! /usr/bin/python
# Globally align the two FASTA sequences named on the command line and
# print the first alignment returned.
from sys import argv

from Bio import pairwise2  # Biopython wrapper for pairwise alignment
from Bio import SeqIO  # parses the input sequences

seq1 = SeqIO.read(argv[1], 'fasta')
seq2 = SeqIO.read(argv[2], 'fasta')

# The two-letter suffix of globalxx selects the match score scheme (first
# letter) and the gap cost scheme (second letter).
alignments = pairwise2.align.globalxx(seq1.seq, seq2.seq)
first_alignment = alignments[0]
print(pairwise2.format_alignment(*first_alignment))
from Bio import SeqIO

record = SeqIO.read('sequenceBST2.gb', 'genbank')

# Information about the sequence
for attribute in ('id', 'seq', 'description', 'name'):
    print(getattr(record, attribute), '\n')
print(len(record.seq), '\n')
print(record.dbxrefs, '\n')
print(record.annotations["source"], '\n')

# Annotations, features and qualifiers
for key in record.annotations:
    print(key, record.annotations[key])

feature_count = len(record.features)
print('\n', feature_count, '\n')
for feature in record.features:
    print(feature)
print('\n', record.features, '\n')
def seq_record_loaded_from_file_example(fasta_path):
    """Return the SeqRecord read from the single-record FASTA file *fasta_path*."""
    loaded_record = SeqIO.read(fasta_path, "fasta")
    return loaded_record
def test_uni001(self):
    """Parsing Uniprot file uni001

    Reads ``SwissProt/uni001`` with the ``uniprot-xml`` parser and
    spot-checks identifiers, sequence, features, references,
    cross-references and annotations.  The checks are not exhaustive.
    """
    filename = 'uni001'
    datafile = os.path.join('SwissProt', filename)

    # Exercise the record parser on the single entry in the file.
    with open(datafile) as test_handle:
        seq_record = SeqIO.read(test_handle, "uniprot-xml")
    self.assertTrue(isinstance(seq_record, SeqRecord))

    # Identity and sequence.
    self.assertEqual(seq_record.id, "Q91G55")
    self.assertEqual(seq_record.name, "043L_IIV6")
    self.assertEqual(seq_record.description, "Uncharacterized protein 043L")
    self.assertEqual(
        repr(seq_record.seq),
        "Seq('MDLINNKLNIEIQKFCLDLEKKYNINYNNLIDLWFNKESTERLIKCEVNLENKI...IPI', ProteinAlphabet())"
    )

    # Checks kept disabled from the original test:
    # self.assertEqual(seq_record.accessions, ['Q91G55']) #seq_record.accessions does not exist
    # self.assertEqual(seq_record.organism_classification, ['Eukaryota', 'Metazoa', 'Chordata', 'Craniata', 'Vertebrata', 'Mammalia', 'Eutheria', 'Primates', 'Catarrhini', 'Hominidae', 'Homo'])
    # self.assertEqual(record.seqinfo, (348, 39676, '75818910'))

    # Features.
    features = seq_record.features
    self.assertEqual(len(features), 1)
    self.assertEqual(
        repr(features[0]),
        "SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(116)), type='chain', id='PRO_0000377969')"
    )

    # Literature references: only the first one is inspected in detail.
    annotations = seq_record.annotations
    references = annotations['references']
    self.assertEqual(len(references), 2)
    first_reference = references[0]
    self.assertEqual(first_reference.authors, 'Jakob N.J., Mueller K., Bahr U., Darai G.')
    self.assertEqual(
        first_reference.title,
        'Analysis of the first complete DNA sequence of an invertebrate iridovirus: coding strategy of the genome of Chilo iridescent virus.'
    )
    self.assertEqual(first_reference.journal, 'Virology 286:182-196(2001)')
    self.assertEqual(
        first_reference.comment,
        'journal article | 2001 | Scope: NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] | '
    )

    # Database cross-references.
    dbxrefs = seq_record.dbxrefs
    self.assertEqual(len(dbxrefs), 11)
    self.assertEqual(dbxrefs[0], 'DOI:10.1006/viro.2001.0963')

    # Annotation values, checked in the listed order.
    expected_annotations = [
        ('sequence_length', 116),
        ('sequence_checksum', '4A29B35FB716523C'),
        ('modified', '2009-07-07'),
        ('accessions', ['Q91G55']),
        ('taxonomy', [
            'Viruses', 'dsDNA viruses, no RNA stage', 'Iridoviridae',
            'Iridovirus'
        ]),
        ('sequence_mass', 13673),
        ('dataset', 'Swiss-Prot'),
        ('gene_name_ORF', ['IIV6-043L']),
        ('version', 21),
        ('sequence_modified', '2001-12-01'),
        ('keywords', ['Complete proteome', 'Virus reference strain']),
        ('organism_host', [
            'Acheta domesticus', 'House cricket', 'Chilo suppressalis',
            'striped riceborer', 'Gryllus bimaculatus',
            'Two-spotted cricket', 'Gryllus campestris',
            'Spodoptera frugiperda', 'Fall armyworm'
        ]),
        ('created', '2009-06-16'),
        ('organism_name', ['Chilo iridescent virus']),
        ('organism', 'Invertebrate iridescent virus 6 (IIV-6)'),
        ('recommendedName_fullName', ['Uncharacterized protein 043L']),
        ('sequence_version', 1),
        ('proteinExistence', ['Predicted']),
    ]
    for key, expected in expected_annotations:
        self.assertEqual(annotations[key], expected)
def test_uni003(self):
    """Parsing Uniprot file uni003

    Reads ``SwissProt/uni003`` with the ``uniprot-xml`` parser and checks
    the general record entries, one literature reference, the plain
    annotations and the ``comment_*`` annotations.
    """
    filename = 'uni003'
    # test the record parser
    datafile = os.path.join('SwissProt', filename)
    # Use a context manager so the handle is closed even when parsing
    # raises (same pattern as test_uni001).
    with open(datafile) as test_handle:
        seq_record = SeqIO.read(test_handle, "uniprot-xml")
    self.assertTrue(isinstance(seq_record, SeqRecord))

    # test general record entries
    self.assertEqual(seq_record.id, "O44185")
    self.assertEqual(seq_record.name, "FLP13_CAEEL")
    self.assertEqual(seq_record.description, "FMRFamide-like neuropeptides 13")
    self.assertEqual(
        repr(seq_record.seq),
        "Seq('MMTSLLTISMFVVAIQAFDSSEIRMLDEQYDTKNPFFQFLENSKRSDRPTRAMD...GRK', ProteinAlphabet())"
    )
    self.assertEqual(len(seq_record.annotations['references']), 7)
    # Only the sixth reference (index 5) is inspected in detail.
    self.assertEqual(seq_record.annotations['references'][5].authors, 'Kim K., Li C.')
    self.assertEqual(
        seq_record.annotations['references'][5].title,
        'Expression and regulation of an FMRFamide-related '
        'neuropeptide gene family in Caenorhabditis elegans.')
    self.assertEqual(seq_record.annotations['references'][5].journal,
                     'J. Comp. Neurol. 475:540-550(2004)')
    self.assertEqual(
        seq_record.annotations['references'][5].comment,
        'journal article | 2004 | Scope: TISSUE SPECIFICITY, '
        'DEVELOPMENTAL STAGE | ')
    self.assertEqual(seq_record.annotations["accessions"], ['O44185'])
    self.assertEqual(seq_record.annotations["created"], "2004-05-10")
    self.assertEqual(seq_record.annotations["dataset"], "Swiss-Prot")
    self.assertEqual(seq_record.annotations["gene_name_ORF"], ['F33D4.3'])
    self.assertEqual(seq_record.annotations["gene_name_primary"], "flp-13")
    self.assertEqual(seq_record.annotations["keywords"], [
        'Amidation', 'Cleavage on pair of basic residues',
        'Complete proteome', 'Direct protein sequencing', 'Neuropeptide',
        'Reference proteome', 'Repeat', 'Secreted', 'Signal'
    ])
    self.assertEqual(seq_record.annotations["modified"], "2012-11-28")
    self.assertEqual(seq_record.annotations["organism"], "Caenorhabditis elegans")
    self.assertEqual(seq_record.annotations["proteinExistence"], ['evidence at protein level'])
    self.assertEqual(seq_record.annotations["recommendedName_fullName"],
                     ['FMRFamide-like neuropeptides 13'])
    self.assertEqual(seq_record.annotations["sequence_length"], 160)
    self.assertEqual(seq_record.annotations["sequence_checksum"], "BE4C24E9B85FCD11")
    self.assertEqual(seq_record.annotations["sequence_mass"], 17736)
    self.assertEqual(seq_record.annotations["sequence_modified"], "1998-06-01")
    self.assertEqual(seq_record.annotations["sequence_precursor"], "true")
    self.assertEqual(seq_record.annotations["sequence_version"], 1)
    self.assertEqual(seq_record.annotations["taxonomy"], [
        'Eukaryota', 'Metazoa', 'Ecdysozoa', 'Nematoda', 'Chromadorea',
        'Rhabditida', 'Rhabditoidea', 'Rhabditidae', 'Peloderinae',
        'Caenorhabditis'
    ])
    self.assertEqual(seq_record.annotations["type"], ['ECO:0000006', 'ECO:0000001'])
    self.assertEqual(seq_record.annotations["version"], 74)

    # test comment entries
    self.assertEqual(seq_record.annotations["comment_allergen"],
                     ['Causes an allergic reaction in human.'])
    self.assertEqual(
        seq_record.annotations["comment_alternativeproducts_isoform"],
        ['Q8W1X2-1', 'Q8W1X2-2'])
    self.assertEqual(seq_record.annotations["comment_biotechnology"], [
        'Green fluorescent protein has been engineered to produce a '
        'vast number of variously colored mutants, fusion proteins, '
        'and biosensors. Fluorescent proteins and its mutated allelic '
        'forms, blue, cyan and yellow have become a useful and '
        'ubiquitous tool for making chimeric proteins, where they '
        'function as a fluorescent protein tag. Typically they '
        'tolerate N- and C-terminal fusion to a broad variety of '
        'proteins. They have been expressed in most known cell types '
        'and are used as a noninvasive fluorescent marker in living '
        'cells and organisms. They enable a wide range of applications '
        'where they have functioned as a cell lineage tracer, reporter '
        'of gene expression, or as a measure of protein-protein '
        'interactions.', 'Can also be used as a molecular thermometer, '
        'allowing accurate temperature measurements in fluids. The '
        'measurement process relies on the detection of the blinking '
        'of GFP using fluorescence correlation spectroscopy.'
    ])
    self.assertEqual(seq_record.annotations["comment_catalyticactivity"], [
        'ATP + acetyl-CoA + HCO(3)(-) = ADP + phosphate + malonyl-CoA.',
        'ATP + biotin-[carboxyl-carrier-protein] + CO(2) = ADP + '
        'phosphate + carboxy-biotin-[carboxyl-carrier-protein].'
    ])
    self.assertEqual(seq_record.annotations["comment_caution"], [
        'Could be the product of a pseudogene. The existence of a '
        'transcript at this locus is supported by only one sequence '
        'submission (PubMed:2174397).'
    ])
    self.assertEqual(seq_record.annotations["comment_cofactor"], [
        'Biotin (By similarity).', 'Binds 2 manganese ions per '
        'subunit (By similarity).'
    ])
    self.assertEqual(
        seq_record.annotations["comment_developmentalstage"], [
            'Expressed from the comma stage of embryogenesis, during all '
            'larval stages, and in low levels in adults.'
        ])
    self.assertEqual(seq_record.annotations["comment_disease"], [
        'Defects in MC2R are the cause of glucocorticoid deficiency '
        'type 1 (GCCD1) [MIM:202200]; also known as familial '
        'glucocorticoid deficiency type 1 (FGD1). GCCD1 is an '
        'autosomal recessive disorder due to congenital '
        'insensitivity or resistance to adrenocorticotropin (ACTH). '
        'It is characterized by progressive primary adrenal '
        'insufficiency, without mineralocorticoid deficiency.'
    ])
    self.assertEqual(
        seq_record.annotations["comment_disruptionphenotype"], [
            'Mice display impaired B-cell development which does not '
            'progress pass the progenitor stage.'
        ])
    self.assertEqual(seq_record.annotations["comment_domain"], [
        'Two regions, an N-terminal (aa 96-107) and a C-terminal '
        '(aa 274-311) are required for binding FGF2.'
    ])
    self.assertEqual(seq_record.annotations["comment_enzymeregulation"], [
        'By phosphorylation. The catalytic activity is inhibited by '
        'soraphen A, a polyketide isolated from the myxobacterium '
        'Sorangium cellulosum and a potent inhibitor of fungal growth.'
    ])
    self.assertEqual(seq_record.annotations["comment_function"], [
        'FMRFamides and FMRFamide-like peptides are neuropeptides. '
        'AADGAPLIRF-amide and APEASPFIRF-amide inhibit muscle tension '
        'in somatic muscle. APEASPFIRF-amide is a potent inhibitor of '
        'the activity of dissected pharyngeal myogenic muscle system.'
    ])
    self.assertEqual(seq_record.annotations["comment_induction"], [
        'Repressed in presence of fatty acids. Repressed 3-fold by '
        'lipid precursors, inositol and choline, and also controlled '
        'by regulatory factors INO2, INO4 and OPI1.'
    ])
    self.assertEqual(
        seq_record.annotations["comment_interaction_intactId"],
        ['EBI-356720', 'EBI-746969', 'EBI-720116'])
    self.assertEqual(seq_record.annotations["comment_massspectrometry"],
                     ['88..98:1032|MALDI', '100..110:1133.7|MALDI'])
    self.assertEqual(
        seq_record.annotations["comment_miscellaneous"],
        ['Present with 20200 molecules/cell in log phase SD medium.'])
    self.assertEqual(
        seq_record.annotations["comment_onlineinformation"],
        ['NIEHS-SNPs@http://egp.gs.washington.edu/data/api5/'])
    self.assertEqual(seq_record.annotations["comment_pathway"], [
        'Lipid metabolism; malonyl-CoA biosynthesis; malonyl-CoA '
        'from acetyl-CoA: step 1/1.'
    ])
    self.assertEqual(seq_record.annotations["comment_RNAediting"], [
        'Partially edited. RNA editing generates receptor isoforms '
        'that differ in their ability to interact with the '
        'phospholipase C signaling cascade in a transfected cell '
        'line, suggesting that this RNA processing event may '
        'contribute to the modulation of serotonergic '
        'neurotransmission in the central nervous system.'
    ])
    self.assertEqual(
        seq_record.annotations["comment_PTM"],
        ['Acetylation at Lys-251 impairs antiapoptotic function.'])
    self.assertEqual(seq_record.annotations["comment_pharmaceutical"], [
        'Could be used as a possible therapeutic agent for treating '
        'rheumatoid arthritis.'
    ])
    self.assertEqual(seq_record.annotations["comment_polymorphism"], [
        'Position 23 is polymorphic; the frequencies in unrelated '
        'Caucasians are 0.87 for Cys and 0.13 for Ser.'
    ])
    self.assertEqual(
        seq_record.annotations["comment_similarity"],
        ['Belongs to the FARP (FMRFamide related peptide) family.'])
    self.assertEqual(
        seq_record.annotations["comment_subcellularlocation_location"],
        ['Secreted'])
    self.assertEqual(seq_record.annotations["comment_subunit"],
                     ['Homodimer.'])
    self.assertEqual(seq_record.annotations["comment_tissuespecificity"], [
        'Each flp gene is expressed in a distinct set of neurons. '
        'Flp-13 is expressed in the ASE sensory neurons, the DD motor '
        'neurons, the 15, M3 and M5 cholinergic pharyngeal '
        'motoneurons, and the ASG, ASK and BAG neurons.'
    ])
    self.assertEqual(seq_record.annotations["comment_toxicdose"], [
        'LD(50) is 50 ug/kg in mouse by intracerebroventricular '
        'injection and 600 ng/g in Blatella germanica.'
    ])
def test_Q13639(self):
    """Compare SwissProt text and uniprot XML versions of Q13639."""
    # Parse the same entry from its two on-disk representations.
    text_record = SeqIO.read("SwissProt/Q13639.txt", "swiss")
    xml_record = SeqIO.read("SwissProt/Q13639.xml", "uniprot-xml")
    # Text version goes first, XML version second.
    self.compare_txt_xml(text_record, xml_record)
def main():
    """Build a consensus genome for one FASTQ file and call variants on it.

    Steps, as visible in the code:

    1. Parse the command line and load the summary CSV into ``info_dict``
       (keyed by the second-to-last column of each row).
    2. For every sequence in the reference FASTA: write it to its own
       ``.ref`` file, build a first consensus with
       ``bam2cons_iter-lenient.sh``, BLAST that consensus, fetch the best
       hit (via ``blastdbcmd`` for a "Custom" database or Entrez for
       "NCBI") and build a second consensus against it.
    3. Append the second-round consensus records to one ``-genome.fa``
       file, then run bowtie2, samtools and lofreq on it.

    External programs are run through ``os.system`` with commands built by
    string concatenation; nothing is returned.
    """
    try:
        parser = cmdline_parser()
        (opts, args) = parser.parse_args()
        print opts, args
        if len(args):
            parser.error("Unrecognized arguments found: %s." % (' '.join(args)))
            sys.exit(1)
    # NOTE(review): a bare except also catches SystemExit, so if
    # parser.error() exits (as option parsers usually do) the run appears to
    # end with the help text and status 0 - confirm this is intended.
    except:
        parser.print_help()
        sys.exit(0)
    Entrez.email = opts.Entrez_email
    threads = str(3)
    # Tabular BLAST output format; the value carries its own single quotes.
    tab_fmt = "'6 qseqid sseqid qstart qend sstart send pident qcovs evalue bitscore stitle'"
    print opts.f1, args
    # Read the summary file:
    info_dict = {}
    with open(opts.csv_file, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for xi in reader:
            print xi
            # Rows are indexed by their second-to-last column.
            info_dict[xi[-2]] = xi
    print info_dict.keys()
    os.chdir(opts.fq_dir)
    # The last column of the matching summary row names the reference file.
    ref = opts.ref_dir + "/" + info_dict[opts.f1.replace("-Qc.fastq", "")][-1] + ".fa"
    if not os.path.exists(opts.out_dir):
        os.mkdir(opts.out_dir)
    res_dir = opts.out_dir + "/" + opts.f1.replace("-Qc.fastq", "")
    if not os.path.exists(res_dir):
        os.mkdir(res_dir)
    # Set to 1 once at least one segment produced a second-round consensus.
    flag = 0
    for x in SeqIO.parse(open(ref, "r"), "fasta"):
        # One reference file per record; "_", "|" and "/" are stripped from
        # the record id to form the file name.
        split_ref = res_dir + "/" + x.id.replace("_", "").replace("|", "").replace("/", "") + ".ref"
        with open(split_ref, "w") as ff:
            ff.write(">" + x.id + "\r\n" + str(x.seq) + "\r\n")
        # Map reads to a reference genome...
        cons_ref = res_dir + "/" + opts.f1.replace(
            "-Qc.fastq",
            x.id.replace("_", "").replace("|", "").replace("/", "") + ".fa")
        cons_cmd = "bam2cons_iter-lenient.sh -f " + opts.f1 + " -r " + split_ref + " -t " + threads + " --force -o " + cons_ref
        print cons_cmd
        # An existing consensus file is reused instead of being rebuilt.
        if not os.path.isfile(cons_ref):
            os.system(cons_cmd)
        cons = SeqIO.parse(cons_ref, "fasta")
        nee = [xl for xl in cons]
        # X collects consensus sequences that keep more than 30 characters
        # once every "N" is removed.
        if len(nee) != 0:
            X = []
            for xl in nee:
                nnn = str(xl.seq).replace("N", "")
                if len(nnn) > 30:
                    X.append(nnn)
        else:
            X = []
        print "NEE", nee, X
        if len(nee) != 0 and len(X) != 0:
            # Blast the consensus formed....
            b_out = res_dir + "/" + opts.f1.replace(".fastq", ".tmp")
            blast_run_uni = NcbiblastnCommandline(cmd=opts.blast_n, task="megablast", db=opts.db, max_target_seqs=1, query=cons_ref, outfmt=tab_fmt, out=b_out)
            print "Runnin blast...", blast_run_uni
            nearest_ref1 = cons_ref.replace(".fa", "-nref1.fa")
            if not os.path.isfile(nearest_ref1):
                stdout, stderr = blast_run_uni()
                c = 0
                # Only the first line of the BLAST output is used.
                for line in open(b_out, "r"):
                    c = c + 1
                    if c == 1:
                        u_id = line.split("|")
                        custom = line.split("\t")
                        print "u_id", u_id, custom
                        if opts.database_type == "Custom":
                            cmd_cus = "blastdbcmd -entry '" + custom[1] + "' -db " + opts.db + " > " + nearest_ref1
                            print cmd_cus
                            os.system(cmd_cus)
                        elif opts.database_type == "NCBI":
                            # presumably u_id[3] is the accession field of a
                            # "|"-delimited subject id - verify against the
                            # BLAST database in use
                            handle = Entrez.efetch(
                                db="nuccore",
                                id=u_id[3],
                                rettype="gb",
                                retmode="text",
                                idtype="acc")  # Use the PrimaryID instead of GI
                            #handle = Entrez.efetch(db="nucleotide", id="AY851612", rettype="gb", retmode="xml")
                            #handle = "elink -db nuccore -query " + u_id[3] + " |efetch -format gb "
                            #print handle
                            record = SeqIO.read(handle, 'genbank')
                            #record = Entrez.read(handle) #, validate=False)
                            #print record
                            with open(nearest_ref1, "w") as f:
                                #f.write(">" + record[0]["GBSeq_primary-accession"] + "\r\n" + record[0]["GBSeq_sequence"] + "\r\n")
                                f.write(">" + record.id + "\r\n" + str(record.seq) + "\r\n")
            # 2-iteration Map reads to nearest reference...
            # IonXpress_012-Qc.fastq
            ddd = opts.f1.replace("-Qc.fastq", "")
            cons_ref2 = res_dir + "/" + info_dict[ddd][0] + "-" + x.id + "cons2.fa"
            cons_cmd2 = "bam2cons_iter-lenient.sh -f " + opts.f1 + " -r " + nearest_ref1 + " -t " + threads + " --force -o " + cons_ref2
            print cons_cmd2
            if not os.path.isfile(cons_ref2):
                os.system(cons_cmd2)
            # combine all segments into one ref...
            flag = 1
            Cons_ref_ful = res_dir + "/" + info_dict[ddd][0] + "-genome.fa"
            # The genome file is opened in append mode, so records from
            # every segment accumulate in the same file.
            for i in SeqIO.parse(open(cons_ref2, "r"), "fasta"):
                with open(Cons_ref_ful, "a") as f:
                    print i
                    f.write(">" + info_dict[ddd][1].replace(" ", "") + "|" + i.id.split("-")[1].replace("cons2", "") + "|" + "\r\n" + str(i.seq) + "\r\n")
        else:
            print "No Coverage :( "
    # Map reads to consensus genome.... Bowtie2
    if flag == 1:
        cons_ref2_indx = "bowtie2-build " + Cons_ref_ful + " tmp_idx"
        os.system(cons_ref2_indx)
        out_sam = Cons_ref_ful.replace(".fa", ".sam")
        bowtie2_cmd = "bowtie2 --local --fast-local -x tmp_idx -U " + opts.f1 + " -S " + out_sam + " -p 8"
        os.system(bowtie2_cmd)
        # run lofreq2....
        bam_cmd = "samtools view -b -S " + out_sam + " > " + out_sam.replace(".sam", ".bam")
        bam_sort = "samtools sort " + out_sam.replace(".sam", ".bam") + " " + out_sam.replace(".sam", "-sort")
        cmd = "lofreq call -C 100 -f " + Cons_ref_ful + " -o " + out_sam.replace(".sam", "-snps.vcf") + " " + out_sam.replace(".sam", "-sort.bam")  # 100 is min depth required to call SNPs
        print bam_cmd
        os.system(bam_cmd)
        print bam_sort
        os.system(bam_sort)
        print cmd
        os.system(cmd)
        print "\n\n"
    else:
        print "Not this strain..."
for hit in self.rc_hits: # hit[0] x coordinate, hit[3] list of y coordinate x = [hit[0]] * len(hit[3]) plt.scatter(x, hit[3]) for aligned_hit in self.rc_chain: x = [aligned_hit[0]] * len(aligned_hit[3]) plt.scatter(x, aligned_hit[3], edgecolors="black", linewidths=2) plt.show() if __name__ == '__main__': # record1 = SeqIO.read("D:/Data/20170627/missing/missing_query_025.fasta", "fasta") # record2 = SeqIO.read("D:/Data/20170627/missing/missing_target_025.fasta", "fasta") # record1 = SeqIO.read("D:/Data/20170622/9mer_FP/FP_query_002.fasta", "fasta") # record2 = SeqIO.read("D:/Data/20170622/9mer_FP/FP_target_002.fasta", "fasta") record1 = SeqIO.read("D:/Data/20170706/FP_dustboth/FP_query_100.fasta", "fasta") record2 = SeqIO.read("D:/Data/20170706/FP_dustboth/FP_target_100.fasta", "fasta") test_filter = PseudoBloomFilter.PseudoBloomFilter(record2, 9, 54) print test_filter.L test_filter.generate_filter() test_query = QuerySeq(record1) test_query.check_kmer(test_filter) # print(test_query.fw_hits) # print(test_query.rc_hits) test_query.cluster_hits(size_threshold=3, debug=True, group_hit=1.0) print test_query.chain_align print test_query.aligned test_query.plot()
def parse_fasta():
    """Read the FASTA file named by ``args.fasta`` and return its record.

    ``args`` is taken from the enclosing scope, not passed in.
    """
    fasta_record = SeqIO.read(args.fasta, 'fasta')
    return fasta_record
def main():
    """Collate per-isolate hit tables into one presence/absence table.

    Steps, as visible in the code:

    1. Convert the reference GenBank file to FASTA, build a BLAST database
       and locate the query in the reference (``get_ref_positions``).
    2. Read every table passed on the command line, skipping its first
       line, and merge each hit into ``list_of_positions``: exact matches
       update the existing position, otherwise ``check_ranges`` /
       ``check_ranges_tol`` decide whether the hit extends a known position.
    3. Keep positions seen in more than ``args.drop`` isolates, report
       positions whose recorded coordinates spread too far, and look up the
       flanking genes of each position.
    4. Write a tab-separated table to ``args.output``: one row for the
       reference, one per isolate, then orientation and flanking-gene rows.

    Each isolate is marked '?', '*' or '+' depending on which of those
    characters appears in the call column; '-' is written when an isolate
    has no entry for a position.
    """
    start_time = time.time()
    args = parse_args()
    # De-duplicate the input tables while keeping their order.
    unique_results_files = list(OrderedDict.fromkeys(args.tables))
    list_of_isolates = []
    # key1 = (start, end), key2 = isolate, value = +/*/?
    #list_of_positions = collections.defaultdict(dict)
    list_of_positions = []
    # key1 = (start, end), key2 = ref, value = +
    #list_of_ref_positions = collections.defaultdict(dict)
    # key = (start, end), value = orientation (F/R)
    #position_orientation = {}
    # Everything before the first '.g' of the GenBank file name.
    reference_fasta = args.reference_gbk.split('.g')[0]
    # Create a fasta file of the reference for BLAST
    print 'Creating fasta file and database of reference ...'
    gbk_to_fasta(args.reference_gbk, reference_fasta)
    # Make a BLAST database
    blast_db(reference_fasta)
    # Get the reference positions and orientations for this IS query
    print '\nGetting query positions in reference ...'
    list_of_positions, ref_name = get_ref_positions(reference_fasta, args.seq, list_of_positions)
    elapsed_time = time.time() - start_time
    print 'Time taken: ' + str(elapsed_time)
    #print list_of_positions
    #print ref_name
    # Loop through each table given to --tables
    print 'Collating results files ...'
    for result_file in unique_results_files:
        # Get isolate name
        isolate = result_file.split('_table.txt')[0]
        list_of_isolates.append(isolate)
        # Skip the header
        header = 0
        with open(result_file) as file_open:
            for line in file_open:
                # Skip header
                if header == 0:
                    header += 1
                # Check to make sure there were actually hits
                elif 'No hits found' not in line and line != '':
                    info = line.strip('\n').split('\t')
                    # Get orientation for hit and start/end coordinates
                    orientation = info[1]
                    is_start = min(int(info[2]), int(info[3]))
                    is_end = max(int(info[3]), int(info[2]))
                    # Note whether call is Known, Novel or Possible related IS
                    call = info[5]
                    # See if this position is already in the list of positions
                    match = False
                    isolate_dict = {}
                    for pos in list_of_positions:
                        if pos.x == is_start and pos.y == is_end and pos.orientation == orientation:
                            # Then this position already exists
                            match = True
                            # And we want to retrieve the position to which it is exactly the same
                            matching_pos = pos
                            # Then we want to add the info about this new position to the list
                            if '?' in call:
                                matching_pos.isolate_dict[isolate] = '?'
                            elif '*' in call:
                                matching_pos.isolate_dict[isolate] = '*'
                            else:
                                matching_pos.isolate_dict[isolate] = '+'
                    # So we haven't seen this position before
                    if match == False:
                        # The position list is empty, so there's nothing to check against, so just add
                        # this new position
                        if list_of_positions == []:
                            if '?' in call:
                                isolate_dict[isolate] = '?'
                            elif '*' in call:
                                isolate_dict[isolate] = '*'
                            else:
                                isolate_dict[isolate] = '+'
                            new_pos = Position(is_start, is_end, orientation, isolate_dict, call, None, None)
                            list_of_positions.append(new_pos)
                        # If the list of positions isn't empty, then there are ranges to check against
                        else:
                            # A tolerance of -1 selects gap-based merging;
                            # any other value selects tolerance-based merging.
                            if args.tolerance == -1:
                                old_position, new_range = check_ranges(list_of_positions, (is_start, is_end), args.gap, orientation, call, isolate)
                            else:
                                old_position, new_range = check_ranges_tol(list_of_positions, (is_start, is_end), args.tolerance, orientation, call, isolate)
                            # So the current range overlaps with a range we already have
                            if old_position != False:
                                isolate_dict = old_position.isolate_dict
                                # Add the new isolate to this dictionary
                                # Mark as ? if uncertain, * if imprecise
                                # or + if confident
                                if '?' in call:
                                    isolate_dict[isolate] = '?'
                                elif '*' in call:
                                    isolate_dict[isolate] = '*'
                                else:
                                    isolate_dict[isolate] = '+'
                                # Remove the old position from the list
                                list_of_positions.remove(old_position)
                                # Create the new position and add it
                                new_pos = Position(new_range[0], new_range[1], orientation, isolate_dict, call, None, None, old_position.xs, old_position.ys, is_start, is_end)
                                list_of_positions.append(new_pos)
                            # Otherwise this range hasn't been seen before, so all values are False
                            else:
                                if '?' in call:
                                    isolate_dict[isolate] = '?'
                                elif '*' in call:
                                    isolate_dict[isolate] = '*'
                                else:
                                    isolate_dict[isolate] = '+'
                                new_pos = Position(is_start, is_end, orientation, isolate_dict, call, None, None)
                                list_of_positions.append(new_pos)
    elapsed_time = time.time() - start_time
    print 'Time taken: ' + str(elapsed_time)
    # Drop positions recorded for args.drop isolates or fewer.
    list_of_positions = [p for p in list_of_positions if len(p.isolate_dict) > args.drop]
    print 'Positions: ' + str(len(list_of_positions))
    #Check if all hits in every position within tolerance
    num_bad_positions = 0
    if args.tolerance > 0 or args.gap > 0:
        max_delta = 2*(args.tolerance if args.tolerance > 0 else args.gap)
        for p in list_of_positions:
            bad = False
            if max(p.xs) - min(p.xs) > max_delta:
                print "Inconsistensy x position in ", (p.x, p.y), " len ", len(p.xs), " delta ", max(p.xs) - min(p.xs)
                bad = True
            if max(p.ys) - min(p.ys) > max_delta:
                print "Inconsistensy y position in ", (p.x, p.y), " len ", len(p.xs), " delta ", max(p.ys) - min(p.ys)
                bad = True
            # bad is a bool, so this adds 0 or 1.
            num_bad_positions += bad
    print "Total bad positions ", num_bad_positions
    # Get the flanking genes for each position now they've all been merged
    print 'Getting flanking genes for each position (this step is the longest and could take some time) ...'
    # key = (start, end), value = [left_gene, right_gene]
    # NOTE(review): position_genes is never used in this function.
    position_genes = {}
    # Get feature list
    gb = SeqIO.read(args.reference_gbk, "genbank")
    feature_list = []
    feature_count = 0
    feature_types = ["CDS", "tRNA", "rRNA"]
    # feature_count is the index of the feature within gb.features; it
    # advances for every feature, whether or not the feature is kept.
    for feature in gb.features:
        if feature.type in feature_types:
            feature_list.append([int(feature.location.start), int(feature.location.end), feature_count])
            feature_count += 1
        else:
            feature_count += 1
    # Sort the list just in case it's out of order (has caused issues in the past!!)
    feature_list = sorted(feature_list, key=itemgetter(0))
    # Get flanking genes
    for pos in list_of_positions:
        genes_before, genes_after = get_flanking_genes(gb.features, feature_list, pos.x, pos.y, args.cds, args.trna, args.rrna, len(gb.seq))
        pos.left_feature = genes_before
        pos.right_feature = genes_after
    elapsed_time = time.time() - start_time
    print 'Time taken: ' + str(elapsed_time)
    # Order positions from smallest to largest for final table output
    list_of_positions.sort(key=lambda x: x.x)
    # Write out table
    print 'Writing output table to ' + args.output + ' ...'
    with open(args.output, 'w') as out:
        header = ['isolate']
        # Column labels are "x-y" for orientation 'F' and "y-x" otherwise.
        for pos in list_of_positions:
            if pos.orientation == 'F':
                header.append(str(pos.x) + '-' + str(pos.y))
            else:
                header.append(str(pos.y) + '-' + str(pos.x))
        out.write('\t'.join(header) + '\n')
        # Add the values for the reference positions
        row = [ref_name]
        for pos in list_of_positions:
            if ref_name in pos.isolate_dict.keys():
                row.append(pos.isolate_dict[ref_name])
            else:
                row.append('-')
        out.write('\t'.join(row) + '\n')
        # Loop through each isolate
        # and create each row
        for isolate in list_of_isolates:
            row = [isolate]
            for pos in list_of_positions:
                if isolate in pos.isolate_dict.keys():
                    row.append(pos.isolate_dict[isolate])
                else:
                    row.append('-')
            out.write('\t'.join(row) + '\n')
        # Set up flanking genes
        row_orientation = ['orientation']
        row_l_locus = ['left ID']
        row_r_locus = ['right ID']
        row_l_dist = ['left distance']
        row_r_dist = ['right distance']
        row_l_strand = ['left strand']
        row_r_strand = ['right strand']
        row_l_prod = ['left info']
        row_r_prod = ['right info']
        # Print orientation and flanking genes for each position
        # presumably each *_feature is (ID, distance, info) with the strand
        # as the last item of info - verify against get_flanking_genes
        for pos in list_of_positions:
            row_orientation.append(pos.orientation)
            row_l_locus.append(pos.left_feature[0])
            row_r_locus.append(pos.right_feature[0])
            row_l_dist.append(pos.left_feature[1])
            row_r_dist.append(pos.right_feature[1])
            row_l_strand.append(pos.left_feature[2][-1])
            row_r_strand.append(pos.right_feature[2][-1])
            row_l_prod.append(pos.left_feature[2])
            row_r_prod.append(pos.right_feature[2])
        # NOTE(review): only the info rows are passed through str(); the
        # other rows are joined directly, which assumes their values are
        # already strings - confirm.
        out.write('\t'.join(row_orientation) + '\n')
        out.write('\t'.join(row_l_locus) + '\n')
        out.write('\t'.join(row_l_dist) + '\n')
        out.write('\t'.join(row_l_strand) + '\n')
        out.write('\t'.join(str(i) for i in row_l_prod) + '\n')
        out.write('\t'.join(row_r_locus) + '\n')
        out.write('\t'.join(row_r_dist) + '\n')
        out.write('\t'.join(row_r_strand) + '\n')
        out.write('\t'.join(str(i) for i in row_r_prod) + '\n')
    elapsed_time = time.time() - start_time
    print 'Table compilation finished in ' + str(elapsed_time)
def main():
    """Build and run the phage simulation from the NC_001604 GenBank record.

    The record is downloaded through Entrez, its regulatory elements, genes
    and ``misc_structure`` features are added to a ``pt.Genome``, per-base
    weights are accumulated over all CDS features, and the model is then
    populated with polymerases, species and reactions and run until
    ``stop_time=1500``.  Output files use the prefix ``degrade_test3``.
    """
    sim = pt.Model(cell_volume=CELL_VOLUME)

    # Download T7 wild-type genbank records
    Entrez.email = "*****@*****.**"
    handle = Entrez.efetch(db="nuccore", id=["NC_001604"], rettype="gb", retmode="text")
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even if parsing fails.
        handle.close()
    genome_length = len(record.seq)
    phage = pt.Genome(name="phage", length=genome_length, transcript_degradation_rate=1e-3)
    #phage = pt.Genome(name="phage", length=genome_length)

    # One weight per base.  Initialised once, before the loop, so that the
    # weights returned by compute_cds_weights accumulate across CDS features
    # (previously the list was reset on every iteration, discarding them).
    weights = [0.0] * len(record.seq)

    for feature in record.features:
        # Convert to inclusive genomic coordinates
        start = feature.location.start.position + 1
        stop = feature.location.end.position
        name = ''
        if "note" in feature.qualifiers:
            name = feature.qualifiers["note"][0]
        # Grab promoters and terminators
        if feature.type == "regulatory":
            if name in IGNORE_REGULATORY:
                continue
            # Construct promoter
            if "promoter" in feature.qualifiers["regulatory_class"]:
                length = stop - start
                # Promoters shorter than 35 are extended upstream by 35.
                if length < 35:
                    start = start - 35
                interactions = get_promoter_interactions(name)
                phage.add_promoter(name, start, stop, interactions)
            # Construct terminator params
            if "terminator" in feature.qualifiers["regulatory_class"]:
                interactions = get_terminator_interactions(name)
                phage.add_terminator(name, start, stop, interactions)
        # Grab genes/CDSes
        if feature.type == "gene":
            if name in IGNORE_GENES:
                continue
            if name in RELABEL_GENES:
                name = RELABEL_GENES[name]
            # Construct CDS parameters for this gene; the ribosome binding
            # site covers the 30 positions upstream of the gene start.
            phage.add_gene(name=name, start=start, stop=stop,
                           rbs_start=start - 30, rbs_stop=start,
                           rbs_strength=1e7)
        if feature.type == "CDS":
            weights = compute_cds_weights(record, feature, 1.0, weights)
        if feature.type == "misc_structure":
            print(feature.qualifiers)
            phage.add_rnase_site(start=start, stop=start + 10)
            print(start, stop, name)

    mask_interactions = ["rnapol-1", "rnapol-3.5",
                         "ecolipol", "ecolipol-p", "ecolipol-2", "ecolipol-2-p"]
    phage.add_mask(500, mask_interactions)

    norm_weights = normalize_weights(weights)
    phage.add_weights(norm_weights)

    sim.register_genome(phage)

    sim.add_polymerase("rnapol-1", 35, 230, 0)
    sim.add_polymerase("rnapol-3.5", 35, 230, 0)
    sim.add_polymerase("ecolipol", 35, 45, 0)
    sim.add_polymerase("ecolipol-p", 35, 45, 0)
    sim.add_polymerase("ecolipol-2", 35, 45, 0)
    sim.add_polymerase("ecolipol-2-p", 35, 45, 0)

    sim.add_polymerase("ribosome", 30, 30, 0)

    sim.add_species("bound_ribosome", 10000)

    sim.add_species("bound_ecolipol", 1800)
    sim.add_species("bound_ecolipol_p", 0)
    sim.add_species("ecoli_genome", 0)
    sim.add_species("ecoli_transcript", 0)

    sim.add_reaction(1e6, ["ecoli_transcript", "ribosome"], ["bound_ribosome"])

    sim.add_reaction(0.04, ["bound_ribosome"], ["ribosome", "ecoli_transcript"])

    sim.add_reaction(0.001925, ["ecoli_transcript"], ["degraded_transcript"])

    sim.add_reaction(1e7, ["ecolipol", "ecoli_genome"], ["bound_ecolipol"])

    sim.add_reaction(0.3e7, ["ecolipol-p", "ecoli_genome"], ["bound_ecolipol_p"])

    sim.add_reaction(0.04, ["bound_ecolipol"],
                     ["ecolipol", "ecoli_genome", "ecoli_transcript"])

    sim.add_reaction(0.04, ["bound_ecolipol_p"],
                     ["ecolipol-p", "ecoli_genome", "ecoli_transcript"])

    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol"],
                     ["ecolipol-p", "protein_kinase-0.7"])

    sim.add_reaction(3.8e7, ["protein_kinase-0.7", "ecolipol-2"],
                     ["ecolipol-2-p", "protein_kinase-0.7"])

    sim.add_reaction(3.8e7, ["gp-2", "ecolipol"], ["ecolipol-2"])

    sim.add_reaction(3.8e7, ["gp-2", "ecolipol-p"], ["ecolipol-2-p"])

    sim.add_reaction(1.1, ["ecolipol-2-p"], ["gp-2", "ecolipol-p"])

    sim.add_reaction(1.1, ["ecolipol-2"], ["gp-2", "ecolipol"])

    sim.add_reaction(3.8e9, ["lysozyme-3.5", "rnapol-1"], ["rnapol-3.5"])

    sim.add_reaction(3.5, ["rnapol-3.5"], ["lysozyme-3.5", "rnapol-1"])

    # Fixed seed for the run.
    sim.seed(72)

    # sim.run(stop_time=1500, time_step=5, output_prefix="test")
    sim.run(stop_time=1500, time_step=5, output_prefix="degrade_test3")
#! /usr/bin/env python # gb2tbl.py #This script converts a genbank flat file to a features table suitable for use with Sequin. #Usage gb2tbl.py <genbank flatfile name> #Writes to standard output so redirect to a file if desired #Aaron M. Duffy aduffy70{at}gmail.com #May 2010 from Bio import SeqIO # tools for parsing genbank files from sys import argv # a list of command line arguments import re # tools for working with regular expressions #Read the genbank flat file gbFile = open(argv[1], 'r') gbRecord = SeqIO.read(gbFile, 'genbank') #Print the header row print ">Feature gb|%s|" % gbRecord.name #Setup a pattern match to filter out "Geneious name:" lines pattern = re.compile('Geneious name') #Format and print each feature except the first one (it is summary data for the whole sequence) for feature in gbRecord.features[1:]: if (len(feature.sub_features) > 0): # Handle features with no subfeatures firstSubFeature = True orderedSubfeatures = feature.sub_features # for subfeature in orderedSubfeatures: if (subfeature.strand == -1): # reverse strand start = subfeature.location.nofuzzy_end stop = subfeature.location.nofuzzy_start + 1 # adjust for the python 0-index
if seq not in cache: # iterate over range by 2's as we don't want odd lengths tmp = [] for k in range(1, len(seq), 2): ''' Multiply first half of the string * the first nt and ending nt of first half * second half This multiplication is to combine the number of noncrossing perfect matches from the subproblems. The actual value/counts comes from the dynamically generated dictionary. ''' tmp.append(countRNA2Structures(seq[1:k]) * cache[seq[0]+seq[k]] * countRNA2Structures(seq[k+1:])) # assign current sequence into dictionary for later use cache[seq] = sum(tmp) return cache[seq] if __name__ == "__main__": from Bio import SeqIO f = open("/Rosalind/data/rosalind_cat.txt", 'r') raw = SeqIO.read(f, "fasta") f.close() rna = str(raw.seq) # set up initial dictionary for number of matches for the sequence cache = {'':1, 'A':0, 'C':0, 'G':0, 'U':0, 'AA':0, 'AC':0, 'AG':0, 'AU':1, 'CA':0, 'CC':0, 'CG':1, 'CU':0, 'GA':0, 'GC':1, 'GG':0, 'GU':0, 'UA':1, 'UC':0, 'UG':0, 'UU':0} print countRNA2Structures(rna) % 10**6
print('Resetting ClusterCAD database.') [cluster.delete() for cluster in pks.models.Cluster.objects.all()] print('ClusterCAD database reset.') # Assumes that chemical structures have already been aggregated allknowncompounds = pickle.load( open('./data/compounds/all_known_products.p', 'rb')) #for accession in ['BGC0000031']: # Debug with Borreledin for accession in mibigaccessions: # Use accession number to get paths to MIBiG and antiSMASH files mibigfile = os.path.join(mibigpath, accession + '.json') clusterfile = os.path.join(antismashpath, accession + '.embl') # Read antiSMASH annotations for cluster record = SeqIO.read(clusterfile, "embl") description = record.description.replace(' biosynthetic gene cluster', '') # Get compound information try: compound = allknowncompounds[accession] # If compound is missing, we skip the cluster except KeyError: print('Missing compound %s: %s.' % (accession, description)) continue knownproductsmiles = compound[0][0] knownproductsource = compound[1] # Enter information in ClusterCAD database try: cluster = pks.models.Cluster(
sys.exit(0)
# NOTE(review): the exit above looks like the tail of a guard that starts
# before this excerpt - keep it nested under that guard.

# Arguments: input table, the table value to keep, output file.
table_f, need_table, out_f = sys.argv[1:4]

Entrez.email = "*****@*****.**"

# Both files are closed on exit from the block, including on errors.
with open(table_f, 'r') as fh_in, open(out_f, 'w') as fh_out:
    for gi_l in fh_in:
        gi_l = gi_l.rstrip()
        # Each input line holds exactly two tab-separated fields.
        gi, table = gi_l.split("\t")
        if table != need_table:
            continue

        # "retmode" was misspelled "retmote", so the text return mode was
        # never actually requested.
        handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=gi)
        try:
            seq_record = SeqIO.read(handle, "gb")
        finally:
            handle.close()

        # One output line per record: the id, then the taxonomy entries,
        # all tab-separated.
        line = "\t".join(seq_record.annotations['taxonomy'])
        print(gi, line, sep="\t", file=fh_out)

        # do not post requests more than 3 times per second,
        # or your IP will be blocked by NCBI!!
        time.sleep(0.5)
def process_fast5( oper, db, connection_pool, args, ref_fasta_hash, dbcheckhash, filepath, hdf, dbname, cursor, ): try: checksum = hashlib.md5(open(filepath, 'rb').read()).hexdigest() except: err_string = "process_fast5(): error checksum ", filepath print >> sys.stderr, err_string sys.exit() # print checksum, type(checksum) # ## find the right basecall_2D location, get configuaration genral data, and define the basename. """basecalltype = 'Basecall_1D_CDNA' basecalltype2 = 'Basecall_2D' basecalldir = '' basecalldirconfig = '' # print "REF", ref_fasta_hash for x in range(0, 9): string = '/Analyses/%s_00%s/Configuration/general' \ % (basecalltype, x) if string in hdf: basecalldir = '/Analyses/%s_00%s/' % (basecalltype, x) basecalldirconfig = string break string = '/Analyses/%s_00%s/Configuration/general' \ % (basecalltype2, x) if string in hdf: basecalldir = '/Analyses/%s_00%s/' % (basecalltype2, x) basecalldirconfig = string break """ file_type = check_read_type(filepath, hdf) #print "FILETYPE is", file_type if file_type == 2: basecalltype = "Basecall_1D" #ML basecalltype2 = "Basecall_2D" basecalldir = '' basecalldirconfig = '' basecallindexpos = '' #ML string2 = '' #ML for x in range(0, 9): string2 = '/Analyses/Hairpin_Split_00%s/Configuration/general' % ( x) #ML if (string2 in hdf): basecallindexpos = x #ml #print "BASECALLINDEXPOS",basecallindexpos basecalldirconfig = string2 #ML string = '/Analyses/%s_00%s/Configuration/general' % (basecalltype, basecallindexpos) #print string if (string in hdf): # print "YES 1" basecalldir = '/Analyses/%s_00%s/' % (basecalltype, basecallindexpos) #basecallindexpos=x #ml #break string = '/Analyses/%s_00%s/Configuration/general' % (basecalltype2, basecallindexpos) #print string if (string2 in hdf): #print "YES 2" basecalldir = '/Analyses/%s_00%s/' % (basecalltype2, basecallindexpos) #basecalldirconfig=string2 #ML #break if file_type == 1: basecalltype = 'Basecall_1D_CDNA' basecalltype2 = 'Basecall_2D' basecalldir = '' 
basecalldirconfig = '' basecallindexpos = '' for x in range(0, 9): string = '/Analyses/%s_00%s/Configuration/general' \ % (basecalltype, x) if string in hdf: basecalldir = '/Analyses/%s_00%s/' % (basecalltype, x) basecalldirconfig = string basecallindexpos = x break string = '/Analyses/%s_00%s/Configuration/general' \ % (basecalltype2, x) if string in hdf: basecalldir = '/Analyses/%s_00%s/' % (basecalltype2, x) basecalldirconfig = string basecallindexpos = x break configdata = hdf[basecalldirconfig] basename = configdata.attrs[ 'basename'] # = PLSP57501_17062014lambda_3216_1_ch101_file10_strand # # get all the tracking_id data, make primary entry for basename, and get basenameid tracking_id_fields = [ 'basename', 'asic_id', 'asic_id_17', 'asic_id_eeprom', 'asic_temp', 'device_id', 'exp_script_purpose', 'exp_script_name', 'exp_start_time', 'flow_cell_id', 'heatsink_temp', 'hostname', 'run_id', 'version_name', ] tracking_id_hash = make_hdf5_object_attr_hash( args, hdf['/UniqueGlobalKey/tracking_id'], tracking_id_fields) tracking_id_hash.update({ 'basename': basename, 'file_path': filepath, 'md5sum': checksum }) hdf5object = hdf['/UniqueGlobalKey/channel_id'] # print "Got event location" for x in ('channel_number', 'digitisation', 'offset', 'sampling_rate'): if x in hdf5object.attrs.keys(): value = str(hdf5object.attrs[x]) # print x, value tracking_id_hash.update({x: value}) # range is a specifal case: # for x in ('range'): # if (x in hdf5object.attrs.keys() ): # value=str(hdf5object.attrs[x]) # print x, value # tracking_id_hash.update({'range_val ':value}) passcheck = 0 if '/pass/' in filepath: passcheck = 1 if '\\pass\\' in filepath: passcheck = 1 tracking_id_hash.update({'pass': passcheck}) basenameid = mysql_load_from_hashes(db, cursor, 'tracking_id', tracking_id_hash) # # get all the data from Configuration/general, then add Event Detection mux pore number general_fields = [ 'basename', 'local_folder', 'workflow_script', 'workflow_name', 'read_id', 'use_local', 
'tag', 'model_path', 'complement_model', 'max_events', 'input', 'min_events', 'config', 'template_model', 'channel', 'metrichor_version', 'metrichor_time_stamp', ] general_hash = make_hdf5_object_attr_hash(args, configdata, general_fields) general_hash.update({'basename_id': basenameid}) if (len(basecalldir) > 0): #ML metrichor_info = hdf[basecalldir] #ML try: general_hash.update({ 'metrichor_version': metrichor_info.attrs['chimaera version'], 'metrichor_time_stamp': metrichor_info.attrs['time_stamp'] }) #ML except: general_hash.update({ 'metrichor_version': metrichor_info.attrs['version'], 'metrichor_time_stamp': metrichor_info.attrs['time_stamp'] }) #ML else: #ML general_hash.update({ 'metrichor_version': 'N/A', 'metrichor_time_stamp': '' }) #ML # # get event detection for the read; define mux pore nuber eventdectionreadstring = \ '/Analyses/EventDetection_000/Reads/Read_%s' \ % general_hash['read_id'] if eventdectionreadstring in hdf: hdf5object = hdf[eventdectionreadstring] # print "Got event location" for x in ( 'start_mux', 'end_mux', 'abasic_event_index', 'abasic_found', 'abasic_peak_height', 'duration', 'hairpin_event_index', 'hairpin_found', 'hairpin_peak_height', 'hairpin_polyt_level', 'median_before', 'read_number', 'scaling_used', 'start_time', ): if x in hdf5object.attrs.keys(): value = str(hdf5object.attrs[x]) # print x, value general_hash.update({x: value}) # Specific to catch read_id as different class: for x in 'read_id': if x in hdf5object.attrs.keys(): value = str(hdf5object.attrs[x]) # print 'read_name', value general_hash.update({'read_name': value}) # Add pass flag to general_hash general_hash.update({'pass': passcheck}) general_hash.update( {'exp_start_time': tracking_id_hash['exp_start_time']}) general_hash.update({ '1minwin': int(hdf5object.attrs['start_time'] / float(tracking_id_hash['sampling_rate']) / 60) }) # '1minwin':int(template_start/(60)) general_hash.update({ '5minwin': int(hdf5object.attrs['start_time'] / 
float(tracking_id_hash['sampling_rate']) / 60 / 5) }) # '1minwin':int(template_start/(60)) general_hash.update({ '10minwin': int(hdf5object.attrs['start_time'] / float(tracking_id_hash['sampling_rate']) / 60 / 10) }) # '1minwin':int(template_start/(60)) general_hash.update({ '15minwin': int(hdf5object.attrs['start_time'] / float(tracking_id_hash['sampling_rate']) / 60 / 15) }) # '1minwin':int(template_start/(60)) # if ('start_mux' in hdf5object.attrs.keys() ): # start_mux=str(hdf5object.attrs['start_mux']) # print "start_mux", start_mux # general_hash.update({'start_mux':start_mux}) # if ('end_mux' in hdf5object.attrs.keys() ): # stop_mux=str(hdf5object.attrs['end_mux']) # print "stop_mux", stop_mux # general_hash.update({'end_mux':stop_mux}) # ## load general_hash into mysql mysql_load_from_hashes(db, cursor, 'config_general', general_hash) # # get all the basecall summary split hairpin data basecall_summary_fields = [ 'abasic_dur', 'abasic_index', 'abasic_peak', 'duration_comp', 'duration_temp', 'end_index_comp', 'end_index_temp', 'hairpin_abasics', 'hairpin_dur', 'hairpin_events', 'hairpin_peak', 'median_level_comp', 'median_level_temp', 'median_sd_comp', 'median_sd_temp', 'num_comp', 'num_events', 'num_temp', 'pt_level', 'range_comp', 'range_temp', 'split_index', 'start_index_comp', 'start_index_temp', ] if file_type == 1: basecall_summary_hash = make_hdf5_object_attr_hash( args, hdf[basecalldir + 'Summary/split_hairpin'], basecall_summary_fields) if file_type == 2: basecall_summary_hash = make_hdf5_object_attr_hash( args, hdf['/Analyses/Hairpin_Split_00' + str(basecallindexpos) + '/Summary/split_hairpin'], basecall_summary_fields) #print '/Analyses/Hairpin_Split_00'+str(basecallindexpos)+'/Summary/split_hairpin' #print basecall_summary_hash # # adding info about other the basecalling itself if basecalldir + 'Summary/basecall_1d_complement' in hdf: hdf5object = hdf[basecalldir + 'Summary/basecall_1d_complement'] # print "Got event location" for x in ( 'drift', 
'mean_qscore', 'num_skips', 'num_stays', 'scale', 'scale_sd', 'sequence_length', 'shift', 'strand_score', 'var', 'var_sd', ): if x in hdf5object.attrs.keys(): value = str(hdf5object.attrs[x]) # print x, value basecall_summary_hash.update({x + 'C': value}) # # adding info about other the basecalling itself if basecalldir + 'Summary/basecall_1d_template' in hdf: hdf5object = hdf[basecalldir + 'Summary/basecall_1d_template'] # print "Got event location" for x in ( 'drift', 'mean_qscore', 'num_skips', 'num_stays', 'scale', 'scale_sd', 'sequence_length', 'shift', 'strand_score', 'var', 'var_sd', ): if x in hdf5object.attrs.keys(): value = str(hdf5object.attrs[x]) # print x, value basecall_summary_hash.update({x + 'T': value}) if basecalldir + 'Summary/basecall_2d' in hdf: hdf5object = hdf[basecalldir + 'Summary/basecall_2d'] # print "Got event location" for x in ('mean_qscore', 'sequence_length'): if x in hdf5object.attrs.keys(): value = str(hdf5object.attrs[x]) # print x, value basecall_summary_hash.update({x + '2': value}) # # Adding key indexes and time stamps basecall_summary_hash.update({'basename_id': basenameid}) basecall_summary_hash.update({'pass': passcheck}) basecall_summary_hash.update( {'exp_start_time': tracking_id_hash['exp_start_time']}) basecall_summary_hash.update({'1minwin': general_hash['1minwin']}) basecall_summary_hash.update({'5minwin': general_hash['5minwin']}) basecall_summary_hash.update({'10minwin': general_hash['10minwin']}) basecall_summary_hash.update({'15minwin': general_hash['15minwin']}) # print basecall_summary_hash # # load basecall summary hash into mysql mysql_load_from_hashes(db, cursor, 'basecall_summary', basecall_summary_hash) # # see if there is any barcoding info to addd barcode_hash = dict() for x in range(0, 9): string = '/Analyses/Barcoding_00%s/Summary/barcoding' % x # print string if string in hdf: # print "barcode", string barcode_hash = make_hdf5_object_attr_hash(args, hdf[string], ( 'pos0_start', 'score', 'design', 
'pos1_end', 'pos0_end', 'pos1_start', 'variant', 'barcode_arrangement', )) barcode_hash.update({'basename_id': basenameid}) mysql_load_from_hashes(db, cursor, 'barcode_assignment', barcode_hash) # print barcode_hash # for bk in barcode_hash.keys(): # print bk, barcode_hash[bk], type(barcode_hash[bk]) break # ------------ Do model details ------------------- if args.telem is True: if dbname not in dbcheckhash['modelcheck']: dbcheckhash['modelcheck'][dbname] = dict() log_string = basecalldir + 'Log' if log_string in hdf: log_data = str(hdf[log_string][()]) # print type(log), log lines = log_data.split('\n') template_model = None complement_model = None for l in lines: t = re.match('.*Selected model: "(.*template.*)".', l) if t: template_model = t.group(1) c = re.match('.*Selected model: "(.*complement.*)".', l) if c: complement_model = c.group(1) if template_model is not None: sql = \ "INSERT INTO %s (basename_id,template_model,complement_model) VALUES ('%s','%s',NULL)" \ % ('model_list', basenameid, template_model) if template_model not in dbcheckhash['modelcheck'][dbname]: location = basecalldir + 'BaseCalled_template/Model' if location in hdf: upload_model_data('model_data', template_model, location, hdf, cursor, db) dbcheckhash['modelcheck'][dbname][template_model] = 1 if complement_model is not None: sql = \ "INSERT INTO %s (basename_id,template_model,complement_model) VALUES ('%s','%s','%s')" \ % ('model_list', basenameid, template_model, complement_model) if complement_model not in dbcheckhash['modelcheck'][ dbname]: location = basecalldir \ + 'BaseCalled_complement/Model' if location in hdf: upload_model_data('model_data', complement_model, location, hdf, cursor, db) dbcheckhash['modelcheck'][dbname][ complement_model] = 1 cursor.execute(sql) db.commit() # --------------------------------------------------------------------------- if file_type == 1: readtypes = {'basecalled_template': basecalldir \ + 'BaseCalled_template/', 'basecalled_complement': 
basecalldir \ + 'BaseCalled_complement/', 'basecalled_2d': basecalldir + 'BaseCalled_2D/'} if file_type == 2: readtypes = { 'basecalled_template': '/Analyses/Basecall_1D_00' + str(basecallindexpos) + "/" + 'BaseCalled_template/', 'basecalled_complement': '/Analyses/Basecall_1D_00' + str(basecallindexpos) + "/" + 'BaseCalled_complement/', 'basecalled_2d': '/Analyses/Basecall_2D_00' + str(basecallindexpos) + "/" + 'BaseCalled_2D/' } #ML fastqhash = dict() # tel_sql_list=list() tel_data_hash = dict() template_start = 0 for (readtype, location) in readtypes.iteritems(): if location in hdf: fastq = hdf[location + 'Fastq'][()] try: rec = SeqIO.read(StringIO(fastq), 'fastq') except Exception, err: err_string = \ '%s:\tError reading fastq oject from base: %s type: %s error: %s' \ % (time.strftime('%Y-%m-%d %H:%M:%S'), basename, readtype, err) print >> sys.stderr, err_string with open(dbcheckhash['logfile'][dbname], 'a') as \ logfilehandle: logfilehandle.write(err_string + os.linesep) logfilehandle.close() continue sequence = str(rec.seq) seqlen = len(sequence) rec.id = basename + '.' 
+ readtype qual = chr_convert_array(db, rec.letter_annotations['phred_quality']) fastqhash[rec.id] = \ {'quals': rec.letter_annotations['phred_quality'], 'seq': sequence} if location + 'Alignment' in hdf: # so its 2D # print "we're looking at a 2D read",template_start,"\n\n" mysql_load_from_hashes( db, cursor, readtype, { 'basename_id': basenameid, 'seqid': rec.id, 'sequence': sequence, 'qual': qual, 'start_time': template_start, 'seqlen': seqlen, 'exp_start_time': tracking_id_hash['exp_start_time'], '1minwin': int(template_start / 60), '5minwin': int(template_start / (5 * 60)), '10minwin': int(template_start / (10 * 60)), '15minwin': int(template_start / (15 * 60)), 'pass': passcheck, }) if args.telem is True: alignment = hdf[location + 'Alignment'][()] # print "ALIGNMENT", type(alignment) channel = general_hash['channel'][-1] tel_data_hash[readtype] = [basenameid, channel, alignment] # upload_2dalignment_data(basenameid,channel,alignment,db) # tel_sql_list.append(t_sql) complement_and_template_fields = [ 'basename', 'seqid', 'duration', 'start_time', 'scale', 'shift', 'gross_shift', 'drift', 'scale_sd', 'var_sd', 'var', 'sequence', 'qual', ] if location + 'Events' in hdf and location + 'Model' in hdf: # so its either template or complement events_hash = make_hdf5_object_attr_hash( args, hdf[location + 'Events'], complement_and_template_fields) model_hash = make_hdf5_object_attr_hash( args, hdf[location + 'Model'], complement_and_template_fields) # #Logging the start time of a template read to pass to the 2d read in order to speed up mysql processing if readtype == 'basecalled_template': template_start = events_hash['start_time'] events_hash.update(model_hash) events_hash.update({ 'basename_id': basenameid, 'seqid': rec.id, 'sequence': sequence, 'qual': qual, 'seqlen': seqlen, '1minwin': int(events_hash['start_time'] / 60), '5minwin': int(events_hash['start_time'] / (5 * 60)), '10minwin': int(events_hash['start_time'] / (10 * 60)), '15minwin': 
int(events_hash['start_time'] / (15 * 60)), }) events_hash.update({ 'exp_start_time': tracking_id_hash['exp_start_time'], 'pass': passcheck }) mysql_load_from_hashes(db, cursor, readtype, events_hash) # -------- This inserts telemetry data. It is optional under the flags above. # -------- Modified to calculate some means and averages # ------- so we are going to do this everytime # if (args.telem is True): # print "start telem", (time.time())-starttime # ## Do Events events = hdf[location + 'Events'][()] tablechannel = readtype + '_' + general_hash['channel'][-1] tel_data_hash[readtype] = [basenameid, tablechannel, events]
print(line) try: handle = Entrez.efetch(db="nucleotide", id=str(line), rettype="gb", retmode="text") except urllib.error.HTTPError as exception: print('error with entrez connection, trying again') time.sleep(2) handle = Entrez.efetch(db="nucleotide", id=str(line), rettype="gb", retmode="text") x = SeqIO.read( handle, 'genbank' ) # get information regarding your accesion number in here will be taxonomy tax = x.annotations['taxonomy'] # only get taxonomy taxf = ";".join(tax) #join taxonomy based on ';' character full_lineage = ( taxf + ';' + x.annotations['organism'] ) # but i also want the organism name so this will also add organism specific name line = line.strip() lineage_info[line] = full_lineage print("You have " + str(len(lineage_info)) + ' accesion numbers') """now open the fasta file containing the headers you want to change. This is going to be done on my viral db which was downloaded from ftp://ftp.ncbi.nlm.nih.gov/refseq/release/viral/ . This downloaded dataset contains fasta files and header is AccesionNumber and Organism name. I want to change it to AccesionNumber and Taxonomic lineage. You can remove the accesion number and add to a new file using the following commands: sed 's/\s.*$//' viral_all.fna | grep ">" | sed 's/>//' > Viral_accesion_numbers.txt
from Bio import SeqIO fw=open(sys.argv[2],"w") fw.write('Gene_ID\t'+'A(bases)\t'+'C(bases)\t'+'G(bases)\t'+'T(bases)'+'\t'+'N(bases)'+'\t'+'N_percentage(%)''\n') biggestN={} recordall={} with open(sys.argv[1]) as IN: for seq_record in SeqIO.parse(IN, "fasta"): #print('>'+seq_record.id+' '+str(len(seq_record))) #print(str(seq_record.seq)) #SeqIO.write(seq_record, fw, "fasta") Tempcount=[] #print(seq_record.seq.upper().count('A')) for nt in ('A','T','C','G','N'): Tempcount.append(seq_record.seq.upper().count(nt)) totallength=len(seq_record.seq) N_percentage=round((Tempcount[-1]/int(totallength))*100.0,2) Tempcount.append(N_percentage) line='\t'.join(map(str, Tempcount)) result_count=seq_record.id+'\t'+line biggestN[seq_record.id]=Tempcount[-2] recordall[seq_record.id]=result_count for key,value in sorted(biggestN.items(),key=lambda x:x[1],reverse=True): if key in recordall.keys(): print >>fw,"{}".format(recordall[key]) fw.close() ''' record = SeqIO.read(sys.argv[1], "fasta") print('>'+record.id) print(record.seq)
aln_fname = '../data/' + flutype + '_HA1_all_years_filtered.fasta.gz' if flutype.startswith('H3N2'): cds = {'begin': 0, 'end': 987, 'pad': 0} else: cds = {'begin': 0, 'end': 300, 'pad': 0} if os.path.isfile('../data/' + flutype + '_L_L_predictions.pickle'): with open('../data/' + flutype + '_L_L_predictions.pickle') as infile: laessig_prediction = pickle.load(infile) # open annotations file with open('../data/' + flutype + '_annotations.pickle', 'r') as infile: annotation = pickle.load(infile) outgroup = SeqIO.read('../data/' + flutype + '_outgroup.fasta', 'fasta') bin_dt = 105 #time bins in days. 3*105 = 315 days approx 10 month years = range(1995, 2012) predictions = {} for year in years: if "oceania" in test_regions: prediction_set = { 'start': date(year - 2, 10, 1), 'stop': date(year - 1, 9, 30), 'regions': prediction_regions, 'sample_size': sample_size } test_set = { 'start': date(year, 3, 1), 'stop': date(year, 9, 30),
def __init__(self,min_length = 900, **kwargs):
    '''
    Set up the filter: store the length cut-off, the table of vaccine
    strains, the CDS annotation and the outgroup record.

    parameters
    min_length  -- minimal length for a sequence to be acceptable
    **kwargs    -- forwarded unchanged to flu_filter.__init__

    Reads 'source-data/H3N2_outgroup.gb' (path relative to the current
    working directory) with SeqIO.read.
    '''
    flu_filter.__init__(self, **kwargs)
    self.min_length = min_length
    # Vaccine strains, one dict per strain.  Every entry carries "strain",
    # "db", "date", "region", "country" and "seq"; the IRD entries identify
    # the record by "accession", the GISAID entries by "isolate_id".
    self.vaccine_strains =[
        {
            "strain": "A/Wisconsin/67/2005",
            "db": "IRD",
            "accession": "CY163984",
            "date": "2005-08-31",
            "region": "north_america",
            "country": "usa",
            "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGGAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACGATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATGGAACAAGCTCTTCTTGCAAAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAATGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTTCAAAATGTAAACAGGATCACATATGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCAATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTCGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAGAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCATGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAAGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA"
        },
        {
            "strain": "A/Brisbane/10/2007",
            "db": "IRD",
            "accession": "CY113005",
            "date": "2007-02-06",
            "region": "oceania",
            "country": "australia",
            "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCACTCAAAAACTTCCCGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAATAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAAATTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAAAAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAATGACCAAATCTTCCCGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACTGTAATCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCCAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAACGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAATAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACCAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACAATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGCGTTGAGCTGAAGTCAGGATACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA"
        },
        {
            "strain": "A/Perth/16/2009",
            "db": "IRD",
            "accession": "GQ293081",
            "date": "2009-04-07",
            "region": "oceania",
            "country": "australia",
            "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAGTTCCTCAACAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAAAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGCAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGCTCTGCTTGCATAAGGAGATCTAAAAACAGTTTCTTTAGTAGATTGAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAAGACCAAATCTTCCTGTATGCTCAAGCATCAGGAAGAATCACAGTCTCTACCAAAAGAAGCCAACAAACCGTAAGCCCGAATATCGGATCTAGACCCAGAGTAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAACACTCTGAAATTGGCAACAGGGATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATCGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATAGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTTCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAAGGCAACATTAGGTGCAACATTTGCATTTGA"
        },
        {
            "strain": "A/Victoria/361/2011",
            "db": "IRD",
            # NOTE(review): same accession as A/Perth/16/2009 above although
            # the sequences differ -- looks like a copy/paste slip; confirm
            # the correct accession for this strain.
            "accession": "GQ293081",
            "date": "2011-10-24",
            "region": "oceania",
            "country": "australia",
            "seq": "ATGAAGACTATCATTGCTTTGAGCCACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAACAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCAAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCGCTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGTTACGGACAAGGAACAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATATAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAATTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTCGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTAAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA"
        },
        {
            "strain": "A/Texas/50/2012",
            "db": "GISAID",
            "isolate_id": "EPI_ISL_129858",
            "date": "2012-04-15",
            "region": "north_america",
            "country": "usa",
            "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAACTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTCCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGAATGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAATAATAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTTCAAATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAACCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGAATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGGAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGAGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA",
        },
        {
            "strain": "A/Switzerland/9715293/2013",
            "db": "GISAID",
            "isolate_id": "EPI_ISL_162149",
            "date": "2013-12-06",
            "region": "europe",
            "country": "switzerland",
            "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAACTTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGGCTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGGGATCTAATAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTCCAAATACCCAGCATTAAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCACAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCGAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAGACAAACTAG﻿AGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGCTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGGAGAATTCAGGACCTTGAGAAATATGTTGAGGACACAAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATCAGAAATGGAACTTATGACCACGATGTATACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA",
        },
        {
            "strain": "A/HongKong/4801/2014",
            "db": "GISAID",
            "isolate_id": "EPI_ISL_165554",
            "date": "2014-02-26",
            "region": "china",
            "country": "hong_kong",
            "seq": "ATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACGAATGACCGAATTGAAGTTACTAATGCTACTGAGCTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACATACCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGCAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGGAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGGTGCAACATTTGCATTTGA",
        },
        {
            "strain": "A/Alaska/232/2015",
            "db": "GISAID",
            "isolate_id": "EPI787411",
            "date": "2015-09-09",
            "region": "north_america",
            "country": "usa",
            # Unlike the entries above, this sequence carries extra bases
            # before the initial ATG and after the final TGA.
            "seq": "GGATAATTCTATTAACCATGAAGACTATCATTGCTTTGAGCTACATTCTATGTCTGGTTTTCGCTCAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAAAACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACAGTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAAAATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTCCCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACGGAACAAGTTCTTCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACAAATATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTACGGACAAGGACCAAATCTACCCGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAATCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGAGACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAATAATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCATTCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATGCGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGTGGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAATCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCAGAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCTTCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGCAACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCAATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCTGAAGTCAGGGTACAAAGATTGGATCCTATGGATTTCCTTTGCCATATCATGTTTTTTGCTTTGTGTTGCTTTGTTGGGGTTCATCATGTGGGCCTGCCAAAAGGGCAACATTAGATGCAACATTTGCATTTGAGTGCATTAATTAAAAACAC"
        }
    ]
    # The outgroup GenBank record supplies both the feature annotation and
    # the outgroup sequence itself.
    tmp_outgroup = SeqIO.read('source-data/H3N2_outgroup.gb', 'genbank')
    genome_annotation = tmp_outgroup.features
    # Map gene name -> feature, restricted to CDS features whose 'gene'
    # qualifier is one of SigPep, HA1 or HA2.
    self.cds = {x.qualifiers['gene'][0]:x for x in genome_annotation
                if 'gene' in x.qualifiers and x.type=='CDS' and
                x.qualifiers['gene'][0] in ['SigPep', 'HA1', 'HA2']}
    # Outgroup entry; 'seq' is the upper-cased sequence of the record read
    # above, the remaining fields are hard-coded metadata.
    self.outgroup = {
        'strain': 'A/Beijing/32/1992',
        'db': 'IRD',
        'accession': 'U26830',
        'date': '1992-01-01',
        'country': 'china',
        'region': 'china',
        'seq': str(tmp_outgroup.seq).upper()
    }
def get_raw_check(self, filename, format, alphabet, comp):
    """Check that ``SeqIO.index(...).get_raw()`` agrees with the file.

    The record ids found by ``SeqIO.parse`` (lower-cased) must match the
    keys of an index built with a lower-casing ``key_function``.  For every
    key the raw bytes must be non-blank, must occur verbatim in the
    (decompressed) file contents, and must re-parse into a record that
    ``compare_record`` accepts as equal to the one from ``__getitem__``.

    Args:
        filename: Path of the sequence file to index.
        format: SeqIO format name used for parsing and indexing.
        alphabet: Alphabet handed through to the SeqIO calls.
        comp: Truthy if ``filename`` is gzip-compressed.
    """
    def lower_key(name):
        return name.lower()

    # Also checking the key_function here
    if comp:
        with gzip.open(filename, "rb") as h:
            raw_file = h.read()
        # gzip_open() may hand back an object without context-manager
        # support, so close it explicitly.
        h = gzip_open(filename, format)
        try:
            id_list = [rec.id.lower()
                       for rec in SeqIO.parse(h, format, alphabet)]
        finally:
            h.close()
    else:
        with open(filename, "rb") as h:
            raw_file = h.read()
        id_list = [rec.id.lower()
                   for rec in SeqIO.parse(filename, format, alphabet)]

    if format == "sff":
        # Indexing SFF may emit BiopythonParserWarning; silence it here.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', BiopythonParserWarning)
            rec_dict = SeqIO.index(filename, format, alphabet,
                                   key_function=lower_key)
    else:
        rec_dict = SeqIO.index(filename, format, alphabet,
                               key_function=lower_key)

    # Close the index even when one of the assertions below fails.
    try:
        self.assertEqual(set(id_list), set(rec_dict))
        self.assertEqual(len(id_list), len(rec_dict))
        for key in id_list:
            self.assertIn(key, rec_dict)
            self.assertEqual(key, rec_dict[key].id.lower())
            self.assertEqual(key, rec_dict.get(key).id.lower())
            raw = rec_dict.get_raw(key)
            self.assertTrue(raw.strip())
            # assertTrue (not assertIn) so a failure does not dump the
            # whole file into the message.
            self.assertTrue(raw in raw_file)
            rec1 = rec_dict[key]
            # Following isn't very elegant, but it lets me test the
            # __getitem__ SFF code is working.
            if format in SeqIO._BinaryFormats:
                handle = BytesIO(raw)
            else:
                handle = StringIO(_bytes_to_string(raw))
            if format in ("sff", "sff-trim"):
                proxy = rec_dict._proxy
                rec2 = SeqIO.SffIO._sff_read_seq_record(
                    handle,
                    proxy._flows_per_read,
                    proxy._flow_chars,
                    proxy._key_sequence,
                    proxy._alphabet,
                    trim=(format == "sff-trim"))
            else:
                if format == "uniprot-xml":
                    self.assertTrue(raw.startswith(_as_bytes("<entry ")))
                    self.assertTrue(raw.endswith(_as_bytes("</entry>")))
                    # Currently the __getitem__ method uses this
                    # trick too, but we hope to fix that later
                    raw = """<?xml version='1.0' encoding='UTF-8'?> <uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd"> %s </uniprot> """ % _bytes_to_string(raw)
                    handle = StringIO(raw)
                rec2 = SeqIO.read(handle, format, alphabet)
            self.assertEqual(True, compare_record(rec1, rec2))
    finally:
        rec_dict.close()
# Only doing a_vs_b here, could also have b_vs_c and c_vs_d etc genomes = [ (os.path.join(input_folder, file_a), format_a), (os.path.join(input_folder, file_b), format_b), ] comparisons = [os.path.join(input_folder, file_a_vs_b)] # Create diagram with tracks, each with a feature set assert len(genomes) >= 2 and len(genomes) == len(comparisons) + 1 gd_diagram = Diagram(name, track_size=0.35, circular=False) tracks = dict() feature_sets = dict() records = dict() for f, format in genomes: records[f] = SeqIO.read(f, format) tracks[f] = gd_diagram.new_track(1, name=f, start=0, end=len(records[f]), scale_smalltick_interval=1000, scale_largetick_interval=10000, greytrack=True, greytrack_labels=0) feature_sets[f] = tracks[f].new_set() print("Drawing matches...") for i, crunch_file in enumerate(comparisons): q = genomes[i + 1][0] # query file s = genomes[i][0] # subject file q_set = feature_sets[q]
def find_homologs():
    """Predict homologs of PPRs in other genomes based on footprints.

    Loads the PPRs and plastid genomes, reads the annotated reference from
    ``output/ARA_annotated.gb`` and, for every PPR, stores on the PPR
    object:

    * ``genes`` -- the gene closest to each of its ``*_exact`` footprints;
    * ``potentialHomologs`` -- ``{plastid name: [(similarity, seq), ...]}``
      sorted by decreasing similarity.

    Progress and the final per-PPR summary are printed to stdout.  The
    function returns ``None``.
    """
    pprs = load_pprs()
    plastids = load_plastids(exclude=[
        "Arabidopsis thaliana",
    ])
    known_binding = SeqIO.read("output/ARA_annotated.gb", "gb")
    exact_features = [
        f for f in known_binding.features if "exact" in f.type.lower()
    ]
    ara_genes = [f for f in known_binding.features if f.type.lower() == "gene"]
    ara_genes.sort(key=lambda g: g.location.start)

    # Every print below has exactly one argument, so the parenthesised form
    # behaves the same as the old print statement on Python 2 and is also
    # valid on Python 3.
    print("Loaded {} pprs and {} plastids".format(len(pprs), len(plastids)))

    for k, ppr in enumerate(pprs):
        print("Searching for homologs of \'{}\' ({}/{})".format(
            ppr.name, k + 1, len(pprs)))
        wanted_type = "{}_exact".format(ppr.name.lower())
        footprints = [
            f for f in exact_features if f.type.lower() == wanted_type
        ]
        ppr.genes = [get_closest_gene(f, ara_genes) for f in footprints]
        print("\tFound {} original genes, {}".format(
            len(ppr.genes), [g.qualifiers['gene'] for g in ppr.genes]))

        ppr.potentialHomologs = {}
        for i, plastid in enumerate(plastids):
            # NOTE(review): hard-coded filter -- only this one plastid is
            # ever searched.  Looks like a debugging leftover; confirm.
            if plastid.name != "Alsophila spinulosa":
                continue
            print("\t\tSearch {}/{}".format(i + 1, len(plastids)))

            # search for homologs of each gene
            homologs = []
            for gene in ppr.genes:
                g = SeqRecord(gene.extract(known_binding.seq).translate())
                search = HMMER.jackhmmer(g, plastid)
                print("{} -> {} homologs".format(gene.qualifiers['gene'],
                                                 len(search.matches)))
                homologs += search.getFeatures(
                    type="{}_hl".format(gene.qualifiers['gene']))

            # extract the sequence surrounding each homolog
            # (500 positions either side, clamped to the plastid bounds)
            for h in homologs:
                h.location = FeatureLocation(
                    max(0, h.location.start - 500),
                    min(len(plastid), h.location.end + 500))
            homologs = [SeqRecord(h.extract(plastid.seq)) for h in homologs]

            # find exact or close to exact binding domains for each and add
            # to the list of potential homologs for the PPR
            ph = []
            for h in homologs:
                domains = []
                for exact in ppr.exact:
                    try:
                        domains += binding.get_domains(
                            exact, h, percentile=100.0, gaps=0)
                    except KeyError:
                        continue
                if domains:
                    # Highest-odds domain; on ties the first one wins, as
                    # with the previous stable sort on negated odds.
                    best = max(domains, key=lambda d: d.qualifiers['odds'])
                    seq = str(best.extract(h).seq)
                    similarity = max(
                        sequence_similarity(original, seq)
                        for original in ppr.footprints)
                    # NOTE(review): h is a SeqRecord at this point; confirm
                    # that it actually exposes a ``type`` attribute.
                    print(" {} -> \'{}\'".format(h.type, seq))
                    ph.append((similarity, seq))

            ph.sort(key=lambda p: -p[0])
            ppr.potentialHomologs[plastid.name] = ph

            # try and avoid running out of RAM
            gc.collect()

    for ppr in pprs:
        print("\'{}\' footprints = {}".format(ppr.name, ppr.footprints))
        print("potential homologs")
        for key, value in ppr.potentialHomologs.items():
            print("{}: {}".format(key, value))

    # NOTE(review): this early return disables the statistics section below.
    # With the plastid filter above in place, re-enabling it would raise
    # KeyError, because potentialHomologs only holds the one searched
    # plastid.  Kept as found; decide whether to remove or restore.
    return

    stats = []
    for plastid in plastids:
        length = 0
        similarity = 0.0
        for ppr in pprs:
            length += len(ppr.potentialHomologs[plastid.name])
            similarity += sum(
                [p[0] for p in ppr.potentialHomologs[plastid.name]])
        try:
            stats.append({
                'name': plastid.name,
                'avg_similarity': similarity / float(length),
                'avg_homologs': length / len(pprs),
            })
        except ZeroDivisionError:
            stats.append({
                'name': plastid.name,
                'avg_similarity': 0.0,
                'avg_homologs': 0,
            })

    stats.sort(key=lambda s: -s['avg_similarity'])
    with open("tmp", "w") as f:
        for s in stats[0:50]:
            f.write("{name}, {avg_similarity}, {avg_homologs}\n".format(**s))
def get_genomes(genome_id, genome_region, gene_ids, reverse_complement=True,
                entrez_mail='*****@*****.**', force=False):
    """Fetch a reference region and gene records from NCBI as Gene objects.

    Args:
        genome_id: Nucleotide id of the reference sequence to fetch.
        genome_region: ``(chromosome, start, end)``; only ``start`` and
            ``end`` are used here.
        gene_ids: Sequence of tuples.  Item ``[1]`` of each tuple is joined
            into the esearch term, the optional item ``[2]`` is a dict of
            extra keyword arguments for that record's efetch call, and item
            ``[0]`` of the first two tuples is used as a key into the result
            when computing pseudo-mutations.
        reverse_complement: If true, each fetched record is
            reverse-complemented and its exon coordinates are mirrored
            accordingly.
        entrez_mail: Value assigned to ``Entrez.email``.
        force: Passed through to ``get_pseudo_mutations``.

    Returns:
        ``(genomes, hg19)`` -- a dict mapping each feature's ``gene``
        qualifier to a ``Gene``, and the fetched reference sequence.
    """
    Entrez.email = entrez_mail
    _chromosome, start, end = genome_region

    # NCBI uses 1 based indexing and closed intervals [a,b]
    handle = Entrez.efetch(db='nucleotide', id=genome_id, rettype='fasta',
                           strand=1, seq_start=start + 1,
                           seq_stop=end + 1 + 1)
    try:
        record = SeqIO.read(handle, 'fasta')
    finally:
        handle.close()
    hg19 = record.seq

    genomes = {}
    search_handle = Entrez.esearch(db='nucleotide',
                                   term=' '.join(g[1] for g in gene_ids),
                                   retmode='xml')
    try:
        search = Entrez.read(search_handle)
    finally:
        search_handle.close()

    for gi, gid in enumerate(search['IdList']):
        # Optional third tuple item carries extra efetch arguments.
        params = gene_ids[gi][2] if len(gene_ids[gi]) > 2 else {}
        fetch_handle = Entrez.efetch(db='nucleotide', id=gid, rettype='gb',
                                     retmode='text', **params)
        try:
            genbank_text = fetch_handle.read()
        finally:
            fetch_handle.close()
        genome = SeqIO.read(StringIO(genbank_text), 'genbank')
        if reverse_complement:
            genome.seq = genome.seq.reverse_complement()

        alignment = blat(hg19, genome.seq)
        log.trace('NCBI: Gene {} BLAT results: hit {}, query {}',
                  genome.id, alignment.hit_range, alignment.query_range)
        # Map each aligned query position to its hit position shifted by
        # the region start.
        translation = {
            query_pos: hit_pos + start
            for f in alignment
            for query_pos, hit_pos in zip(range(*f.query_range),
                                          range(*f.hit_range))
        }

        cds = [c for c in genome.features if c.type == 'CDS']
        if not cds:
            # No CDS features: fall back to misc_RNA features.
            cds = [c for c in genome.features if c.type == 'misc_RNA']

        for cd in cds:
            protein = cd.qualifiers.get('translation', '')
            if reverse_complement:
                # Mirror coordinates onto the reverse-complemented sequence.
                seq_len = len(genome.seq)
                exons = [
                    SeqFeature.FeatureLocation(seq_len - e.end,
                                               seq_len - e.start, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e2.end, e1.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]
            else:
                exons = [
                    SeqFeature.FeatureLocation(e.start, e.end, 1)
                    for e in cd.location.parts
                ]
                introns = [
                    SeqFeature.FeatureLocation(e1.end, e2.start, 1)
                    for e1, e2 in zip(exons[:-1], exons[1:])
                ]
            gene_name = cd.qualifiers['gene'][0]
            genomes[gene_name] = Gene(
                name=gene_name,
                protein=protein,
                introns=introns,
                exons=exons,
                seq=genome.seq,
                translation=translation,
                pseudo_mutations={},
                pseudo_translation={},
                special_regions={})

    if len(gene_ids) > 1:
        # The first entry is compared against the second one.
        gene_key, pseudo_key = gene_ids[0][0], gene_ids[1][0]
        mutations, pseudo_translation = get_pseudo_mutations(
            genomes[gene_key], genomes[pseudo_key], force)
        genomes[gene_key].pseudo_mutations.update(mutations)
        genomes[gene_key].pseudo_translation.update(pseudo_translation)

    return genomes, hg19