def _collect_acc_headers(lines, wanted_accs):
    """Return the header fields of '>acc' lines whose accession is wanted.

    Each line starting with '>acc' is stripped and split on '|'; the third
    field is the accession. For accessions found in wanted_accs the fields
    after the leading '>acc' tag are kept (the columns the review sheets
    label SOURCE, ACCESSION, DESCRIPTION).

    Raises IndexError if a '>acc' header has fewer than three fields.
    """
    rows = []
    for line in lines:
        if line.startswith('>acc'):
            fields = line.strip().split('|')
            if fields[2] in wanted_accs:
                rows.append(fields[1:])
    return rows


def _write_review_csv(path, rows):
    """Write rows to path as CSV beneath a SOURCE/ACCESSION/DESCRIPTION row."""
    import csv
    # 'wb' is the mode the Python 2 csv module expects for its output file.
    with open(path, 'wb') as outf:
        writer = csv.writer(outf)
        writer.writerows([['SOURCE', 'ACCESSION', 'DESCRIPTION']] + rows)


def make_review_sheets(previous_urvdb_filename):
    """Write the 'missing' and 'new' CSV review sheets for this update.

    Accessions from previous_urvdb_filename (read with getaccsfa) are
    compared with those listed in RVDBv<currentvs>_accsOK.txt (read with
    getaccs). Two files are then written into wdir:

      RVDBv<currentvs>.missing.csv  headers from previous_urvdb_filename
                                    whose accession is no longer listed.
      RVDBv<currentvs>.new.csv      headers of newly listed accessions,
                                    taken from the RefSeq eukaryotic-viral
                                    FASTA and from the files getfns returns
                                    for gb_negkwdir and tpa_negkwdir.

    Relies on the module-level names wdir, currentvs, refseqdir,
    gb_negkwdir, tpa_negkwdir, getaccs, getaccsfa and getfns.
    Returns None.
    """
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement used by the rest of this file.
    oldaccs = set(getaccsfa(previous_urvdb_filename))
    newaccs = set(getaccs(wdir + '\\' + 'RVDBv' + currentvs + '_accsOK.txt'))
    missing_accs = oldaccs - newaccs
    added_accs = newaccs - oldaccs

    # Context managers guarantee the handles are released even if a
    # malformed header raises part-way through a scan.
    with open(previous_urvdb_filename) as inf:
        print("writing out headers for entries present in: " + previous_urvdb_filename + " but not in update v" + currentvs)
        missing_rows = _collect_acc_headers(inf, missing_accs)
    _write_review_csv(wdir + '\\' + 'RVDBv' + currentvs + '.missing.csv', missing_rows)

    with open(refseqdir + '\\' + 'viral.genomic.eukviral.fasta') as inf:
        added_rows = _collect_acc_headers(inf, added_accs)

    postags = ['OK', 'VRL']
    negtags = ['FLAG', 'AMB', 'headers']
    gbfns = getfns(gb_negkwdir, postags, negtags)
    tpafns = getfns(tpa_negkwdir, postags, negtags)
    # Duplicates are removed within each source list separately, GenBank
    # files first, then TPA files.
    readfns = list(set(gbfns)) + list(set(tpafns))

    print("writing out headers for entries present in update v" + currentvs + " that were not present in the previous version, " + previous_urvdb_filename)
    for fn in readfns:
        print(fn)
        with open(fn) as inf:
            added_rows.extend(_collect_acc_headers(inf, added_accs))
    _write_review_csv(wdir + '\\' + 'RVDBv' + currentvs + '.new.csv', added_rows)
def write_update_accs_outfile(postags, negtags, accstype):
    """Write the accession list RVDBv<currentvs>_accs<accstype>.txt into wdir.

    Contents, in order, one accession per line:
      1. only when accstype == 'OK': the accessions getaccs reads from
         viral.genomic.eukviral.accs.txt in refseqdir;
      2. for each file getfns(gb_negkwdir, postags, negtags) returns: the
         accessions from getaccsfa, skipping those whose text before the
         first '.' is in dupaccs;
      3. for each file getfns(tpa_negkwdir, postags, negtags) returns: all
         accessions from getaccsfa.

    Args:
        postags: passed through to getfns.
        negtags: passed through to getfns.
        accstype: suffix used in the output file name; the value 'OK'
            additionally enables the RefSeq section.

    Relies on the module-level names wdir, currentvs, refseqdir,
    gb_negkwdir, tpa_negkwdir, dupaccs, getaccs, getaccsfa and getfns.
    Returns None.
    """
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement used by the rest of this file.
    print("writing out + accessions, those that are RefSeq eukaryotic or pass SEM-R_PIPE screen")
    out_path = wdir + '\\' + 'RVDBv' + currentvs + '_accs' + accstype + '.txt'
    # The with-block closes the output file even if a helper raises.
    with open(out_path, 'w') as outf:
        if accstype == 'OK':
            refseq_accs = getaccs(refseqdir + '\\' + 'viral.genomic.eukviral.accs.txt')
            outf.write('\n'.join(refseq_accs) + '\n')
            print("finished collecting RefSeq Viral accessions")

        # NOTE(review): an input that yields no accessions still contributes
        # a bare newline to the output, as before.
        for gbfn in getfns(gb_negkwdir, postags, negtags):
            print(gbfn)
            gbaccs = [gbacc for gbacc in getaccsfa(gbfn)
                      if gbacc.split('.')[0] not in dupaccs]
            outf.write('\n'.join(gbaccs) + '\n')
        print("finished collecting GenBank+ accessions")

        for tpafn in getfns(tpa_negkwdir, postags, negtags):
            outf.write('\n'.join(getaccsfa(tpafn)) + '\n')
        print("finished collecting TPA+ accessions")
# Flat script fragment (collapsed onto one physical line in this copy) that
# sets up writing of U-RVDBv<currentvs>.fasta.
#   - Derives the GenBank / RefSeq / TPA raw-data directories and the
#     'negkw_out_' subdirectories of the GenBank and TPA ones from wdir and
#     datetag.
#   - Builds allfns: the RefSeq viral.genomic.eukviral.fasta first, then the
#     files getfns() returns for gb_negkwdir and tpa_negkwdir (postags and
#     negtags are passed straight through to getfns).
#   - Loads removeaccs and dupaccs with getaccs(), opens the output FASTA for
#     writing, and walks every line of every input file. For '>acc' headers
#     the accession is the third '|'-separated field, cut at its first '.';
#     `match` is set False when that accession is in removeaccs, in dupaccs,
#     or already in `written`, and True otherwise.
# NOTE(review): wdir, datetag, currentvs, removeaccsfn, getfns and getaccs are
# defined outside this fragment.
# NOTE(review): the loop presumably continues past the visible text -- `c`,
# `match` and `outf` are never consumed, and neither file is closed here.
# NOTE(review): whether written.add(acc) belongs to the else branch cannot be
# determined from the collapsed text; confirm against the original layout.
gbdir = wdir + '\\' + 'GenBank_raw_data_' + datetag refseqdir = wdir + '\\' + 'RefSeq_raw_data_' + datetag tpadir = wdir + '\\' + 'TPA_raw_data_' + datetag gb_negkwdir = gbdir + '\\' + 'negkw_out_' + datetag tpa_negkwdir = tpadir + '\\' + 'negkw_out_' + datetag dupaccsfn = refseqdir + '\\' + 'refseq_viral_originalaccs.txt' postags = ['OK', 'VRL'] negtags = ['FLAG', 'AMB', 'headers'] gbfns = getfns(gb_negkwdir, postags, negtags) tpafns = getfns(tpa_negkwdir, postags, negtags) allfns = [] allfns.append(refseqdir + '\\' + 'viral.genomic.eukviral.fasta') allfns.extend(gbfns) allfns.extend(tpafns) removeaccs = getaccs(wdir + '\\' + removeaccsfn) dupaccs = getaccs(dupaccsfn) outf = open(wdir + '\\' + 'U-RVDBv' + currentvs + '.fasta', 'w') c = 0 match = False written = set([]) for fn in allfns: inf = open(fn) for line in inf: if line.startswith('>acc'): acc = line.split('|')[2].split('.')[0] if acc in removeaccs or acc in dupaccs or acc in written: match = False else: match = True written.add(acc)
# Script header fragment (collapsed onto one physical line in this copy).
# Command-line arguments read here:
#   sys.argv[1] -> homedir
#   sys.argv[2] -> datetag
#   sys.argv[3] -> currentvs
#   sys.argv[4] -> previous_urvdb_filename
# From these it derives wdir (<homedir>\RVDBv<currentvs>), the RefSeq /
# GenBank / TPA raw-data directories, the GenBank and TPA 'negkw_out_'
# subdirectories, and loads dupaccs from refseq_viral_originalaccs.txt in
# refseqdir via getaccs().
# NOTE(review): `sys` and `getaccs` are used but not imported in the visible
# text -- presumably imported earlier in the file; confirm.
# NOTE(review): in this collapsed form everything after the first '##' is
# lexically a comment, so the original line breaks must be restored before
# the statements that follow it can execute.
# The write_update_accs_outfile definition at the end of this line is cut off
# mid-loop and is left untouched.
from sequence_record_functions_PIPE import get_filenames as getfns homedir=sys.argv[1] datetag=sys.argv[2] currentvs=sys.argv[3] previous_urvdb_filename=sys.argv[4] ##homedir='E:' ##datetag='apr.2018' ##currentvs='13.0' wdir=homedir+'\\'+'RVDBv'+currentvs refseqdir=wdir+'\\'+'RefSeq_raw_data_'+datetag gbdir=wdir+'\\'+'GenBank_raw_data_'+datetag tpadir=wdir+'\\'+'TPA_raw_data_'+datetag gb_negkwdir=gbdir+'\\'+'negkw_out_'+datetag tpa_negkwdir=tpadir+'\\'+'negkw_out_'+datetag dupaccsfn=refseqdir+'\\'+'refseq_viral_originalaccs.txt' dupaccs=getaccs(dupaccsfn) def write_update_accs_outfile(postags,negtags,accstype): print "writing out + accessions, those that are RefSeq eukaryotic or pass SEM-R_PIPE screen" outf=open(wdir+'\\'+'RVDBv'+currentvs+'_accs'+accstype+'.txt','w') if accstype=='OK': refseq_accs=getaccs(refseqdir+'\\'+'viral.genomic.eukviral.accs.txt') outf.write('\n'.join(refseq_accs)+'\n') print "finished collecting RefSeq Viral accessions" gbfns=getfns(gb_negkwdir,postags,negtags) for gbfn in gbfns: print gbfn gbaccsfa=getaccsfa(gbfn) gbaccs=[] for gbacc in gbaccsfa:
# Fragment (collapsed onto one physical line in this copy).
#
# It opens with the tail of a function whose `def` line is outside this view:
# every line of `inf` that starts with '>gi' is stored in `headers` under its
# second '|'-separated field, then `inf` is closed and `headers` is returned.
# The enumerate index `i` is not used in the visible part.
#
# Execute command block:
#   sys.argv[1] -> homedir
#   sys.argv[2] -> datetag
#   sys.argv[3] -> currentvs
#   sys.argv[4] -> rvdb_filename (echoed to stdout)
#   sys.argv[5] -> filterset_filename
# wdir is <homedir>\RVDBv<currentvs>. The filter set is loaded with getaccs();
# if that raises IOError the script continues with an empty set and
# tofilter = False instead of aborting. The result of
# characterize_by_biological_category(...) is then handed to
# write_characterization_output(...).
# NOTE(review): in this collapsed form the '#' banner turns the rest of the
# line into a comment; the original line breaks must be restored for the
# execute block to run.
for i, line in enumerate(inf): if line.startswith('>gi'): gi = line.strip().split('|')[1] headers[gi] = line inf.close() return headers ################################################################################################## ########## Execute command block ########### ################################################################################################## import sys homedir = sys.argv[1] datetag = sys.argv[2] currentvs = sys.argv[3] wdir = homedir + '\\' + 'RVDBv' + currentvs rvdb_filename = sys.argv[4] print rvdb_filename filterset_filename = sys.argv[5] from sequence_record_functions_PIPE import get_accs_flatfile as getaccs try: filterset = getaccs(filterset_filename) tofilter = True except IOError: filterset = set([]) tofilter = False cdict = characterize_by_biological_category(wdir, rvdb_filename, tofilter, filterset) write_characterization_output(wdir, rvdb_filename, cdict)
# Mid-script fragment (collapsed onto one physical line in this copy); it
# starts inside a loop whose header is not visible and ends on a `for` whose
# body is not visible.
#   - nmapfilename is set to a file inside refseqdir and the enclosing loop is
#     left with `break` (the selection condition is outside this view).
#   - refseq_gbfilenames collects every refseqdir entry whose name starts with
#     'viral.genomic_'.
#   - neighbors_dict comes from get_refseq_viral_neighbors_dict(nmapfilename),
#     `neighbors` from colldict(neighbors_dict), and comments_dict is threaded
#     through get_gb_comments() once per file in refseq_gbfilenames.
#   - dups and dups2 come from make_duplicates_dict() and
#     extend_duplicates_dict(); dups3 starts as an empty dict.
#   - rvdb_rs_accs is read from viral.genomic.eukviral.accs.txt and each entry
#     is cut at its first '.'.
#   - Five count lines are written to logf: RefSeq accessions, neighbor
#     accessions, the sizes of dups and dups2, and the number of keys the two
#     share.
# NOTE(review): os, getaccs, logf and the helper functions called here are
# defined outside this fragment.
nmapfilename = refseqdir + '\\' + fn break refseq_gbfilenames = [] for fn in os.listdir(refseqdir): if fn.startswith('viral.genomic_'): refseq_gbfilenames.append(refseqdir + '\\' + fn) neighbors_dict = get_refseq_viral_neighbors_dict(nmapfilename) neighbors = colldict(neighbors_dict) comments_dict = dict() for refseq_gbfilename in refseq_gbfilenames: comments_dict = get_gb_comments(comments_dict, refseq_gbfilename) dups = make_duplicates_dict(neighbors_dict, comments_dict) dups2 = extend_duplicates_dict(comments_dict) dups3 = dict() import sys rvdb_rs_accs = getaccs(refseqdir + '\\' + 'viral.genomic.eukviral.accs.txt') rvdb_rs_accs = [rs_acc.split('.')[0] for rs_acc in rvdb_rs_accs] logf.write(str(len(rvdb_rs_accs)) + ' refseq viral accessions' + '\n') logf.write(str(len(neighbors)) + ' neighbor accessions' + '\n') logf.write( str(len(dups)) + ' refseq viral accessions mapped to original entries using neighbors annotation' + '\n') logf.write( str(len(dups2)) + ' refseq viral accessions mapped to original entries using GenBank metadata (comments section)' + '\n') logf.write( str(len(set(dups.keys()).intersection(set(dups2.keys())))) + ' refseq viral accessions in common between the two above' + '\n') for rs_acc in rvdb_rs_accs: