Пример #1
0
def _collect_header_fields(fasta_filename, wanted_accs):
    """Return header fields of the '>acc' records whose accession is wanted.

    Every line starting with '>acc' is stripped and split on '|'. The field
    at index 2 is taken as the accession; when it is in ``wanted_accs`` the
    fields from index 1 onward are kept as one row.

    Args:
        fasta_filename: path of the file to scan.
        wanted_accs: collection of accessions to keep (a set is expected,
            since membership is tested once per header line).

    Returns:
        A list of rows, each a list of header fields.
    """
    rows = []
    with open(fasta_filename) as inf:
        for line in inf:
            if line.startswith('>acc'):
                fields = line.strip().split('|')
                if fields[2] in wanted_accs:
                    rows.append(fields[1:])
    return rows


def _write_review_sheet(csv_filename, rows):
    """Write ``rows`` to ``csv_filename`` beneath the fixed column header row."""
    import csv
    # Binary mode is what the Python 2 csv module expects for its output file.
    with open(csv_filename, 'wb') as outf:
        writer = csv.writer(outf)
        writer.writerow(['SOURCE', 'ACCESSION', 'DESCRIPTION'])
        writer.writerows(rows)


def make_review_sheets(previous_urvdb_filename):
    """Write CSV sheets listing entries dropped from / added to this update.

    Accessions of the previous release (read with ``getaccsfa``) are compared
    with the accessions in 'RVDBv<currentvs>_accsOK.txt' (read with
    ``getaccs``). Two files are written into ``wdir``:

    * 'RVDBv<currentvs>.missing.csv' -- headers, taken from the previous
      release file, of accessions that are absent from the new list.
    * 'RVDBv<currentvs>.new.csv' -- headers, taken from the RefSeq FASTA and
      from the files returned by ``getfns`` for the GenBank and TPA negkw
      directories, of accessions that are absent from the previous release.

    Relies on the module-level names ``wdir``, ``currentvs``, ``refseqdir``,
    ``gb_negkwdir`` and ``tpa_negkwdir``.

    Args:
        previous_urvdb_filename: path of the previous release file.
    """
    oldaccs = set(getaccsfa(previous_urvdb_filename))
    newaccs = set(getaccs(wdir+'\\'+'RVDBv'+currentvs+'_accsOK.txt'))
    missing_accs = oldaccs - newaccs
    added_accs = newaccs - oldaccs

    print("writing out headers for entries present in: "+previous_urvdb_filename+" but not in update v"+currentvs)
    missing_rows = _collect_header_fields(previous_urvdb_filename, missing_accs)
    _write_review_sheet(wdir+'\\'+'RVDBv'+currentvs+'.missing.csv', missing_rows)

    # RefSeq records come first in the "new" sheet, followed by GenBank and TPA.
    added_rows = _collect_header_fields(
        refseqdir+'\\'+'viral.genomic.eukviral.fasta', added_accs)
    postags = ['OK', 'VRL']
    negtags = ['FLAG', 'AMB', 'headers']
    gbfns = getfns(gb_negkwdir, postags, negtags)
    tpafns = getfns(tpa_negkwdir, postags, negtags)
    # Each list is de-duplicated on its own before being scanned.
    readfns = list(set(gbfns)) + list(set(tpafns))
    print("writing out headers for entries present in update v"+currentvs+" that were not present in the previous version, "+previous_urvdb_filename)
    for fn in readfns:
        print(fn)
        added_rows.extend(_collect_header_fields(fn, added_accs))
    _write_review_sheet(wdir+'\\'+'RVDBv'+currentvs+'.new.csv', added_rows)
Пример #2
0
def write_update_accs_outfile(postags,negtags,accstype):
    """Write the accession list file 'RVDBv<currentvs>_accs<accstype>.txt'.

    The file is created in ``wdir`` and filled, one accession per line, with:

    1. the RefSeq accessions from 'viral.genomic.eukviral.accs.txt'
       (only when ``accstype`` is 'OK');
    2. the accessions of every GenBank file returned by ``getfns`` for
       ``gb_negkwdir``, skipping those whose part before the first '.' is
       in ``dupaccs``;
    3. the accessions of every TPA file returned by ``getfns`` for
       ``tpa_negkwdir``.

    Relies on the module-level names ``wdir``, ``currentvs``, ``refseqdir``,
    ``gb_negkwdir``, ``tpa_negkwdir`` and ``dupaccs``.

    Args:
        postags: passed through to ``getfns`` as its second argument.
        negtags: passed through to ``getfns`` as its third argument.
        accstype: suffix used in the output file name; 'OK' additionally
            enables the RefSeq section.
    """
    print("writing out + accessions, those that are RefSeq eukaryotic or pass SEM-R_PIPE screen")
    # The context manager closes the output file even if a read below fails.
    with open(wdir+'\\'+'RVDBv'+currentvs+'_accs'+accstype+'.txt','w') as outf:
        if accstype=='OK':
            refseq_accs=getaccs(refseqdir+'\\'+'viral.genomic.eukviral.accs.txt')
            outf.write('\n'.join(refseq_accs)+'\n')
        print("finished collecting RefSeq Viral accessions")

        for gbfn in getfns(gb_negkwdir,postags,negtags):
            print(gbfn)
            gbaccs=[gbacc for gbacc in getaccsfa(gbfn)
                    if gbacc.split('.')[0] not in dupaccs]
            # A newline is written per file even when no accession survives.
            outf.write('\n'.join(gbaccs)+'\n')
        print("finished collecting GenBank+ accessions")

        for tpafn in getfns(tpa_negkwdir,postags,negtags):
            outf.write('\n'.join(getaccsfa(tpafn))+'\n')
        print("finished collecting TPA+ accessions")
Пример #3
0
# Input locations below the working directory; every directory name carries
# the date tag of the current download.
gbdir = wdir + '\\' + 'GenBank_raw_data_' + datetag
refseqdir = wdir + '\\' + 'RefSeq_raw_data_' + datetag
tpadir = wdir + '\\' + 'TPA_raw_data_' + datetag
gb_negkwdir = gbdir + '\\' + 'negkw_out_' + datetag
tpa_negkwdir = tpadir + '\\' + 'negkw_out_' + datetag
dupaccsfn = refseqdir + '\\' + 'refseq_viral_originalaccs.txt'

# Tag lists handed to getfns (presumably tags a filename must / must not
# contain -- confirm against get_filenames).
postags = ['OK', 'VRL']
negtags = ['FLAG', 'AMB', 'headers']
gbfns = getfns(gb_negkwdir, postags, negtags)
tpafns = getfns(tpa_negkwdir, postags, negtags)
# Files are scanned in this order: the RefSeq FASTA first, then GenBank,
# then TPA. Because of the `written` check below, the first occurrence of an
# accession across these files is the one that is accepted.
allfns = []
allfns.append(refseqdir + '\\' + 'viral.genomic.eukviral.fasta')
allfns.extend(gbfns)
allfns.extend(tpafns)
removeaccs = getaccs(wdir + '\\' + removeaccsfn)
dupaccs = getaccs(dupaccsfn)
outf = open(wdir + '\\' + 'U-RVDBv' + currentvs + '.fasta', 'w')
c = 0
# `match` records whether the most recently seen '>acc' header was accepted.
match = False
# Accessions accepted so far, used to reject later repeats.
written = set([])
for fn in allfns:
    inf = open(fn)
    for line in inf:
        if line.startswith('>acc'):
            # Third '|'-delimited header field, cut at the first '.'
            # (presumably dropping a version suffix -- confirm).
            acc = line.split('|')[2].split('.')[0]
            # Reject accessions listed for removal, listed as duplicates,
            # or already accepted from an earlier record.
            if acc in removeaccs or acc in dupaccs or acc in written:
                match = False
            else:
                match = True
                written.add(acc)
Пример #4
0
from sequence_record_functions_PIPE import get_filenames as getfns
# Positional command-line arguments: home directory, date tag, current
# version string, and the previous release file name.
homedir=sys.argv[1]
datetag=sys.argv[2]
currentvs=sys.argv[3]
previous_urvdb_filename=sys.argv[4]
# Hard-coded values, left commented out, that can stand in for the arguments.
##homedir='E:'
##datetag='apr.2018'
##currentvs='13.0'
# Working directory for this version, and the dated input directories
# beneath it.
wdir=homedir+'\\'+'RVDBv'+currentvs
refseqdir=wdir+'\\'+'RefSeq_raw_data_'+datetag
gbdir=wdir+'\\'+'GenBank_raw_data_'+datetag
tpadir=wdir+'\\'+'TPA_raw_data_'+datetag
gb_negkwdir=gbdir+'\\'+'negkw_out_'+datetag
tpa_negkwdir=tpadir+'\\'+'negkw_out_'+datetag
# Accessions loaded here are the ones write_update_accs_outfile skips when
# collecting GenBank accessions.
dupaccsfn=refseqdir+'\\'+'refseq_viral_originalaccs.txt'
dupaccs=getaccs(dupaccsfn)


def write_update_accs_outfile(postags,negtags,accstype):
    print "writing out + accessions, those that are RefSeq eukaryotic or pass SEM-R_PIPE screen"
    outf=open(wdir+'\\'+'RVDBv'+currentvs+'_accs'+accstype+'.txt','w')
    if accstype=='OK':
        refseq_accs=getaccs(refseqdir+'\\'+'viral.genomic.eukviral.accs.txt')
        outf.write('\n'.join(refseq_accs)+'\n')
    print "finished collecting RefSeq Viral accessions"
    gbfns=getfns(gb_negkwdir,postags,negtags)
    for gbfn in gbfns:
        print gbfn
        gbaccsfa=getaccsfa(gbfn)
        gbaccs=[]
        for gbacc in gbaccsfa:
Пример #5
0
    for i, line in enumerate(inf):
        if line.startswith('>gi'):
            gi = line.strip().split('|')[1]
            headers[gi] = line
    inf.close()
    return headers


##################################################################################################
########## Execute command block                                                       ###########
##################################################################################################

import sys

# Positional command-line arguments: home directory, date tag, version string.
homedir, datetag, currentvs = sys.argv[1], sys.argv[2], sys.argv[3]
wdir = '\\'.join((homedir, 'RVDBv' + currentvs))
rvdb_filename = sys.argv[4]
print(rvdb_filename)
filterset_filename = sys.argv[5]

from sequence_record_functions_PIPE import get_accs_flatfile as getaccs

# Filtering is enabled only when the filter-set file can be read; an IOError
# falls back to an empty filter set with filtering switched off.
try:
    filterset = getaccs(filterset_filename)
except IOError:
    filterset = set([])
    tofilter = False
else:
    tofilter = True

# Characterize the database file, then write the characterization output.
cdict = characterize_by_biological_category(
    wdir, rvdb_filename, tofilter, filterset)
write_characterization_output(wdir, rvdb_filename, cdict)
Пример #6
0
        nmapfilename = refseqdir + '\\' + fn
        break
# Full paths of every entry in refseqdir whose name starts with
# 'viral.genomic_'.
refseq_gbfilenames = []
for fn in os.listdir(refseqdir):
    if fn.startswith('viral.genomic_'):
        refseq_gbfilenames.append(refseqdir + '\\' + fn)
# nmapfilename is assigned by the loop that ends just above this section.
neighbors_dict = get_refseq_viral_neighbors_dict(nmapfilename)
neighbors = colldict(neighbors_dict)
# comments_dict is threaded through get_gb_comments: each call receives the
# dict built so far and its return value replaces it.
comments_dict = dict()
for refseq_gbfilename in refseq_gbfilenames:
    comments_dict = get_gb_comments(comments_dict, refseq_gbfilename)
# Two mappings of RefSeq accessions to original entries: one derived from
# the neighbors annotation plus comments, one from the comments alone.
dups = make_duplicates_dict(neighbors_dict, comments_dict)
dups2 = extend_duplicates_dict(comments_dict)
# Starts empty here; presumably filled by the loop that follows -- confirm.
dups3 = dict()
import sys
# RefSeq accessions reduced to the part before the first '.'.
rvdb_rs_accs = getaccs(refseqdir + '\\' + 'viral.genomic.eukviral.accs.txt')
rvdb_rs_accs = [rs_acc.split('.')[0] for rs_acc in rvdb_rs_accs]
# Log the size of each collection, then the number of keys that dups and
# dups2 have in common.
logf.write(str(len(rvdb_rs_accs)) + ' refseq viral accessions' + '\n')
logf.write(str(len(neighbors)) + ' neighbor accessions' + '\n')
logf.write(
    str(len(dups)) +
    ' refseq viral accessions mapped to original entries using neighbors annotation'
    + '\n')
logf.write(
    str(len(dups2)) +
    ' refseq viral accessions mapped to original entries using GenBank metadata (comments section)'
    + '\n')
logf.write(
    str(len(set(dups.keys()).intersection(set(dups2.keys())))) +
    ' refseq viral accessions in common between the two above' + '\n')
for rs_acc in rvdb_rs_accs: