'''
Generate a list of all pairwise comparisons of the exact matches.

The exact match file (probably phage.kmers.bacteria.rc.txt) is taken from
the first command line argument.
'''

import sys
import re
from phage import Phage
phage = Phage()

# Indexing sys.argv can only fail with IndexError (no argument given), so
# catch exactly that instead of using a bare except.
try:
    f = sys.argv[1]
except IndexError:
    sys.exit("Exact match file, probably phage.kmers.bacteria.rc.txt")

bg = phage.completeBacteriaIDs()
pg = phage.phageIDs()

# matches[phage id][bacteria id] starts at 0 for every phage/bacteria pair
matches = {}
for p in pg:
    matches[p] = {}
    for b in bg:
        matches[p][b] = 0

with open(f, 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        # raw string: \d is a regex class, not a string escape
        m = re.findall(r'NC_\d+', l)
        if len(m) != 2:
            # lines that do not contain exactly two NC ids are skipped silently
            continue
from phage import Phage
phage = Phage()

# Indexing sys.argv can only fail with IndexError (no argument given), so
# catch exactly that instead of using a bare except.
try:
    blastf = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + "< blast file>")

# read the fasta file of phages to get the lengths
lens = phage.phageSequenceLengths()
sys.stderr.write("Found " + str(len(lens)) + " sequences\n")

# get the phage and bacteria so we can limit our data appropriately
bacteriaG = set(phage.completeBacteriaIDs())
phageG = set(phage.phageIDs())

# hits[phage id] is filled with per-bacteria data further down the loop
hits = {}
for p in phageG:
    hits[p] = {}

with open(blastf, 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        # column 11 is presumably the BLAST e-value -- confirm against the
        # output format used; anything above 0.001 is ignored
        e = float(p[10])
        if e > 0.001:
            continue

        # raw string: \d is a regex class, not a string escape
        m = re.findall(r'(NC_\d+)', p[0])
        if not m:
            sys.stderr.write("WARNING: No phage found in " + p[0] + "\n")
            continue
from phage import Phage
phage = Phage()

# Indexing sys.argv can only fail with IndexError (no argument given), so
# catch exactly that instead of using a bare except.
try:
    f = sys.argv[1]
except IndexError:
    sys.exit(
        sys.argv[0] +
        " <blast output file converted to NC/NC format. Probably phage.genomes.blastx"
    )

count = {}
lens = phage.phageSequenceLengths()
# sets give constant-time membership tests inside the read loop
bctG = set(phage.completeBacteriaIDs())
phgG = set(phage.phageIDs())

# count[phage id] maps a bacteria id to a list that is created on first sight
for p in phgG:
    count[p] = {}

sys.stderr.write("Reading " + f + "\n")
# NOTE(review): `bin` shadows the builtin of the same name; it is left
# unchanged here because the rest of the loop body is outside this fragment.
with open(f, 'r') as bin:
    for l in bin:
        p = l.strip().split("\t")
        # only keep rows whose first two columns are a known phage and a
        # known complete bacterium
        if p[0] not in phgG:
            continue
        if p[1] not in bctG:
            continue

        if p[1] not in count[p[0]]:
            count[p[0]][p[1]] = []
import sys
from phage import Phage
phage = Phage()

# Indexing sys.argv can only fail with IndexError (no argument given), so
# catch exactly that instead of using a bare except.
try:
    f = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + " <blast output file converted to NC/NC format. Probably phage.genomes.blastx")

count = {}
lens = phage.phageSequenceLengths()
# sets give constant-time membership tests inside the read loop
bctG = set(phage.completeBacteriaIDs())
phgG = set(phage.phageIDs())

# count[phage id] maps a bacteria id to a list that is created on first sight
for p in phgG:
    count[p] = {}

sys.stderr.write("Reading " + f + "\n")
# NOTE(review): `bin` shadows the builtin of the same name; it is left
# unchanged here because the rest of the loop body is outside this fragment.
with open(f, 'r') as bin:
    for l in bin:
        p = l.strip().split("\t")
        # only keep rows whose first two columns are a known phage and a
        # known complete bacterium
        if p[0] not in phgG:
            continue
        if p[1] not in bctG:
            continue

        if p[1] not in count[p[0]]:
            count[p[0]][p[1]] = []
''' import sys import re from phage import Phage phage = Phage() try: f = sys.argv[1] except: sys.exit("Exact match file, probably phage.kmers.bacteria.rc.txt") bg = phage.completeBacteriaIDs() pg = phage.phageIDs() matches={} for p in pg: matches[p]={} for b in bg: matches[p][b] = 0 with open(f, 'r') as fin: for l in fin: p=l.strip().split("\t") m=re.findall('NC_\d+', l) if len(m) != 2: #sys.stderr.write("Error parsing two NC ids from " + l) continue
sys.stderr.write("No length for " + pnc + "\n") continue m = re.findall('(NC_\d+)', p[1]) if m == []: sys.stderr.write("WARNING: No bacteria found in " + p[1] + "\n") continue bnc = m[0] if bnc not in hits[pnc]: hits[pnc][bnc] = [] for i in xrange(lens[pnc] + 1): hits[pnc][bnc].append(0) for i in range(int(p[6]), int(p[7]) + 1): hits[pnc][bnc][i] = 1 # now print the table of phage and bacteria bacteria = phage.completeBacteriaIDs() phages = phage.phageIDs() print "Bacteria\t" + "\t".join(phages) for b in bacteria: print b for p in phages: if hits[p][b]: print "\t" + str(1.0 * sum(hits[p][b]) / lens[b]) else: print "\t0" print
# Count, for every phage/bacteria pair, how many lines of the input file name
# that pair in their first two tab-separated columns, then print one
# "phage<TAB>bacteria<TAB>count" row per pair (pairs without hits print 0).
import sys
from phage import Phage
phage = Phage()

# Indexing sys.argv can only fail with IndexError (no argument given), so
# catch exactly that instead of using a bare except.
try:
    f = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + " <blast output file converted to NC/NC format. Probably phage.genomes.blastx")

bctG = phage.completeBacteriaIDs()
phgG = phage.phageIDs()

# every known pair starts at zero so that pairs without hits are still reported
count = {}
for pid in phgG:
    count[pid] = dict.fromkeys(bctG, 0)

with open(f, 'r') as fin:
    for line in fin:
        fields = line.strip().split("\t")
        # blank or malformed lines have no second column; skip them instead
        # of failing with IndexError
        if len(fields) < 2:
            continue
        pid, bid = fields[0], fields[1]
        # only pairs initialised above are counted; anything else is ignored
        if pid in count and bid in count[pid]:
            count[pid][bid] += 1

for pid in count:
    for bid in count[pid]:
        print("\t".join([pid, bid, str(count[pid][bid])]))
phage = Phage()

# Indexing sys.argv can only fail with IndexError (no argument given), so
# catch exactly that instead of using a bare except.
try:
    blastf = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + "< blast file>")

# read the fasta file of phages to get the lengths
lens = phage.phageSequenceLengths()
sys.stderr.write("Found " + str(len(lens)) + " sequences\n")

# get the phage and bacteria so we can limit our data appropriately
bacteriaG = set(phage.completeBacteriaIDs())
phageG = set(phage.phageIDs())

# hits[phage id] is filled with per-bacteria data further down the loop
hits = {}
for p in phageG:
    hits[p] = {}

with open(blastf, 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        # column 11 is presumably the BLAST e-value -- confirm against the
        # output format used; anything above 0.001 is ignored
        e = float(p[10])
        if e > 0.001:
            continue

        # raw string: \d is a regex class, not a string escape
        m = re.findall(r'(NC_\d+)', p[0])
        if not m:
            sys.stderr.write("WARNING: No phage found in " + p[0] + "\n")
            continue