def alignmentTargets(genome_files, contig_files):
    '''
    build the parameter sets for aligning contigs to known ncbi genomes

    Every genome is paired with every contig file. For each pair the
    name of the nucmer delta file is derived from the contig file name
    (minus ".contigs.fa") and the genome base name (minus ".fna").

    Returns a list (not a generator) of [outfile, genome, contig]
    entries, one per (genome, contig) combination.
    '''
    parameters = []
    for genome, contig in itertools.product(genome_files, contig_files):
        # NOTE(review): the contig name is used as given (no basename),
        # so a contig path with directories ends up nested below
        # alignment.dir - confirm callers only pass bare file names.
        track = (P.snip(contig, ".contigs.fa") + "_vs_" +
                 P.snip(os.path.basename(genome), ".fna"))
        outfile = os.path.join("alignment.dir", track) + ".delta"
        parameters.append([outfile, genome, contig])
    return parameters
def filterContigsByCoverage(infiles, outfile):
    '''
    filter contigs by their average base coverage

    NOTE(review): this is a stub. It only prints the contig file next
    to the track name of each remaining input; no filtering is applied
    and outfile is never written - confirm the intended behaviour.

    infiles[0] is the contig file, infiles[1:] are ".load" files.
    '''
    # Looked up (although not applied yet) so that a missing
    # configuration value still fails here, as it did before.
    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]

    dbh = sqlite3.connect(PARAMS["database"])
    try:
        for infile in infiles[1:]:
            print(contig_file, P.snip(os.path.basename(infile), ".load"))
    finally:
        # the connection used to be left open
        dbh.close()
def chimeraTargets(alignment_files, contig_files):
    '''
    build the parameter sets for scoring chimericity

    Every alignment file is paired with every contig file. The output
    name is the alignment name minus ".bam" plus ".chimeras", placed
    below chimeras.dir.

    Returns a list (not a generator) of [outfile, alignment, contig]
    entries, one per (alignment, contig) combination.
    '''
    parameters = []
    # iterate over the alignment_files argument; the previous code
    # referred to an undefined name (genome_files) here
    for alignment, contig in itertools.product(alignment_files,
                                               contig_files):
        outfile = os.path.join("chimeras.dir",
                               P.snip(alignment, ".bam") + ".chimeras")
        parameters.append([outfile, alignment, contig])
    return parameters
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome

    Writes a tab-separated table (genome, length) to outfile with one
    row per fasta entry found in infile. The genome name is the base
    name of infile minus ".fna", so a multi-entry fasta file produces
    several rows carrying the same name.
    '''
    # context managers make sure both handles are closed, also when
    # parsing fails half way through
    with open(outfile, "w") as outf:
        outf.write("genome\tlength\n")
        with iotools.openFile(infile) as inf:
            # assume single fasta entry
            for fasta in FastaIterator.iterate(inf):
                name = P.snip(os.path.basename(infile), ".fna")
                # len() directly on the sequence; no need to copy it
                # into a list first
                outf.write("%s\t%s\n" % (name, len(fasta.sequence)))
def calculateFalsePositiveRate(infiles, outfile):
    '''
    calculate the false positive rate in taxonomic
    abundances

    infiles[0] is the file describing the true species set; the
    remaining files are estimates. Only estimate files whose base name,
    after skipping len("metaphlan_") leading characters, equals the
    base name of the true file are used. Species are read from the
    database tables named by P.toTable() for both files; estimated
    taxa containing "_unclassified" are ignored.

    Writes one row (track, tp_rate, fp_rate, fn_rate) to outfile.
    tp_rate and fp_rate are fractions of the estimated species, fn_rate
    is a fraction of the true species.

    Raises ZeroDivisionError if no estimated or no true species were
    collected. The rates are computed before outfile is opened so that
    no partial output file is left behind in that case.
    '''
    true_file = infiles[0]
    true_name = os.path.basename(true_file)
    true_set = set()
    estimate_set = set()

    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbh.cursor()
        for estimate_file in infiles[1:]:
            estimate_name = os.path.basename(estimate_file)
            if estimate_name[len("metaphlan_"):] != true_name:
                continue

            true_table = P.toTable(true_name)
            estimate_table = P.toTable(estimate_name)

            # table names cannot be bound as SQL parameters, hence the
            # string interpolation
            for species in cc.execute(
                    """SELECT species_name FROM %s""" %
                    true_table).fetchall():
                true_set.add(species[0])

            for species in cc.execute(
                    """SELECT taxon FROM %s
                    WHERE taxon_level == 'species'""" %
                    estimate_table).fetchall():
                if "_unclassified" in species[0]:
                    continue
                estimate_set.add(species[0])
    finally:
        # the connection used to be left open
        dbh.close()

    total_estimate = len(estimate_set)
    total_true = len(true_set)

    E.info("counting false positives and false negatives")
    false_positives = estimate_set.difference(true_set)
    print(false_positives)
    nfp = len(false_positives)
    nfn = len(true_set.difference(estimate_set))
    ntp = len(estimate_set.intersection(true_set))

    E.info("writing results")
    track = P.snip(true_name, ".load")
    # computed before the output file is created - see Raises above
    rates = [float(ntp) / total_estimate,
             float(nfp) / total_estimate,
             float(nfn) / total_true]

    with open(outfile, "w") as outf:
        outf.write("track\ttp_rate\tfp_rate\tfn_rate\n")
        outf.write("\t".join(map(str, [track] + rates)) + "\n")
def alignContigsToReference(outfile, param1, param2):
    '''
    align the contigs to the reference genomes
    using nucmer

    outfile is the target ".delta" file, param1 the reference genome
    and param2 the contig file. nucmer is run with the base name of
    outfile (minus ".delta") as its prefix and the file named after the
    base name of outfile is then moved into alignment.dir.
    '''
    # NOTE(review): P.run() is called without arguments, so it
    # presumably picks up `statement`, `to_cluster` and the values for
    # the %(...)s placeholders from the local variables of this
    # function - confirm before renaming or removing any local.
    to_cluster = True

    reffile, contigfile = param1, param2
    # prefix handed to nucmer via -p
    pattern = P.snip(os.path.basename(outfile), ".delta")
    statement = '''nucmer -p %(pattern)s %(reffile)s %(contigfile)s'''
    P.run()
    # the mv source is a bare file name, i.e. relative to the directory
    # in which the statement is executed
    outf = os.path.basename(outfile)
    statement = '''mv %(outf)s alignment.dir'''
    P.run()
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes

    Writes a tab-separated table (genome, size) to outfile with one row
    per input file. The genome name is the base name of the input file
    minus ".bed.gz"; the size is the sum of end - start over all bed
    entries (overlapping intervals are counted more than once).
    '''
    # context managers close the output and every input handle, also
    # if one of the bed files cannot be parsed
    with open(outfile, "w") as outf:
        outf.write("genome\tsize\n")
        for infile in infiles:
            genome = P.snip(os.path.basename(infile), ".bed.gz")
            with iotools.openFile(infile) as inf:
                size = sum(bed.end - bed.start
                           for bed in Bed.iterator(inf))
            outf.write("%s\t%s\n" % (genome, size))
def plotRelativeAbundanceCorrelations(infiles, outfile):
    '''
    plot the correlation between the estimated
    relative abundance of species and the true
    relative abundances - done on the shared set

    infiles[0] is the file describing the true abundances; the
    remaining files are estimates. Only estimate files whose base name,
    after skipping len("metaphlan_") leading characters, equals the
    base name of the true file are used. Pairs of (true, estimated)
    abundance are collected from the database into a temporary table,
    which is read into R and plotted as a png to outfile. The estimates
    are divided by 100 before plotting.

    The temporary file is removed and the database connection closed
    even if querying or plotting fails.
    '''
    true_file = infiles[0]
    true_name = os.path.basename(true_file)

    temp = P.getTempFile()
    inf = temp.name
    try:
        # connect to database
        dbh = sqlite3.connect(PARAMS["database"])
        try:
            cc = dbh.cursor()
            temp.write("true\testimate\n")
            for estimate_file in infiles[1:]:
                estimate_name = os.path.basename(estimate_file)
                if estimate_name[len("metaphlan_"):] != true_name:
                    continue

                true_table = P.toTable(true_name)
                estimate_table = P.toTable(estimate_name)

                # get data
                # 'species' is single-quoted: a double-quoted token is
                # an identifier in SQL and SQLite only falls back to a
                # string literal if no such column exists
                statement = """SELECT a.relab, b.rel_abundance
                FROM %s as a, %s as b
                WHERE b.taxon_level == 'species'
                AND a.species_name == b.taxon""" % (true_table,
                                                    estimate_table)

                for data in cc.execute(statement).fetchall():
                    true_abundance, estimate = data[0], data[1]
                    temp.write("%f\t%f\n" % (true_abundance, estimate))
        finally:
            temp.close()
            dbh.close()

        print(inf)

        main_name = P.snip(outfile, ".png")
        R('''data <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' % inf)
        R('''png("%s")''' % outfile)
        try:
            R('''data$estimate <- data$estimate/100''')
            R(('''plot(data$estimate, data$true, pch = 16, main = "%s", '''
               '''xlab = "estimated relative abundance", '''
               '''ylab = "observed relative abundance")''') % main_name)
            R('''text(0.05, y = 0.35, labels = paste("r = ", round(cor(data$estimate, data$true),2)), cex = 2)''')
        finally:
            # always close the png device once it has been opened
            R["dev.off"]()
    finally:
        os.unlink(inf)