def __call__(self, track, slice=None):
    fn = os.path.join(
        DATADIR,
        "replicated_intervals/%(track)s.peakshape.gz.matrix_%(slice)s.gz" % locals())
    if not os.path.exists(fn):
        return

    x = IOTools.openFile(fn)
    matrix, rownames, colnames = IOTools.readMatrix(x)

    nrows = len(rownames)
    if nrows == 0:
        return

    # keep at most self.scale evenly spaced rows
    if nrows > self.scale:
        take = numpy.array(numpy.floor(
            numpy.arange(0, nrows, float(nrows + 1) / self.scale)),
            dtype=int)
        rownames = [rownames[x] for x in take]
        matrix = matrix[take]

    return odict(
        (('matrix', matrix),
         ('rows', rownames),
         ('columns', colnames)))

def buildPFAMDomains(infiles, outfile):
    '''map PFAM domains onto current sequence collection.

    The mapping is done by ID lookup.'''
    infile = infiles[0]
    with IOTools.openFile("nrdb50.fasta.tsv") as inf:
        reader = csv.DictReader(inf, dialect='excel-tab')
        map_id2nid = {}
        for row in reader:
            map_id2nid[row['repid']] = row['nid']

    rx = re.compile(r"(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);")

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile) as inf:
        for entry in FastaIterator.iterate(inf):
            c.input += 1
            pid, start, end, pfam_id, description = rx.match(entry.title).groups()
            try:
                outf.write("%s\t%i\t%i\t%s\n" %
                           (map_id2nid[pid], int(start) - 1, int(end), pfam_id))
            except KeyError:
                c.missed += 1
                continue
            c.output += 1

    outf.close()
    E.info(c)

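# A minimal sketch of the title parsing used by buildPFAMDomains, assuming
# Pfam-style FASTA headers of the form "<repid>/<start>-<end> <accession>; <name>;".
# The example title below is hypothetical, not taken from the pipeline data.
import re

rx = re.compile(r"(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);")
title = "Q9XYZ1_HUMAN/23-120 PF00069.28; Pkinase;"
pid, start, end, pfam_id, description = rx.match(title).groups()
# the pipeline writes a 0-based start, hence the "- 1"
print(pid, int(start) - 1, int(end), pfam_id)
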
def __call__(self, track, slice=None): fn = "ortholog_pairs_with_feature.matrix2" if not os.path.exists(fn): return x = IOTools.openFile(fn) matrix, rownames, colnames = IOTools.readMatrix(x) return odict((("matrix", matrix), ("rows", rownames), ("columns", colnames)))
def __call__(self, track, slice=None): fn = "ortholog_pairs_with_feature.matrix2" if not os.path.exists(fn): return x = IOTools.openFile(fn) matrix, rownames, colnames = IOTools.readMatrix(x) return odict( (('matrix', matrix), ('rows', rownames), ('columns', colnames)))
def __call__(self, track, slice=None):
    c_transcript = []
    c_gene = []
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_transcript.append(len(transcript))
    for gene in GTF.flat_gene_iterator(
            GTF.iterator(IOTools.openFile(self.getFilename(track)))):
        c_gene.append(len(gene))

    return odict((("transcript", np.mean(c_transcript)),
                  ("gene", np.mean(c_gene))))

def configToDictionary(config):
    '''convert a ConfigParser object into a dictionary.

    Section and key are joined by an underscore. Keys in the
    "general" or "DEFAULT" sections are additionally added without
    the section prefix.
    '''
    p = {}
    for section in config.sections():
        for key, value in config.items(section):
            v = IOTools.convertValue(value)
            p["%s_%s" % (section, key)] = v
            if section in ("general", "DEFAULT"):
                p["%s" % key] = v

    for key, value in config.defaults().iteritems():
        p["%s" % key] = IOTools.convertValue(value)

    return p

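# A sketch (Python 3 configparser) of the key flattening performed by
# configToDictionary. IOTools.convertValue is approximated here by a simple
# int/float fallback, which may not match the real helper exactly.
import configparser

def _convert(value):
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            pass
    return value

config = configparser.ConfigParser()
config.read_string("[general]\ninput=input1.file\n\n[special]\nthreads=4\n")

p = {}
for section in config.sections():
    for key, value in config.items(section):
        v = _convert(value)
        p["%s_%s" % (section, key)] = v
        if section in ("general", "DEFAULT"):
            p[key] = v

# p -> {'general_input': 'input1.file', 'input': 'input1.file', 'special_threads': 4}
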
def __call__(self, track, slice = None): if slice == "transcript": lengths_transcripts = [] for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))): length = sum([gtf.end - gtf.start for gtf in transcript]) lengths_transcripts.append(length) return np.mean(lengths_transcripts) elif slice == "gene": lengths_genes = [] for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))): length = sum([gtf.end - gtf.start for gtf in gene]) lengths_genes.append(length) return np.mean(lengths_genes)
def getParameters(filename="pipeline.ini"): '''read a config file and return as a dictionary. Sections and keys are combined with an underscore. If a key without section does not exist, it will be added plain. For example:: [general] input=input1.file [special] input=input2.file will be entered as { 'general_input' : "input1.file", 'input: "input1.file", 'special_input' : "input2.file" } This function also updates the module-wide parameter map. ''' p = {} config = ConfigParser.ConfigParser() config.readfp(open(filename), "r") for section in config.sections(): for key, value in config.items(section): v = IOTools.convertValue(value) if key not in p: p[key] = v p["%s_%s" % (section, key)] = v PARAMS.update(p) return p
def buildSummaryCpGCoverage(infiles, outfile):
    '''build summary of CpG coverage across tracks.'''

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("metatrack\ttrack\tcoverage\tncovered\tpcovered\n")

    for track in TRACKS:
        tables = [x[0] for x in cc.execute(
            """SELECT name FROM medip_%s.sqlite_master
               WHERE type='table' AND name LIKE '%%coveredpos%%' """ % track).fetchall()]

        for table in tables:
            statement = """SELECT '%(track)s' AS metatrack,
                                  '%(table)s' AS track,
                                  coverage, ncovered, pcovered
                           FROM medip_%(track)s.%(table)s"""
            for x in cc.execute(statement % locals()):
                outf.write("\t".join(map(str, x)) + "\n")

    outf.close()

def buildSummaryCalledDMRs(infiles, outfile):
    '''build summary of differentially methylated regions.'''

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("metatrack\ttest\tntested\tnok\tnsignificant\tn2fold\n")

    for track in TRACKS:
        tables = [x[0] for x in cc.execute(
            """SELECT name FROM medip_%s.sqlite_master
               WHERE type='table'
                 AND sql LIKE '%%control_mean%%'
                 AND sql LIKE '%%treatment_mean%%'""" % track).fetchall()]

        for table in tables:
            statement = """SELECT
                  COUNT(*) AS ntested,
                  SUM(CASE WHEN status='OK' THEN 1 ELSE 0 END) AS nok,
                  SUM(CASE WHEN significant THEN 1 ELSE 0 END) AS nsignificant,
                  SUM(CASE WHEN significant AND (l2fold < -1 OR l2fold > 1)
                      THEN 1 ELSE 0 END) AS n2fold
               FROM medip_%(track)s.%(table)s"""

            ntested, nok, nsignificant, n2fold = cc.execute(
                statement % locals()).fetchone()

            outf.write("\t".join(map(
                str, (track, table, ntested, nok, nsignificant, n2fold))) + "\n")

    outf.close()

def buildSummaryMapping(infiles, outfile):
    '''collate per-track bam_stats tables into a single summary table.'''

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")

    table = "bam_stats"
    colnames = None
    for track in TRACKS:
        statement = """SELECT * FROM medip_%(track)s.%(table)s"""

        data = cc.execute(statement % locals()).fetchall()
        _colnames = [x[0] for x in cc.description]
        if not colnames:
            colnames = _colnames
            outf.write("\t".join(["metatrack"] + colnames) + "\n")

        assert colnames == _colnames

        for row in data:
            outf.write("\t".join(map(str, (track,) + row)) + "\n")

    outf.close()

def getParameters( filename = "pipeline.ini" ): '''read a config file and return as a dictionary. Sections and keys are combined with an underscore. If a key without section does not exist, it will be added plain. For example:: [general] input=input1.file [special] input=input2.file will be entered as { 'general_input' : "input1.file", 'input: "input1.file", 'special_input' : "input2.file" } This function also updates the module-wide parameter map. ''' p = {} config = ConfigParser.ConfigParser() config.readfp(open(filename),"r") for section in config.sections(): for key,value in config.items( section ): v = IOTools.convertValue( value ) if key not in p: p[key] = v p["%s_%s" % (section,key)] = v PARAMS.update( p ) return p
def __call__(self, track, slice = None): classes = ["antisense" , "antisense_upstream" , "antisense_downstream" , "sense_upstream" , "sense_downstream" , "intergenic" , "sense_intronic" , "antisense_intronic"] coding_set = {} for gtf in GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")): coding_set[gtf.transcript_id] = gtf.source result = {"noncoding": {}, "coding":collections.defaultdict(int)} total_nc = float(self.getValue("SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'")) for c in classes: result["noncoding"][c] = (float(self.getValue("""SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b WHERE a.class = '%s' AND b.C_NC = 'noncoding' AND a.transcript_id = b.transcript_id""" % (track,c)))/total_nc)*100 total_c = len(coding_set.keys()) for c in classes: ids = self.getValues("SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'") for i in ids: if i in coding_set.keys(): if coding_set[i] == c: result["coding"][c] += 1 for x, y in result["coding"].iteritems(): result["coding"][x] = (float(y)/total_c)*100 return result
def __call__(self, track, slice = None): fn = os.path.join( DATADIR, "%(track)s.peakshape.tsv.gz.matrix_%(slice)s.gz" % locals() ) if not os.path.exists( fn ): return matrix, rownames, colnames = IOTools.readMatrix( IOTools.openFile( fn )) nrows = len(rownames) if nrows == 0: return if nrows > 1000: take = numpy.array( numpy.floor( numpy.arange( 0, nrows, nrows / 1000 ) ), dtype = int ) rownames = [ rownames[x] for x in take ] matrix = matrix[ take ] return odict( (('matrix', matrix), ('rows', rownames), ('columns', colnames)) )
def getReferenceLincRNA(self, reference_gtf):
    lincs = []
    for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
        if entry.source == "lincRNA":
            if entry.gene_id not in lincs:
                lincs.append(entry.gene_id)
    return len(lincs)

def __call__(self, track, slice=None):
    transcript_counts = collections.defaultdict(set)
    counts = []
    for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
        transcript_counts[gtf.gene_id].add(gtf.transcript_id)
    for gene, transcripts in transcript_counts.iteritems():
        counts.append(len(transcripts))
    return counts

def checkBlastRuns(infiles, outfile):
    '''check if output files are complete.'''

    outf = IOTools.openFile(outfile, "w")

    outf.write("chunkid\tquery_first\tquery_last"
               "\tfound_first\tfound_last\tfound_total\tfound_results"
               "\thas_finished\tattempts\t%s\n" %
               "\t".join(Logfile.RuntimeInformation._fields))

    for infile in infiles:
        E.debug("processing %s" % infile)
        chunkid = P.snip(os.path.basename(infile), ".blast.gz")
        logfile = infile + ".log"
        chunkfile = P.snip(infile, ".blast.gz") + ".fasta"

        with IOTools.openFile(infile) as inf:
            l = inf.readline()
            ids = set()
            total_results = 0
            for l in inf:
                if l.startswith("#//"):
                    continue
                ids.add(int(l.split("\t")[0]))
                total_results += 1
            found_first = min(ids)
            found_last = max(ids)
            found_total = len(ids)

        l = IOTools.getFirstLine(chunkfile)
        query_first = l[1:-1]
        l2 = IOTools.getLastLine(chunkfile, nlines=2).split("\n")
        query_last = l2[0][1:]

        logresults = Logfile.parse(logfile)

        outf.write("\t".join(map(str, (
            chunkid,
            query_first,
            query_last,
            found_first,
            found_last,
            found_total,
            total_results,
            logresults[-1].has_finished,
            len(logresults),
            "\t".join(map(str, logresults[-1]))))) + "\n")

    outf.close()

def __call__(self, track, slice=None):
    pattern = self.pattern
    fn = os.path.join(
        DATADIR,
        "liver_vs_testes/%(track)s%(pattern)s.matrix_%(slice)s.gz" % locals())
    if not os.path.exists(fn):
        return

    x = IOTools.openFile(fn)
    matrix, rownames, colnames = IOTools.readMatrix(x)

    nrows = len(rownames)
    if nrows == 0:
        return

    # keep at most self.scale evenly spaced rows
    if nrows > self.scale:
        take = numpy.array(numpy.floor(
            numpy.arange(0, nrows, float(nrows + 1) / self.scale)),
            dtype=int)
        rownames = [rownames[x] for x in take]
        matrix = matrix[take]

    return odict(
        (('matrix', matrix),
         ('rows', rownames),
         ('columns', colnames)))

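# Stand-alone sketch of the row downsampling used by the matrix trackers above:
# if a matrix has more rows than `scale`, roughly evenly spaced rows are kept.
# The matrix and labels here are illustrative only.
import numpy

scale = 5
matrix = numpy.arange(40).reshape(20, 2)   # 20 rows, 2 columns
rownames = ["row%i" % i for i in range(20)]

nrows = len(rownames)
if nrows > scale:
    take = numpy.array(
        numpy.floor(numpy.arange(0, nrows, float(nrows + 1) / scale)),
        dtype=int)
    rownames = [rownames[x] for x in take]
    matrix = matrix[take]

print(rownames)        # 5 evenly spaced row labels
print(matrix.shape)    # (5, 2)
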
def __call__(self, track, slice = None): if slice == "transcript": lengths_transcripts = [] for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))): length = sum([gtf.end - gtf.start for gtf in transcript]) lengths_transcripts.append(length) counts, lower, dx, _ = scipy.stats.cumfreq(lengths_transcripts, numbins=40, defaultreallimits=(0,20000)) x = np.arange(counts.size) * dx + lower return odict( (("length", x), ("cumulative frequency", counts/len(lengths_transcripts))) ) elif slice == "gene": lengths_genes = [] for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))): length = sum([gtf.end - gtf.start for gtf in gene]) lengths_genes.append(length) counts, lower, dx, _ = scipy.stats.cumfreq(lengths_genes, numbins=40, defaultreallimits=(0,20000)) x = np.arange(counts.size) * dx + lower return odict( (("length", x), ("cumulative frequency", counts/len(lengths_genes))) )
def __call__(self, track, slice=None):
    transcript_counts = collections.defaultdict(set)
    counts = []
    for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
        transcript_counts[gtf.gene_id].add(gtf.transcript_id)
    for gene, transcripts in transcript_counts.iteritems():
        counts.append(len(transcripts))

    count, lower, dx, _ = scipy.stats.cumfreq(
        counts, numbins=40, defaultreallimits=(1, 15))
    x = np.arange(count.size) * dx + lower
    return odict((("transcript number", x),
                  ("cumulative frequency", count / len(counts))))

def buildNrdb50(infile, outfile):
    '''build nrdb50.

    Renumber sequences.'''
    outf_fasta = IOTools.openFile(outfile, "w")
    outf_table = IOTools.openFile(outfile + ".tsv", "w")
    outf_table.write("nid\tpid\thid\tdescription\tcluster_size\ttaxon\trepid\n")

    rx = re.compile(r"(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)")

    nid = 1
    for entry in FastaIterator.iterate(IOTools.openFile(infile)):
        outf_fasta.write(">%i\n%s\n" % (nid, entry.sequence))
        cluster_name, description, cluster_size, taxon, repid = rx.match(
            entry.title).groups()
        hid = computeHID(entry.sequence)
        outf_table.write("\t".join(
            (str(nid), cluster_name, hid, description,
             cluster_size, taxon, repid)) + "\n")
        nid += 1

    outf_fasta.close()
    outf_table.close()

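# Sketch of the UniRef-style title parsing in buildNrdb50; the title below is
# a hypothetical example of the "name description n=<size> Tax=<taxon> RepID=<id>"
# layout the regex expects.
import re

rx = re.compile(r"(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)")
title = "UniRef50_P12345 Some protein n=42 Tax=Homo sapiens RepID=P12345_HUMAN"
cluster_name, description, cluster_size, taxon, repid = rx.match(title).groups()
print(cluster_name, cluster_size, taxon, repid)
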
def testMCL(self):
    tab_path = Path('./tab.txt')
    ival = 30
    pival = 18
    df_chr, sample_list = LoadInput.loadToPandas(
        'testdata2.hdf', 'testdata2.tsv', 'A', filter=False)
    sections = [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
    pool = Pool(initializer=pc.mclInit,
                initargs=(sample_list, tab_path, ival, pival))
    IOTools.writeTab(sample_list, 'tab.txt')
    clusters = pool.map(
        pc.mclWorker,
        [df_chr.loc[(slice(None), section), :] for section in sections],
        chunksize=10)
    expected = [[['A', 'B', 'C']]]
    assert clusters == expected

def getNumColumns(filename):
    '''return the number of fields in a bed file by looking at the first
    data line, skipping comment and track lines. Returns 0 if the file
    is empty.'''
    with IOTools.openFile(filename) as inf:
        for line in inf:
            if line.startswith("#"):
                continue
            if line.startswith("track"):
                continue
            return len(line[:-1].split("\t"))
    return 0

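# Usage sketch for getNumColumns with a plain (uncompressed) bed file written
# to a temporary location; IOTools.openFile transparently handles gzip, which
# plain open() below does not attempt to reproduce.
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".bed", delete=False) as tmp:
    tmp.write("track name=example\n")
    tmp.write("chr1\t100\t200\tinterval1\n")
    bedfile = tmp.name

with open(bedfile) as inf:
    ncolumns = 0
    for line in inf:
        if line.startswith(("#", "track")):
            continue
        ncolumns = len(line[:-1].split("\t"))
        break

print(ncolumns)  # 4
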
def checkBlastRun(infiles, outfile):
    '''build summary stats on file.'''

    pairsdbfile, seqfile = infiles

    nids = set()
    with IOTools.openFile(seqfile) as inf:
        for r in FastaIterator.iterate(inf):
            nids.add(int(r.title))

    with IOTools.openFile(pairsdbfile) as inf:
        query_ids, sbjct_ids = set(), set()
        total_results, self_links = 0, 0
        for l in inf:
            if l.startswith("#//"):
                continue
            query_id, sbjct_id = l.split("\t")[:2]
            query_ids.add(int(query_id))
            sbjct_ids.add(int(sbjct_id))
            if query_id == sbjct_id:
                self_links += 1
            total_results += 1

    outf = IOTools.openFile(outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("\t".join(map(str, ('nids', len(nids)))) + "\n")
    outf.write("\t".join(map(str, ('links', total_results))) + "\n")
    outf.write("\t".join(map(str, ('self', self_links))) + "\n")
    outf.write("\t".join(map(str, ('queries', len(query_ids)))) + "\n")
    outf.write("\t".join(map(str, ('sbjcts', len(sbjct_ids)))) + "\n")
    outf.close()

    outf = IOTools.openFile(outfile + '.missing_queries.gz', 'w')
    outf.write('nid\n')
    outf.write("\n".join(map(str, sorted(nids.difference(query_ids)))) + "\n")
    outf.close()

    outf = IOTools.openFile(outfile + '.missing_sbjcts.gz', 'w')
    outf.write('nid\n')
    outf.write("\n".join(map(str, sorted(nids.difference(sbjct_ids)))) + "\n")
    outf.close()

def collectGenomeSizes(infile, outfile):
    '''output the genome size for each genome.'''
    to_cluster = True

    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(list(fasta.sequence))
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()

def buildPFAMFamilies(infiles, outfile):

    outf = IOTools.openFile(outfile, "w")
    outf.write("family\tdescription\tshort\n")

    infile = infiles[1]
    family, description, short = None, None, None
    c = E.Counter()
    with IOTools.openFile(infile) as inf:
        for line in inf:
            if line.startswith("#=GF AC"):
                # a new accession starts a new record; flush the previous one
                if family:
                    outf.write("%s\n" % "\t".join((family, description, short)))
                    c.output += 1
                family = re.match(r"#=GF AC\s+(\S+)", line[:-1]).groups()[0]
            elif line.startswith("#=GF DE"):
                description = re.match(r"#=GF DE\s+(.+)", line[:-1]).groups()[0]
            elif line.startswith("#=GF ID"):
                short = re.match(r"#=GF ID\s+(.+)", line[:-1]).groups()[0]

    # flush the last record
    outf.write("%s\n" % "\t".join((family, description, short)))
    c.output += 1

    outf.close()
    E.info(c)

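# Sketch of the Stockholm "#=GF" header parsing in buildPFAMFamilies,
# using a fabricated three-line record rather than the real Pfam-A file.
import re

lines = [
    "#=GF ID   Pkinase\n",
    "#=GF AC   PF00069.28\n",
    "#=GF DE   Protein kinase domain\n",
]
family = description = short = None
for line in lines:
    if line.startswith("#=GF AC"):
        family = re.match(r"#=GF AC\s+(\S+)", line[:-1]).groups()[0]
    elif line.startswith("#=GF DE"):
        description = re.match(r"#=GF DE\s+(.+)", line[:-1]).groups()[0]
    elif line.startswith("#=GF ID"):
        short = re.match(r"#=GF ID\s+(.+)", line[:-1]).groups()[0]
print("\t".join((family, description, short)))
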
def buildAlignmentSizes(infiles, outfile):
    '''use bed files to sum the total number of bases
    that are aligned to the genomes.'''

    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()

def removeBlastUnfinished(infiles, outfile):
    '''remove aborted blast runs.'''

    deleted = 0

    for infile in infiles:
        line = IOTools.getLastLine(infile)
        if not re.search("job finished", line):
            fn = infile[:-len(".log")]
            if os.path.exists(fn):
                P.info("deleting %s" % fn)
                os.unlink(fn)
                deleted += 1

    P.info("deleted %i files" % deleted)

def __call__(self, track):
    length = {}
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.gtf.gz"))):
        length[transcript[0].transcript_id] = sum(
            [gtf.end - gtf.start for gtf in transcript])

    score = {}
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    for data in cc.execute(
            "SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"):
        score[data[0]] = data[1]

    result = {"length": [], "score": []}
    for transcript, value in length.iteritems():
        result["length"].append(np.log10(length[transcript]))
        result["score"].append(score[transcript])
    return result

def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''get species-level relative abundances for the simulated data.

    This involves creating maps between different identifiers from the
    NCBI taxonomy, so that the results are comparable to the species-level
    analysis from metaphlan.

    The gi_taxid_nucl table is huge and therefore this function takes an
    age to run - could think of optimising this somehow.
    '''
    to_cluster = True

    total = 0
    rel_abundance = collections.defaultdict(int)
    for fastq in Fastq.iterate(IOTools.openFile(infile)):
        total += 1
        gi = fastq.identifier.split("|")[1]
        rel_abundance[gi] += 1
    for gi, ab in rel_abundance.items():
        rel_abundance[gi] = float(ab) / total

    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    result = collections.defaultdict(float)
    for gi in list(rel_abundance.keys()):
        E.info("processing gi %s" % gi)
        taxid = cc.execute(
            """SELECT taxid FROM gi_taxid_nucl WHERE gi == '%s'""" % gi).fetchone()[0]
        species_id = cc.execute(
            """SELECT species_id FROM categories WHERE taxid == '%s'""" % taxid).fetchone()[0]
        species_name = cc.execute(
            """SELECT taxname FROM names WHERE taxid == '%s'
               AND description == 'scientific name'""" % species_id).fetchone()[0]
        abundance = rel_abundance[gi]
        E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" %
               (str(gi), str(taxid), str(species_id), species_name))
        result[species_name] += abundance

    outf = open(outfile, "w")
    outf.write("species_name\trelab\n")
    for species_name, abundance in result.items():
        # create names consistent with metaphlan
        species_name = species_name.replace(" ", "_")
        outf.write("%s\t%f\n" % (species_name, abundance))
    outf.close()

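# Self-contained sketch of the gi -> taxid -> species -> name lookups above,
# using an in-memory SQLite database with fabricated rows; the real pipeline
# queries the (very large) gi_taxid_nucl table of the NCBI taxonomy instead.
import sqlite3

dbh = sqlite3.connect(":memory:")
cc = dbh.cursor()
cc.execute("CREATE TABLE gi_taxid_nucl (gi TEXT, taxid TEXT)")
cc.execute("CREATE TABLE categories (taxid TEXT, species_id TEXT)")
cc.execute("CREATE TABLE names (taxid TEXT, taxname TEXT, description TEXT)")
cc.execute("INSERT INTO gi_taxid_nucl VALUES ('12345', '562')")
cc.execute("INSERT INTO categories VALUES ('562', '562')")
cc.execute("INSERT INTO names VALUES ('562', 'Escherichia coli', 'scientific name')")

gi = "12345"
taxid = cc.execute(
    "SELECT taxid FROM gi_taxid_nucl WHERE gi = ?", (gi,)).fetchone()[0]
species_id = cc.execute(
    "SELECT species_id FROM categories WHERE taxid = ?", (taxid,)).fetchone()[0]
species_name = cc.execute(
    "SELECT taxname FROM names WHERE taxid = ? AND description = 'scientific name'",
    (species_id,)).fetchone()[0]
print(species_name.replace(" ", "_"))  # Escherichia_coli
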
def buildMatrixFromTables(infiles, column, column_header=0,
                          dtype=numpy.float, default=None):
    '''build a matrix from a column called *column* in a series of input files.

    *column_header* denotes the column that supplies the row names
    (by default the first column). The columns of the matrix are given
    by the order of the input files.

    returns matrix, row_headers
    '''
    lists = []
    for infile in infiles:
        data = pandas.read_table(IOTools.openFile(infile))
        lists.append(zip(list(data[column_header]), list(data[column])))

    return buildMatrixFromLists(lists, dtype=dtype, default=default)

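# Sketch of the per-file column extraction that feeds buildMatrixFromLists,
# with two small in-memory tables standing in for the input files; the column
# names 'gene' and 'count' are hypothetical.
import pandas
from io import StringIO

files = [
    StringIO("gene\tcount\ng1\t10\ng2\t5\n"),
    StringIO("gene\tcount\ng1\t3\ng3\t7\n"),
]
lists = []
for infile in files:
    data = pandas.read_table(infile)
    lists.append(list(zip(list(data["gene"]), list(data["count"]))))

# lists == [[('g1', 10), ('g2', 5)], [('g1', 3), ('g3', 7)]]
print(lists)
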
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage = globals()["__doc__"] ) ## add common options (-h/--help, ...) and parse command line (options, args) = E.Start( parser, argv = argv ) coords_file=args[0] bamfile=pysam.Samfile( args[1], 'rb' ) # bamfile options.stdout.write( "gene_id\tcounts\tlength\n" ) iter = Bed.iterator( IOTools.openFile( coords_file ) ) for gene_id, exons in itertools.groupby( iter, lambda x: x.name ): num_reads=0 anames=set([]) lgene = 0 for bed in exons: lgene += bed.end - bed.start for alignedread in bamfile.fetch(bed.contig, bed.start, bed.end): anames.add((alignedread.qname, alignedread.is_read1)) num_reads = len(anames) options.stdout.write( "\t".join( (gene_id, str(num_reads), str(lgene ) )) + "\n" ) ## write footer and output benchmark information. E.Stop()
def iterator_sorted( gff_iterator, sort_order = "gene" ): '''sort input and yield sorted output.''' entries = list(gff_iterator) if sort_order == "gene": entries.sort( key = lambda x: (x.gene_id, x.transcript_id, x.contig, x.start) ) elif sort_order == "gene": entries.sort( key = lambda x: (x.gene_id, x.contig, x.start) ) elif sort_order == "contig+gene": entries.sort( key = lambda x: (x.contig,x.gene_id,x.transcript_id,x.start) ) elif sort_order == "transcript": entries.sort( key = lambda x: (x.transcript_id, x.contig, x.start) ) elif sort_order == "position": entries.sort( key = lambda x: (x.contig, x.start) ) elif sort_order == "position+gene": entries.sort( key = lambda x: (x.gene_id, x.start) ) genes = list( flat_gene_iterator(entries) ) genes.sort( key = lambda x: (x[0].contig, x[0].start) ) entries = IOTools.flatten( genes ) for entry in entries: yield entry
def iterator_sorted(gff_iterator, sort_order="gene"): '''sort input and yield sorted output.''' entries = list(gff_iterator) if sort_order == "gene": entries.sort( key=lambda x: (x.gene_id, x.transcript_id, x.contig, x.start)) elif sort_order == "gene": entries.sort(key=lambda x: (x.gene_id, x.contig, x.start)) elif sort_order == "contig+gene": entries.sort( key=lambda x: (x.contig, x.gene_id, x.transcript_id, x.start)) elif sort_order == "transcript": entries.sort(key=lambda x: (x.transcript_id, x.contig, x.start)) elif sort_order == "position": entries.sort(key=lambda x: (x.contig, x.start)) elif sort_order == "position+gene": entries.sort(key=lambda x: (x.gene_id, x.start)) genes = list(flat_gene_iterator(entries)) genes.sort(key=lambda x: (x[0].contig, x[0].start)) entries = IOTools.flatten(genes) for entry in entries: yield entry
#######################################################################
# retrieve structure
if options.filename_pdb:
    infile = open(options.filename_pdb, "r")
    pdb_lines = infile.readlines()
    infile.close()
else:
    pdb_lines = os.popen(
        param_retrieval_command % string.lower(param_pdb_id)).readlines()

viewer = PdbTools.RasmolViewInline(pdb_lines, sys.stdout)
viewer.Command("echo %s" % message)

if options.filename_fasta:
    infile = open(options.filename_fasta, "r")
    description, reference_sequence = IOTools.readSequence(infile)
    infile.close()
else:
    reference_sequence = None

if DEBUG:
    viewer.Command("echo cmdline: %s" % (string.join(sys.argv, " ")))

if not pdb_lines:
    viewer.Command("echo error: structure not found in local database")
    viewer.WriteScript()
    sys.exit()

if reference_sequence:
    map_pdb2seq, rmap_pdb2seq, rmap_seq2pdb, lstructure, first_residue, last_residue, sequence = \
        PdbTools.buildMapPdb2Sequence(reference_sequence,
                                      options.filename_pdb,
                                      options,

# python ..\ChromatinImagingV2\Scripts\BatchAllnew.py
import sys, os

# add path
workbookDir = os.getcwd()
sys.path.append(os.path.dirname(workbookDir) + os.sep + r'\CommonTools')
import IOTools as io

if __name__ == "__main__":
    script = r'"' + workbookDir + os.sep + r'BatchSequentialSmall2colV3.py"'
    str_runs = []
    for i in range(10):
        str_runs.append('python ' + script + ' ' + str(i))
    io.batch_command(str_runs, batch_size=10)

dest="dump", action="store_true", help="dump output.") parser.set_defaults( separator="|", dump=False, filename_map=None, filename_alignment="-", filename_tree=None, ) (options, args) = E.Start(parser) if options.filename_map: map_species2sp = IOTools.ReadMap(open(options.filename_map, "r")) E.debug("species map: %s" % str(map_species2sp)) identifier_parser = IdentifierParserGPipe(map_species2sp=map_species2sp) njtree = NJTree(identifier_parser=identifier_parser) njtree.SetLog(options.stdlog) njtree.SetErr(options.stderr) if options.filename_tree: njtree.SetSpeciesTree(options.filename_tree) mali = Mali.Mali() if options.filename_alignment == "-":
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: CBioPortal.py 2888 2012-06-07 15:52:00Z ians $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--output_file", type="string", default=None,
        help="[Optional] Filename to output results to. [default=STDOUT]")

    parser.add_option(
        "-u", "--url", type="string",
        default="http://www.cbioportal.org/public-portal/webservice.do",
        help="[Optional] Url to the cBioPortal webservice [default=%default]")

    cqueryopts = optparse.OptionGroup(
        parser, "Common parameters", "Common arguments to the query")

    cqueryopts.add_option(
        "-s", "--study_id", dest="study_id", type="string", default=None,
        help="[Required/Optional] cBioPortal ID for study [default=%default].\n"
             "This or study_name required for: getGeneticProfiles, getCaseLists, "
             "getProteinArrayInfo, getLink, getOncoprintHTML, getPercentAltered, "
             "getTotalAltered")

    cqueryopts.add_option(
        "-n", "--study_name", dest="study_name", type="string", default=None,
        help="[Required/Optional] cBioPortal Name for study [default=%default].\n"
             "See above for which commands require this.")

    cqueryopts.add_option(
        "-c", "--case_set_id", dest="case_set_id", type="string", default=None,
        help="[Required for some] cBioPortal case_set_id specifying the case list "
             "to use.\nRequired for getProfileData, getMutationData, getClinicalData, "
             "getProteinArrayData, getPercentAltered, getTotalAltered. "
             "Default is case_set_id for case list 'All Tumours'")

    cqueryopts.add_option(
        "-g", "--gene_list", dest="gene_list", type="string", default=None,
        help="[Required for some] Comma separated list of HUGO gene symbols or "
             "Entrez gene IDs.\nRequired for getProfileData, getMutationData, "
             "getLink, getOncoprintHTML")

    cqueryopts.add_option(
        "-f", "--gene_list_file", dest="gene_list_file", type="string", default=None,
        help="[Optional] Filename to read in gene_list from")

    cqueryopts.add_option(
        "-p", "--profile_id", dest="profile_id", type="string",
        help="[Optional] Comma separated list of cBioPortal genetic_profile_ids. "
             "If none are specified then the list of profiles for the study "
             "where display in analysis is True is used.")

    squeryopts = optparse.OptionGroup(
        parser, "Query specific parameters",
        "Arguments specific to a particular query")

    squeryopts.add_option(
        "--protein_array_type", dest="protein_array_type", type="string",
        default="protein_level",
        help="[Optional] Either protein_level or phosphorylation [default=%default]")

    squeryopts.add_option(
        "--protein_array_id", dest="protein_array_id", type="string",
        help="[Required for some] comma separated list of one or more protein array IDs")

    squeryopts.add_option(
        "--array_info", dest="protein_array_info", type="int", default=0,
        help="[Optional] If 1, antibody information will also be exported in a "
             "getProteinArrayData query [default=%default]")

    squeryopts.add_option(
        "--report", dest="report", type="string", default="full",
        help="[Optional] Report type to display for getLink. "
             "Either full or oncoprint_html [default=%default]")

    squeryopts.add_option(
        "--threshold", dest="threshold", type="int", default=2,
        help="[Optional] Threshold for deciding if an alteration is significant "
             "for continuous metrics [default=%default]")

    parser.add_option_group(cqueryopts)
    parser.add_option_group(squeryopts)

    (options, args) = E.Start(parser, add_pipe_options=False,
                              add_output_options=False, argv=argv)

    portal = CBioPortal(url=options.url,
                        study=options.study_id,
                        study_name=options.study_name,
                        case_list_id=options.case_set_id)

    results = []

    if options.gene_list_file:
        infile = IOTools.openFile(options.gene_list_file)
        gene_list = [x.strip() for x in infile]
    elif options.gene_list:
        gene_list = options.gene_list.split(",")

    if options.profile_id:
        profile_id = options.profile_id.split(",")
    else:
        profile_id = None

    if "getCancerStudies" in args:
        results.append(portal.getCancerStudies())

    if "getGeneticProfiles" in args:
        results.append(portal.getGeneticProfiles())

    if "getCaseLists" in args:
        results.append(portal.getCaseLists())

    if "getProfileData" in args:
        results.append(
            portal.getProfileData(gene_list=gene_list,
                                  genetic_profile_id=profile_id))

    if "getMutationData" in args:
        results.append(
            portal.getMutationData(gene_list=gene_list,
                                   genetic_profile_id=profile_id))

    if "getClinicalData" in args:
        results.append(portal.getClinicalData())

    if "getProteinArrayInfo" in args:
        results.append(
            portal.getProteinArrayInfo(
                gene_list=gene_list,
                protein_array_type=options.protein_array_type))

    if "getProteinArrayData" in args:
        results.append(
            portal.getProteinArrayData(
                protein_array_id=options.protein_array_id,
                array_info=options.protein_array_info))

    if "getPercentAltered" in args:
        results.append(
            portal.getPercentAltered(gene_list=gene_list,
                                     genetic_profile_id=profile_id,
                                     threshold=options.threshold))

    if "getLink" in args:
        results.append(
            portal.getLink(gene_list=gene_list, report=options.report))

    if "getOncoprintHTML" in args:
        results.append(portal.getOncoprintHTML(gene_list=gene_list))

    if len(results) == 0:
        sys.stderr.write("No recognised query commands provided")
        sys.exit()

    if options.output_file:
        outf = IOTools.openFile(options.output_file, "w")
    else:
        outf = sys.stdout

    for result in results:
        try:
            outf.write(tableToString(result))
        except:
            outf.write(result)

    E.Stop()

def main(config_file_path):

    # config loading
    var_list = ['base_directory', 'organism', 'input_type', 'file_name',
                'section_length', 'S1_iVal', 'S1_piVal', 'S2_iVal', 'S2_piVal',
                'reference', 'optimize']
    config_file_path = sys.argv[1]
    config = configparser.SafeConfigParser()
    config.read(config_file_path)

    # setup
    start_time = time.time()

    # Settings
    try:
        output_directory = Path(config.get('Settings', 'output_directory'))
        input_path = Path(config.get('Settings', 'input_path'))
        prefix = input_path.parts[-1].split('.')[0]

        # set these according to config info
        s1_params = pw.ParamWrapper()
        s2_params = pw.ParamWrapper()

        s1_params.setSectionLength(config.getint('Settings', 'section_length'))
        s1_params.setIVal(config.getfloat('Settings', 'S1_iVal'))
        s1_params.setPiVal(config.getfloat('Settings', 'S1_piVal'))
        s2_params.setIVal(config.getfloat('Settings', 'S2_iVal'))
        s2_params.setPiVal(config.getfloat('Settings', 'S2_piVal'))

        # set stuff for autogroup
        s2_params.setIMax(10)
        s2_params.setIMin(2)
        s2_params.setIStep(0.5)
        s2_params.setPiMax(10)
        s2_params.setPiMin(1)
        s2_params.setPiStep(0.5)

        reference = config.get('Settings', 'reference')
        autogroup = bool(config.getboolean('Settings', 'autogroup'))
    except:
        raise RuntimeError('Error reading configuration file')

    # output paths
    if not output_directory.is_dir():
        output_directory.mkdir()
    os.chdir(output_directory)

    cytoscape_path = Path("{0}.xgmml".format(prefix))
    json_path = Path("{0}.json".format(prefix))
    tab_network_path = Path("chromosome_paintings.tsv")
    matrixout_path = Path("overall_similarity.tsv")
    heatmaps_path = Path("heatmaps.pdf")
    density_path = Path("density.txt")
    group_path = Path("groups.txt")
    tab_path = Path("tab.txt")
    colorout_path = Path("colors.txt")
    log_path = Path("log.txt")
    nn_out_path = Path("{0}_nn.tsv".format(prefix))
    hdf_path = input_path.parent / '{0}.h5'.format(prefix)
    matrices_hdf_path = input_path.parent / '{0}_matrices.h5'.format(prefix)
    save_state_path = input_path.parent / '{0}_savestate.json'.format(prefix)

    # other variables
    logger = configLogger(log_path)

    # sanitize input
    try:
        assert input_path.is_file()
        assert 0 <= s1_params.getPiVal() <= 20
        assert 0 <= s1_params.getIVal() <= 20
        assert 0 <= s2_params.getPiVal() <= 20
        assert 0 <= s2_params.getIVal() <= 20
    except:
        raise ValueError('Configuration file contains bad values')

    # let's log some params used later
    logger.info('config loaded')

    # Input Processing
    # true means need to cluster
    if not io.checkPrimaryClustering(s1_params, save_state_path):
        logger.info('Primary Clustering exists, loading existing matrices')
        save_state, matrices = io.loadSaveState(save_state_path, matrices_hdf_path)
        sample_list = save_state['sample_list']
        chr_names = save_state['chr_names']
        chr_breaks = save_state['chr_breaks']
        io.writeTab(sample_list, tab_path)
    else:
        # tabular data from GATK loaded to pandas
        df, sample_list = loadToPandas(hdf_path, input_path, reference,
                                       s1_params, True)
        os.chdir(output_directory)
        logger.info('Start Primary Clustering')
        io.writeTab(sample_list, tab_path)
        # TODO: check for whether we're skipping primary clustering
        clusters, chr_names, chr_breaks = primaryCluster(
            df, sample_list, s1_params, logger)
        genNNData(clusters, chr_names, chr_breaks, s1_params, sample_list,
                  nn_out_path)
        io.writePrimaryClusters(chr_names, chr_breaks, clusters,
                                Path('pclusters.txt'))
        matrices = at.clustersToMatrix(clusters, sample_list)
        logger.info('Writing Save State')
        io.writeSaveState(s1_params, sample_list, chr_names, chr_breaks,
                          matrices, save_state_path, matrices_hdf_path)

    overall_matrix = at.overallMatrix(matrices)

    logger.info('Start Secondary Clustering')
    group_names, overall_clusters = sc.group(overall_matrix, tab_path,
                                             group_path, s2_params, logger,
                                             autogroup)

    color_table = at.createColorTable(group_names, overall_clusters, sample_list)
    color_table.to_csv(colorout_path)

    logger.info('calculating composition')
    condensed_matrices = gc.condenseToGroupMatrix(matrices, group_names,
                                                  overall_clusters, sample_list)
    composition = gc.getChromosomePaintings(condensed_matrices, chr_breaks,
                                            overall_clusters, group_names,
                                            sample_list)

    # for the whole thing
    logger.info('writing output')
    io.writeTabularPainting(composition, chr_names,
                            s1_params.getSectionLength(), sample_list,
                            tab_network_path)
    io.writeOverallMatrix(overall_matrix, matrixout_path)
    exporter.parse(overall_matrix, color_table, composition, group_names,
                   overall_clusters, sample_list, prefix)

    print("PopNet Completed")
    print('Run time was {0} seconds'.format(time.time() - start_time))

        -1]  # zxy coords of chromosomes already in the right position

    # Decide where to save the candidate positions of the hybe
    fl_cands = analysis_folder + os.sep + file_.replace(
        '.dax', '__current_cand.pkl')  # file where to save candidates
    fl_cor = analysis_folder + os.sep + file_.replace(
        '.dax', '__drift.pkl')  # file where to save drift correction
    fl_cor_fls = analysis_folder + os.sep + file_.replace(
        '.dax', '__driftfiles.npy')
    print(fl_cands)
    candid_spot = {}

    # load data (pseudo-memory mapping)
    daxs_signal, names_signal, daxs_beads, names_beads = io.get_ims_fov_daxmap(
        folders_keep, file_, col_tags=None, pad=10)

    # compute the drift for the field of view
    if len(daxs_beads) > 1:
        txyz_both, _ = ft.get_STD_beaddrift_v2(daxs_beads,
                                               sz_ex=sz_ex,
                                               hseed=hseed_beads,
                                               nseed=nbeads,
                                               ref=None,
                                               force=force_drift,
                                               save_file=fl_cor)
        txyz = np.mean(txyz_both, 1)
        txyz = np.array(txyz) - [txyz[ref]]
        np.save(fl_cor_fls, np.array(folders_keep))

    # repeat for colors
    # num_col = int(len(daxs_signal)/len(daxs_beads))
    # iterate through folders

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames.")

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(open(f, "r"),
                                           with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    if not headers and titles:
        headers = titles
    else:
        headers = args

    for x in range(len(sets) - 1):
        set1 = sets[x]
        for y in range(x + 1, len(sets)):
            set2 = sets[y]

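# Sketch of the pairwise set comparison the loop above is setting up, with
# fabricated identifier lists instead of files read via IOTools.readList.
sets = [set(["a", "b", "c"]), set(["b", "c", "d"]), set(["c", "e"])]
headers = ["file1", "file2", "file3"]

for x in range(len(sets) - 1):
    set1 = sets[x]
    for y in range(x + 1, len(sets)):
        set2 = sets[y]
        print(headers[x], headers[y],
              len(set1), len(set2),
              len(set1 & set2), len(set1 | set2))
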
def buildSCOPDomains(infiles, outfile):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90% of its length.
    * domains overlapping on the same sequence with the same superfamily
      classification are merged.
    '''

    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    rx = re.compile(r'(\S+)\s(\S+)\s(.*)')

    id2class = {}
    with IOTools.openFile(fastafile) as inf:
        for x in FastaIterator.iterate(inf):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence))

    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    with IOTools.openFile(linksfile) as inf:
        nid2domains = collections.defaultdict(list)
        ndomains = 0
        for line in inf:
            if line.startswith('query_nid'):
                continue
            if line.startswith('#'):
                continue

            counter.links += 1

            domain_id, nid, evalue, domain_start, domain_end, \
                sbjct_start, sbjct_end, \
                block_sizes, domain_starts, sbjct_starts, \
                bitscore, pid = line[:-1].split()

            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int, (nid, domain_start, domain_end, sbjct_start, sbjct_end))

            family, length = id2class[domain_id]

            cls, fold, superfamily, family = family.split('.')
            if cls not in classes:
                continue

            if float(domain_end - domain_start) / length < min_coverage:
                continue

            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append((superfamily, sbjct_start, sbjct_end))

        counter.sequences = len(nid2domains)

    E.info('merging %i domains in %i sequences' %
           (counter.unmerged_domains, counter.sequences))

    outf = IOTools.openFile(outfile, 'w')
    outf.write('nid\tstart\tend\tfamily\n')
    for nid, dd in sorted(nid2domains.iteritems()):
        for family, domains in itertools.groupby(dd, key=lambda x: x[0]):
            unmerged_domains = [(x[1], x[2]) for x in domains]
            merged_domains = Intervals.combine(unmerged_domains)
            for start, end in merged_domains:
                counter.domains += 1
                outf.write('%i\t%i\t%i\t%s\n' % (nid, start, end, family))
    outf.close()

    E.info(counter)

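# Sketch of overlapping-interval merging, which is presumably what
# Intervals.combine does for same-superfamily domains above; this stand-alone
# helper is an assumption about its behaviour, not the CGAT implementation.
def combine_intervals(intervals):
    """merge overlapping or touching (start, end) intervals."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

print(combine_intervals([(10, 50), (40, 80), (100, 120)]))
# [(10, 80), (100, 120)]
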
def testOutPut(self):
    overall = pd.DataFrame([[2, 0, 2], [0, 2, 0], [2, 0, 2]],
                           columns=['A', 'B', 'C'],
                           index=['A', 'B', 'C'])
    io.writeOverallMatrix(overall, 'zzz.txt')

def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version= "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("-f", "--input-format", dest="input_format", type="choice", choices=("bed", "bam"), help="input file format [default=%default].") parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string", help="UCSC genome identifier [default=%default].") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("-e", "--extension", dest="extension", type="int", help="extension size [default=%default].") parser.add_option("-b", "--bin-size", dest="bin_size", type="int", help="bin size of genome vector [default=%default].") parser.add_option("-l", "--fragment-length", dest="fragment_length", type="int", help="bin size of genome vector [default=%default].") parser.add_option( "-s", "--saturation-iterations", dest="saturation_iterations", type="int", help="iterations for saturation analysis [default=%default].") parser.add_option("-t", "--toolset", dest="toolset", type="choice", action="append", choices=("saturation", "coverage", "rms", "rpm", "all"), help="actions to perform [default=%default].") parser.add_option( "-w", "--bigwig", dest="bigwig", action="store_true", help= "store wig files as bigwig files - requires a genome file [default=%default]" ) parser.set_defaults( input_format="bam", ucsc_genome="hg19", genome_file=None, extension=400, bin_size=50, saturation_iterations=10, fragment_length=700, toolset=[], bigwig=False, ) ## add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start(parser, argv=argv, add_output_options=True) if len(args) != 1: raise ValueError("please specify a filename with sample data") if options.bigwig and not options.genome_file: raise ValueError("please provide a genome file when outputting bigwig") if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contig_sizes = fasta.getContigSizes() filename_sample = args[0] if len(options.toolset) == 0: options.toolset = ["all"] do_all = "all" in options.toolset # load MEDIPS R.library('MEDIPS') genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome R.library(genome_file) tmpdir = tempfile.mkdtemp() E.debug("temporary files are in %s" % tmpdir) bin_size = options.bin_size extension = options.extension fragment_length = options.fragment_length saturation_iterations = options.saturation_iterations if options.input_format == "bam": E.info("converting bam files") filename_sample = bamToMEDIPS(filename_sample, os.path.join(tmpdir, "sample.medips")) elif options.input_format == "bed": E.info("converting bed files") filename_sample = bedToMEDIPS(filename_sample, os.path.join(tmpdir, "sample.medips")) E.info("loading data") R('''CONTROL.SET = MEDIPS.readAlignedSequences( BSgenome = "%(genome_file)s", file = "%(filename_sample)s" ) ''' % locals()) slotnames = (("extend", "extend", "%i"), ("distFunction", "distance_function", "%s"), ("slope", "slope", "%f"), ("fragmentLength", "fragment_length", "%i"), ("bin_size", "bin_size", "%i"), ("seq_pattern", "pattern", "%s"), ("number_regions", "nregions", "%i"), ("number_pattern", "npatterns", "%i"), ("cali_chr", "calibration_contig", "%s"), ("genome_name", "genome", "%s")) E.info("computing genome vector") R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, bin_size = %(bin_size)i, extend=%(extension)i )''' % locals()) E.info("computing CpG positions") R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")''' ) E.info("compute coupling vector") R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, fragmentLength = %(fragment_length)i, func = "count")''' % locals()) E.info("compute calibration curve") R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''') E.info("normalizing") R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''') outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w") outfile.write("category\tvalue\n") if "saturation" in options.toolset or do_all: E.info("saturation analysis") R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, bin_size = %(bin_size)i, extend = %(extension)i, no_iterations = %(saturation_iterations)i, no_random_iterations = 1)''' % locals()) R.png(E.getOutputFile("saturation.png")) R('''MEDIPS.plotSaturation(sr.control)''') R('''dev.off()''') R('''write.csv( sr.control$estimation, file ='%s' )''' % E.getOutputFile("saturation_estimation.csv")) outfile.write("estimated_correlation\t%f\n" % R('''sr.control$maxEstCor''')[1]) outfile.write("true_correlation\t%f\n" % R('''sr.control$maxTruCor''')[1]) if "coverage" in options.toolset or do_all: E.info("CpG coverage analysis") R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, extend = %(extension)i, no_iterations = 10)''' % locals()) R.png(E.getOutputFile("cpg_coverage.png")) R('''MEDIPS.plotCoverage(cr.control)''') R('''dev.off()''') # three rows R('''write.csv( cr.control$coveredPos, file ='%s' )''' % E.getOutputFile("saturation_coveredpos.csv")) # coverage threshold # number of CpG covered # percentage of CpG covered 
R('''write.csv( cr.control$matrix, file ='%s' )''' % E.getOutputFile("saturation_matrix.csv")) # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''') if "calibration" in options.toolset or do_all: E.info("plotting calibration") R.png(E.getOutputFile("calibration.png")) R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''' ) R('''dev.off()''') for slotname, label, pattern in slotnames: value = tuple(R('''CONTROL.SET@%s''' % slotname)) if len(value) == 0: continue outfile.write( "%s\t%s\n" % (label, pattern % tuple(R('''CONTROL.SET@%s''' % slotname))[0])) outfile.close() if "rpm" in options.toolset or do_all: outputfile = E.getOutputFile("rpm.wig") R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")''' % locals()) if options.bigwig: bigwig(outputfile, contig_sizes) else: compress(outputfile) if "rms" in options.toolset or do_all: outputfile = E.getOutputFile("rms.wig") R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")''' % locals()) if options.bigwig: bigwig(outputfile, contig_sizes) else: compress(outputfile) shutil.rmtree(tmpdir) ## write footer and output benchmark information. E.Stop()