def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands. It seems
    to work, but I have not thoroughly tested it. I expect that the
    false positive rate increases (i.e., predicting non-coding as
    coding) in cases where the best framefinder match and the best
    blast match are on opposite strands. In the original CPC, these
    would be separated.

    :param infile: gzipped blastx output fed into blast2table.pl.
    :param outfile: output table (``*.coding.gz``); sibling report files
        ``<outfile>.homology.gz`` and ``<outfile>.orf.gz`` are also written.
    :raises ValueError: if CPC_HOME is unset or a framefinder header
        line cannot be parsed.
    '''
    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write(
            "\t".join(("QueryID", "CDSLength", "Score", "Used", "Strict")) + "\n")

        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (query_id, start, end, score, used, _mode, tpe) = \
                        re.match(
                            r">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}",
                            line).groups()
                except AttributeError as exc:
                    raise ValueError("parsing error in line %s" % line) from exc

                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")

                # BUG FIX: the original dropped the parsed score, writing a
                # 4-field row under a 5-column header and misaligning the
                # features consumed by feat2libsvm.pl -c 2,3,4,5.
                outf.write(
                    "\t".join((query_id, str(length), score, used, str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []
    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
       %(tmpdir)s/blastx.feat1
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA
    %(tmpdir)s/blastx.feat
    > %(tmpdir)s/blastx.lsv;
    ''')
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA
    %(tmpdir)s/ff.feat
    > %(tmpdir)s/ff.lsv;
    ''')
    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl
    %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv
    > %(tmpdir)s/test.lsv;
    ''')
    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
    -r %(cpc_dir)s/data/libsvm.range
    %(tmpdir)s/test.lsv
    > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction.  Only the standard (non-probability) model is
    # used by the svm-predict2 call below; the Prob / weighted model
    # paths of the original were never referenced and have been removed.
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
    %(tmpdir)s/test.lsv.scaled
    %(m_libsvm_model0)s
    %(tmpdir)s/test.svm0.predict
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n"
    | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta
    | gzip >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl %(tmpdir)s/blastx.table <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all
    statement = " checkpoint; ".join(s)

    P.run()

    # clean up
    shutil.rmtree(tmpdir)
def filterBamfiles(infile, sentinel): """ Pre-process bamfiles prior to peak calling. i) sort bamfiles ii) remove unmapped readswith bam2bam.py iii) remove non-uniquely mapping reads with bam2bam.py (optional) iv) remove duplicates with Picards MarkDuplicates (optional) v) remove reads from masked regions with bedtools intersect (optional) vi) index """ # create tempfile for Picard's MarkDuplicates picard_tmp = P.getTempDir(PARAMS["scratchdir"]) outfile = P.snip(sentinel, ".sentinel") + ".bam" # ensure bamfile is sorted, statement = [ "samtools sort @IN@ -o @[email protected]", ] # remove unmapped reads statement.append("cgat bam2bam" " --method=filter --filter-method=mapped" " --log=%(outfile)s.log" " < @[email protected]" " > @OUT@") # remove non-uniquely mapping reads, if requested if PARAMS["filter_remove_non_unique"]: statement.append("cgat bam2bam" " --method=filter --filter-method=unique" " --log=%(outfile)s.log" " < @IN@" " > @OUT@") # remove duplicates, if requested if PARAMS["filter_remove_duplicates"]: statement.append("MarkDuplicates" " INPUT=@IN@" " ASSUME_SORTED=true" " REMOVE_DUPLICATES=true" " QUIET=false" " OUTPUT=@OUT@" " METRICS_FILE=/dev/null" " VALIDATION_STRINGENCY=SILENT" " TMP_DIR=%(picard_tmp)s" " 2> %(outfile)s.log") # mask regions, if intervals supplied if PARAMS["filter_mask_intervals"]: mask = PARAMS["filter_mask_intervals"] statement.append("bedtools intersect" " -abam @IN@" " -b %(mask)s" " -wa" " -v" " > @OUT@") statement.append("mv @IN@ %(outfile)s") statement.append("samtools index %(outfile)s") job_memory = "5G" statement = P.joinStatements(statement, infile) P.run() P.touch(sentinel) shutil.rmtree(picard_tmp)