# Example no. 1
def buildCodingPotential(infile, outfile):
    '''Run CPC (Coding Potential Calculator) analysis as in the cpc script.

    This module runs framefinder and blastx on both strands.
    It seems to work, but I have not thoroughly tested it.
    I expect that the false positive rate increases (i.e.,
    predicting non-coding as coding) in cases where the best
    framefinder match and the best blast match are on opposite
    strands. In the original CPC, these would be separated.

    :param infile: gzipped blastx output for the transcripts.
    :param outfile: output table ``<track>.coding.gz``; additionally
        writes ``<outfile>.homology.gz`` and ``<outfile>.orf.gz`` reports.
    :raises ValueError: if ``CPC_HOME`` is not set or a framefinder
        header line cannot be parsed.

    NOTE: P.run() interpolates ``%(name)s`` placeholders in the shell
    statements from this function's local variables, so the names
    ``infile``, ``outfile``, ``cpc_dir``, ``tmpdir``, ``track`` and
    ``m_libsvm_*`` must not be renamed.
    '''

    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write(
            "\t".join(("QueryID", "CDSLength", "Score", "Used", "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (query_id, start, end, score, used, mode, tpe) = \
                        re.match(
                            r">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}", line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                # BUGFIX: include the framefinder score so each row has the
                # five columns declared in the header. The original code
                # dropped it, misaligning the file consumed downstream by
                # ``feat2libsvm.pl -c 2,3,4,5``.
                outf.write(
                    "\t".join((query_id, str(length), score, used,
                               str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []

    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
       %(tmpdir)s/blastx.feat1
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA %(tmpdir)s/blastx.feat
    > %(tmpdir)s/blastx.lsv;
    ''')

    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA %(tmpdir)s/ff.feat
    > %(tmpdir)s/ff.lsv;
    ''')

    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv
    > %(tmpdir)s/test.lsv;
    ''')

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
               -r %(cpc_dir)s/data/libsvm.range
               %(tmpdir)s/test.lsv
    > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction
    # only model0 is used below; the others are kept for reference as
    # alternative pre-trained models shipped with CPC.
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''
               %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
               %(tmpdir)s/test.lsv.scaled
               %(m_libsvm_model0)s
               %(tmpdir)s/test.svm0.predict
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n"
    | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta
    | gzip >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl %(tmpdir)s/blastx.table <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all; "checkpoint" makes the shell abort on the first
    # failing sub-command instead of continuing with stale files.
    statement = " checkpoint; ".join(s)
    P.run()

    # clean up
    shutil.rmtree(tmpdir)
# Example no. 2
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped readswith bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picards MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index

    infile: input bam file.
    sentinel: sentinel file (``*.sentinel``); the filtered bam is written
    next to it as ``<prefix>.bam`` and the sentinel is touched on success.

    NOTE(review): the ``@IN@``/``@OUT@`` tokens are placeholders that
    P.joinStatements() chains into a temp-file pipeline, and ``%(outfile)s``
    etc. are presumably interpolated by P.run() from this function's local
    variables — so local variable names here must not be changed.
    """

    # create tempfile for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    # final output path derived from the sentinel name
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted,
    statement = [
        "samtools sort @IN@ -o @[email protected]",
    ]

    # remove unmapped reads
    # NOTE: reads @[email protected] (the sort step's output) rather than @IN@,
    # matching the explicit suffix used in the sort command above.
    statement.append("cgat bam2bam"
                     " --method=filter --filter-method=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("cgat bam2bam"
                         " --method=filter --filter-method=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"  # metrics discarded
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"   # keep only reads NOT overlapping the mask
                         " > @OUT@")

    # move the last intermediate into place, then index it
    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    # job_memory is read by P.run() as a cluster job option
    job_memory = "5G"
    # chain the commands, wiring @IN@/@OUT@ through temp files
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)