예제 #1
0
def output_global_alignment(kmers_fname, output_dir):
    """
    Output a global alignment (*.aln) for a set of kmers.

    Using clustalw for now.

    Parameters:
    -----------
    kmers_fname : filename of FASTA file containing kmers
    output_dir : output directory

    Returns:
    --------
    Filename of the alignment: the basename of kmers_fname with
    an '.aln' suffix, placed in output_dir. If that file exists
    already, clustalw is not run again.
    """
    utils.make_dir(output_dir)
    output_fname = \
        os.path.join(output_dir,
                     "%s.aln" % (os.path.basename(kmers_fname)))
    if os.path.isfile(output_fname):
        print("Alignment filename %s exists. Skipping..."
              % (output_fname))
        # Really skip: without this return the alignment was
        # recomputed even though the message says it is skipped
        return output_fname
    clustalw_cmd = \
        "clustalw -INFILE=%s -OUTFILE=%s -PIM" % (kmers_fname,
                                                 output_fname)
    print("Executing: %s" % (clustalw_cmd))
    t1 = time.time()
    ret_val = os.system(clustalw_cmd)
    t2 = time.time()
    if ret_val != 0:
        # Report the failure but keep the best-effort behavior of
        # returning the expected output filename
        print("WARNING: clustalw exited with status %s" % (ret_val))
    print("Global alignment took %.2f minutes." % ((t2 - t1) / 60.))
    return output_fname
def sanitize_splicegraph_events(genome, event_type, splicegraph_dir,
                                output_dir):
    """
    Sanitize and annotate old SpliceGraph events before
    merging with new events.

    - genome: genome label; names the subdirectory of splicegraph_dir
      that is read and the subdirectory of output_dir that is written
    - event_type: event type; the input file is
      '<event_type>.<genome>.gff3'
    - splicegraph_dir: directory with the SpliceGraph GFF files
    - output_dir: base output directory for the sanitized files

    Returns None. Nothing is done if the input GFF is missing or
    the sanitized file exists already. Raises Exception if the
    sanitize command exits with a non-zero status.
    """
    gff_fname = os.path.join(splicegraph_dir, genome,
                             "%s.%s.gff3" % (event_type, genome))
    if not os.path.isfile(gff_fname):
        print("Cannot find %s" % (gff_fname))
        return
    # Make output directory for sanitized files
    output_dir = os.path.join(output_dir, genome)
    utils.make_dir(output_dir)
    print("Sanitizing: %s" % (gff_fname))
    gff_label = os.path.basename(gff_fname)
    output_fname = os.path.join(output_dir, gff_label)
    if os.path.isfile(output_fname):
        print("%s already exists, skipping" % (output_fname))
        return
    sanitize_cmd = \
        "gffutils-cli sanitize %s > %s" % (gff_fname, output_fname)
    ret_val = os.system(sanitize_cmd)
    if ret_val != 0:
        # The shell redirect creates the output file even when the
        # command fails; remove it so that a later run is not
        # skipped because of a partial file
        if os.path.isfile(output_fname):
            os.remove(output_fname)
        raise Exception("Sanitize command failed.")
    # Now that it is sanitized, annotate it
    print("Annotating GFF...")
    gffutils_helpers.annotate_gff(output_fname, genome)
예제 #3
0
파일: exons.py 프로젝트: 1eesh/rnaseqlib
def get_const_exons(gff_filename, output_filename,
                    base_diff=5):
    """
    Get constitutive exons for GFF filename.

    - gff_filename: input GFF filename
    - output_filename: GFF file that the constitutive exons of
      every gene are written to
    - base_diff: Number of bases +/- that can be omitted
      for an exon to be considered constitutive.
      NOTE(review): currently not used by this function.
    """
    print("Getting constitutive exons from: %s" % (gff_filename))
    dir_name = os.path.dirname(output_filename)
    # dir_name is empty when the output goes to the current
    # directory; there is nothing to create in that case
    if dir_name and not os.path.isdir(dir_name):
        utils.make_dir(dir_name)
    print("Loading GFF file...")
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                   reverse_recs=True)
    print("Done loading.")
    output_file = open(output_filename, "w")
    try:
        gff_out = gff_utils.GFFWriter(output_file)
        for gene, mRNAs in gff_db.mRNAs_by_gene.iteritems():
            # Get constitutive exons from the current set
            # of mRNAs (the database was previously referred to
            # by an undefined name here)
            const_exons = const_exons_from_mRNAs(gff_db, mRNAs)
            for exon_rec in const_exons:
                # Write exons to file
                gff_out.write_rec(exon_rec)
    finally:
        # Close the handle even if an error interrupts the writing
        output_file.close()
예제 #4
0
def output_dinuc_enriched_kmers(logger,
                                fasta_fname,
                                output_dir,
                                kmer_lens,
                                num_shuffles=100):
    """
    Write out the kmers that are enriched in a FASTA file
    compared with a dinucleotide shuffled version of it.

    One '<fasta basename>.<len>_kmers.counts' file is written to
    output_dir per kmer length in kmer_lens. Lengths whose counts
    file exists already are skipped.
    """
    logger.info("Output dinucleotide enriched Kmers..")
    logger.info("  - Input FASTA: %s" % (fasta_fname))
    logger.info("  - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    # Shuffled FASTA files get their own subdirectory
    shuffled_dir = os.path.join(output_dir, "shuffled_fasta")
    utils.make_dir(shuffled_dir)
    shuffled_fasta = ShuffledFasta(fasta_fname, shuffled_dir)
    for curr_len in kmer_lens:
        curr_kmers = Kmers(curr_len,
                           fasta_fname=fasta_fname,
                           shuffled_fasta=shuffled_fasta)
        counts_label = "%s.%d_kmers.counts" % (os.path.basename(fasta_fname),
                                               curr_len)
        counts_fname = os.path.join(output_dir, counts_label)
        logger.info("Outputting enriched Kmers to: %s" % (counts_fname))
        if os.path.isfile(counts_fname):
            # Counts were computed by an earlier run
            logger.info("Found %s, skipping.. " % (counts_fname))
            continue
        # Compute the enriched kmers, then write them out
        enriched = curr_kmers.get_enriched_kmers(output_dir,
                                                 num_shuffles=num_shuffles)
        curr_kmers.output_enriched_kmers(enriched, counts_fname)
예제 #5
0
def output_table_seqs(table_gff_fname, fi_fname, output_dir):
    """
    Output table sequences to a file.

    - table_gff_fname: GFF table whose entries' sequences are fetched
    - fi_fname: genome FASTA filename (passed to bedtools as 'fi')
    - output_dir: output directory

    Returns the output FASTA filename: the basename of the GFF table
    with its last extension replaced by '.fa', placed in output_dir.
    If that file exists already, nothing is recomputed.
    """
    print("Outputting sequences from GFF table...")
    print("  - Input GFF table: %s" % (table_gff_fname))
    print("  - Genome FASTA index: %s" % (fi_fname))
    print("  - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    table_basename = os.path.basename(table_gff_fname).rsplit(".", 1)[0]
    output_fname = os.path.join(output_dir, "%s.fa" % (table_basename))
    print("  - Output file: %s" % (output_fname))
    if os.path.isfile(output_fname):
        print("Found %s. Skipping..." % (output_fname))
        return output_fname
    entries = pybedtools.BedTool(table_gff_fname)

    def fields2name(f):
        """
        Replace the GFF featuretype field with
        'chrom:start-stop:strand;<last field>', where the last
        field is presumably the GFF attributes column.
        """
        custom_field = "%s:%s-%s:%s" % (f.chrom, f.start, f.stop, f.strand)
        f[2] = "%s;%s" % (custom_field, f[-1])
        return f

    # Output sequences as FASTA
    try:
        entries.each(fields2name).sequence(fi=fi_fname, fo=output_fname,
                                           s=True, name=True)
    except pybedtools.helpers.BEDToolsError as bedtools_err:
        # Still best effort (no crash), but do not hide the failure
        print("WARNING: bedtools failed on %s: %s"
              % (table_gff_fname, bedtools_err))
    return output_fname
예제 #6
0
def output_intron_table(tables_dir, intron_gff_fname, output_dir):
    """
    Output a table of introns. Just adds length
    and gene information to each entry.

    - tables_dir: directory that the transcript-to-gene mapping
      is loaded from (via trans_to_gene_from_table)
    - intron_gff_fname: GFF file of introns; each entry's "Parent"
      attribute is read as a comma-separated list of transcripts
    - output_dir: output directory

    Returns the output filename: the basename of the introns GFF
    with its last extension replaced by '.gff', in output_dir.
    Raises Exception if the introns GFF cannot be found.
    """
    print("Outputting intron table from %s" % (intron_gff_fname))
    output_basename = os.path.basename(intron_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print("  - Output file: %s" % (output_fname))
    if not os.path.isfile(intron_gff_fname):
        raise Exception("Cannot find %s" % (intron_gff_fname))
    # Load the mapping once (it used to be loaded twice, alongside
    # a table read whose result was never used)
    trans_to_gene = trans_to_gene_from_table(tables_dir)
    intron_entries = pybedtools.BedTool(intron_gff_fname)
    # Context manager closes the file even if an entry fails
    with open(output_fname, "w") as output_file:
        for entry in intron_entries:
            transcripts = entry.attrs["Parent"].split(",")
            genes_str = \
                ",".join([trans_to_gene[trans] for trans in transcripts])
            entry.attrs["gene_id"] = genes_str
            entry.attrs["region_len"] = str(len(entry))
            output_file.write(str(entry))
    return output_fname
예제 #7
0
def output_table_seqs(table_gff_fname, fi_fname, output_dir):
    """
    Output table sequences to a file.

    - table_gff_fname: GFF table whose entries' sequences are fetched
    - fi_fname: genome FASTA filename (passed to bedtools as 'fi')
    - output_dir: output directory

    Returns the output FASTA filename: the basename of the GFF table
    with its last extension replaced by '.fa', placed in output_dir.
    If that file exists already, nothing is recomputed.
    """
    print("Outputting sequences from GFF table...")
    print("  - Input GFF table: %s" % (table_gff_fname))
    print("  - Genome FASTA index: %s" % (fi_fname))
    print("  - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    table_basename = os.path.basename(table_gff_fname).rsplit(".", 1)[0]
    output_fname = os.path.join(output_dir, "%s.fa" % (table_basename))
    print("  - Output file: %s" % (output_fname))
    if os.path.isfile(output_fname):
        print("Found %s. Skipping..." % (output_fname))
        return output_fname
    entries = pybedtools.BedTool(table_gff_fname)

    def fields2name(f):
        """
        Replace the GFF featuretype field with
        'chrom:start-stop:strand;<last field>', where the last
        field is presumably the GFF attributes column.
        """
        custom_field = "%s:%s-%s:%s" % (f.chrom, f.start, f.stop, f.strand)
        f[2] = "%s;%s" % (custom_field, f[-1])
        return f

    # Output sequences as FASTA
    try:
        entries.each(fields2name).sequence(fi=fi_fname,
                                           fo=output_fname,
                                           s=True,
                                           name=True)
    except pybedtools.helpers.BEDToolsError as bedtools_err:
        # Still best effort (no crash), but do not hide the failure
        print("WARNING: bedtools failed on %s: %s"
              % (table_gff_fname, bedtools_err))
    return output_fname
예제 #8
0
파일: tables.py 프로젝트: hjanime/rnaseqlib
def download_ucsc_tables(genome,
                         output_dir):
    """
    Fetch the UCSC tables for a genome into the 'ucsc'
    subdirectory of output_dir and uncompress them.

    A table is not downloaded again if its uncompressed file is
    present already; tables that fail to download are skipped.
    """
    tables_outdir = os.path.join(output_dir, "ucsc")
    utils.make_dir(tables_outdir)
    print("Download UCSC tables...")
    print("  - Output dir: %s" % (tables_outdir))
    for label, url in get_ucsc_tables_urls(genome):
        print("Downloading %s" % (label))
        compressed_fname = os.path.join(tables_outdir, label)
        # Name of the uncompressed table: the last three characters
        # are dropped (presumably a '.gz' suffix -- TODO confirm)
        uncompressed_fname = compressed_fname[:-3]
        if os.path.isfile(uncompressed_fname):
            print("Got %s already. Skipping download.."
                  % (uncompressed_fname))
            continue
        # A None status from the download is treated as failure
        if download_utils.download_url(url, tables_outdir) is None:
            print("Failed to get %s, skipping.." % (label))
            continue
        utils.gunzip_file(compressed_fname, tables_outdir)
예제 #9
0
def output_global_alignment(kmers_fname, output_dir):
    """
    Output a global alignment (*.aln) for a set of kmers.

    Using clustalw for now.

    Parameters:
    -----------
    kmers_fname : filename of FASTA file containing kmers
    output_dir : output directory

    Returns:
    --------
    Filename of the alignment: the basename of kmers_fname with
    an '.aln' suffix, placed in output_dir. If that file exists
    already, clustalw is not run again.
    """
    utils.make_dir(output_dir)
    output_fname = \
        os.path.join(output_dir,
                     "%s.aln" % (os.path.basename(kmers_fname)))
    if os.path.isfile(output_fname):
        print("Alignment filename %s exists. Skipping..."
              % (output_fname))
        # Really skip: without this return the alignment was
        # recomputed even though the message says it is skipped
        return output_fname
    clustalw_cmd = \
        "clustalw -INFILE=%s -OUTFILE=%s -PIM" % (kmers_fname,
                                                 output_fname)
    print("Executing: %s" % (clustalw_cmd))
    t1 = time.time()
    ret_val = os.system(clustalw_cmd)
    t2 = time.time()
    if ret_val != 0:
        # Report the failure but keep the best-effort behavior of
        # returning the expected output filename
        print("WARNING: clustalw exited with status %s" % (ret_val))
    print("Global alignment took %.2f minutes." % ((t2 - t1) / 60.))
    return output_fname
예제 #10
0
def main():
    """
    Prepare event annotations for a fixed set of genomes:
    print the annotation-making command for each genome,
    annotate the 'commonshortest' GFFs of every event type,
    clean up their empty attributes, then zip and upload
    the annotations.
    """
    genomes = ["mm9", "mm10",
               "hg18", "hg19"]
    event_types = ["SE", "MXE", "A3SS", "A5SS", "RI"]
    # Directory where UCSC tables are
    ucsc_tables_dir = os.path.expanduser("~/jaen/ucsc_tables/")
    events_dir = os.path.expanduser("~/jaen/gff-events/ver2/")
    for genome in genomes:
        print("Making annotations for %s" % (genome))
        output_dir = os.path.join(events_dir, genome)
        curr_tables_dir = os.path.join(ucsc_tables_dir, genome)
        utils.make_dir(output_dir)
        cmd = \
            "gff_make_annotation %s %s --genome-label %s --sanitize " \
            % (curr_tables_dir,
               output_dir,
               genome)
        # NOTE: the command is only printed, not run; the call
        # that executed it (os.system(cmd)) is disabled
        print("Executing: ")
        print(cmd)
    # Annotate the GFFs with gene information
    for genome in genomes:
        commonshortest_dir = \
            os.path.join(events_dir, genome, "commonshortest")
        for event_type in event_types:
            curr_gff = os.path.join(commonshortest_dir,
                                    "%s.%s.gff3" % (event_type, genome))
            gffutils_helpers.annotate_gff(curr_gff, genome)
            # Clean up empty attributes
            print("Cleaning up empty attributes")
            remove_empty_attrs.run(curr_gff)
    # Zip the annotations
    zip_annotations(events_dir, genomes)
    upload_annotations(events_dir, genomes)
예제 #11
0
    def get_dinuc_shuffled_fasta(self):
        """
        Produce dinucleotide shuffled copies of the FASTA file.

        One copy per shuffle is written to self.output_dir; copies
        that exist already are reused. The list of shuffled FASTA
        filenames is stored on self.shuffled_fasta_fnames and
        returned.
        """
        utils.make_dir(self.output_dir)
        print("Shuffling FASTA %d times into: %s" % (self.num_shuffles,
                                                     self.output_dir))
        start_time = time.time()
        fnames = []
        for shuffle_num in range(self.num_shuffles):
            # FASTA basename without its extension
            fasta_stem = os.path.basename(self.fasta_fname).rsplit(".", 1)[0]
            # Tag the filename with the shuffle number
            curr_fname = os.path.join(self.output_dir,
                                      "%s.shuffle_%d.fa" % (fasta_stem,
                                                            shuffle_num))
            if not os.path.isfile(curr_fname):
                output_dinuc_shuffled_fasta(self.fasta_fname, curr_fname)
            fnames.append(curr_fname)
        end_time = time.time()
        print("Shuffling took %.2f seconds" % (end_time - start_time))
        self.shuffled_fasta_fnames = fnames
        return self.shuffled_fasta_fnames
예제 #12
0
def output_intron_table(tables_dir,
                        intron_gff_fname,
                        output_dir):
    """
    Output a table of introns. Just adds length
    and gene information to each entry.

    - tables_dir: directory that the transcript-to-gene mapping
      is loaded from (via trans_to_gene_from_table)
    - intron_gff_fname: GFF file of introns; each entry's "Parent"
      attribute is read as a comma-separated list of transcripts
    - output_dir: output directory

    Returns the output filename: the basename of the introns GFF
    with its last extension replaced by '.gff', in output_dir.
    Raises Exception if the introns GFF cannot be found.
    """
    print("Outputting intron table from %s" % (intron_gff_fname))
    output_basename = os.path.basename(intron_gff_fname).rsplit(".", 1)[0]
    utils.make_dir(output_dir)
    output_fname = os.path.join(output_dir, "%s.gff" % (output_basename))
    print("  - Output file: %s" % (output_fname))
    if not os.path.isfile(intron_gff_fname):
        raise Exception("Cannot find %s" % (intron_gff_fname))
    # Load the mapping once (it used to be loaded twice, alongside
    # a table read whose result was never used)
    trans_to_gene = trans_to_gene_from_table(tables_dir)
    intron_entries = pybedtools.BedTool(intron_gff_fname)
    # Context manager closes the file even if an entry fails
    with open(output_fname, "w") as output_file:
        for entry in intron_entries:
            transcripts = entry.attrs["Parent"].split(",")
            genes_str = \
                ",".join([trans_to_gene[trans] for trans in transcripts])
            entry.attrs["gene_id"] = genes_str
            entry.attrs["region_len"] = str(len(entry))
            output_file.write(str(entry))
    return output_fname
예제 #13
0
def fix_ale_gff(gff_fname, output_dir):
    """
    Apply fix_ale_entries to each group of entries that
    ale_iterator yields for the given GFF file.

    NOTE(review): the output filename in output_dir is computed
    but nothing is written to it here -- confirm whether writing
    the fixed GFF is still to be done.
    """
    utils.make_dir(output_dir)
    gff_basename = os.path.basename(gff_fname)
    fixed_gff_fname = os.path.join(output_dir, gff_basename)
    # Read all entries of the GFF up front
    gff_entries = list(pybedtools.BedTool(gff_fname))
    for ale_entries in ale_iterator(gff_entries):
        fix_ale_entries(ale_entries)
def conserved_events_mouse_to_human(event_types=("SE",
                                                 "SE_shortest_noAceView")):
    """
    Generate conserved events for the given event types, outputting
    result to the 'mouse_to_human' subdirectory of CONS_EVENTS_DIR.

    Generate conserved events by mapping from mouse events to human.
    One job is submitted (via 'bsub') per event type.

    - event_types: iterable of event type names. The default is an
      immutable tuple so it cannot be altered between calls.

    Raises Exception if a mouse GFF file cannot be found or if
    the submitted command returns a non-zero status.
    """
    mouse_genome = "mm9"
    output_dir = os.path.join(CONS_EVENTS_DIR, "mouse_to_human")
    utils.make_dir(output_dir)
    print("Generating conserved events from mouse to human...")
    print("  - Output dir: %s" % (output_dir))
    for event_type in event_types:
        print("Generating conserved events of type %s" % (event_type))
        mouse_gff_fname = \
            os.path.join(GFF_EVENTS_DIR, mouse_genome,
                         "%s.%s.gff3" % (event_type,
                                         mouse_genome))
        print("Mapping %s to human" % (mouse_gff_fname))
        if not os.path.isfile(mouse_gff_fname):
            raise Exception("Cannot find mouse gff %s" % (mouse_gff_fname))
        cmd = \
            "bsub time python %s --get-orthologs %s \"mouse\" \"human\" --output-dir %s" \
            % (CONS_SCRIPT_FNAME,
               mouse_gff_fname,
               output_dir)
        print("Executing: %s" % (cmd))
        ret_val = os.system(cmd)
        if ret_val != 0:
            raise Exception("Call to %s failed." % (CONS_SCRIPT_FNAME))
def conserved_events_mouse_to_human(
        event_types=("SE", "SE_shortest_noAceView")):
    """
    Generate conserved events for the given event types, outputting
    result to the 'mouse_to_human' subdirectory of CONS_EVENTS_DIR.

    Generate conserved events by mapping from mouse events to human.
    One job is submitted (via 'bsub') per event type.

    - event_types: iterable of event type names. The default is an
      immutable tuple so it cannot be altered between calls.

    Raises Exception if a mouse GFF file cannot be found or if
    the submitted command returns a non-zero status.
    """
    mouse_genome = "mm9"
    output_dir = os.path.join(CONS_EVENTS_DIR, "mouse_to_human")
    utils.make_dir(output_dir)
    print("Generating conserved events from mouse to human...")
    print("  - Output dir: %s" % (output_dir))
    for event_type in event_types:
        print("Generating conserved events of type %s" % (event_type))
        mouse_gff_fname = \
            os.path.join(GFF_EVENTS_DIR, mouse_genome,
                         "%s.%s.gff3" % (event_type,
                                         mouse_genome))
        print("Mapping %s to human" % (mouse_gff_fname))
        if not os.path.isfile(mouse_gff_fname):
            raise Exception("Cannot find mouse gff %s" % (mouse_gff_fname))
        cmd = \
            "bsub time python %s --get-orthologs %s \"mouse\" \"human\" --output-dir %s" \
            % (CONS_SCRIPT_FNAME,
               mouse_gff_fname,
               output_dir)
        print("Executing: %s" % (cmd))
        ret_val = os.system(cmd)
        if ret_val != 0:
            raise Exception("Call to %s failed." % (CONS_SCRIPT_FNAME))
예제 #16
0
def run_homer(logger, bed_fname, genome, output_dir, params):
    """
    Call Homer on an input BED file.

    findMotifsGenome.pl <pos file> <genome> <output directory>

    - params: mapping of Homer option -> value; every pair is
      appended to the command as '<option> <value>'

    Homer is not rerun if output_dir contains a 'homerResults'
    directory already. Exits the program with status 1 if
    homer_path is None or if the Homer call fails.
    Returns output_dir.
    """
    if homer_path is None:
        logger.critical("Error: Cannot find or execute Homer program.")
        sys.exit(1)
    option_strs = ["%s %s" % (opt, params[opt]) for opt in params]
    params_str = " ".join(option_strs)
    utils.make_dir(output_dir)
    # Results of an earlier run are kept as they are
    results_dir = os.path.join(output_dir, "homerResults")
    if os.path.isdir(results_dir):
        logger.info("Found Homer results, skipping..")
        return output_dir
    homer_cmd = "%s %s %s %s %s" % (homer_path, bed_fname, genome,
                                    output_dir, params_str)
    logger.info("Calling Homer: ")
    logger.info("Executing: %s" % (homer_cmd))
    start_time = time.time()
    if os.system(homer_cmd) != 0:
        logger.critical("Error: Homer call failed.")
        sys.exit(1)
    elapsed_mins = (time.time() - start_time) / 60.
    logger.info("Homer completed in %.2f minutes" % (elapsed_mins))
    return output_dir
예제 #17
0
    def get_dinuc_shuffled_fasta(self):
        """
        Produce dinucleotide shuffled copies of the FASTA file.

        One copy per shuffle is written to self.output_dir; copies
        that exist already are reused. The list of shuffled FASTA
        filenames is stored on self.shuffled_fasta_fnames and
        returned.
        """
        utils.make_dir(self.output_dir)
        print("Shuffling FASTA %d times into: %s" % (self.num_shuffles,
                                                     self.output_dir))
        start_time = time.time()
        fnames = []
        for shuffle_num in range(self.num_shuffles):
            # FASTA basename without its extension
            fasta_stem = os.path.basename(self.fasta_fname).rsplit(".", 1)[0]
            # Tag the filename with the shuffle number
            curr_fname = os.path.join(self.output_dir,
                                      "%s.shuffle_%d.fa" % (fasta_stem,
                                                            shuffle_num))
            if not os.path.isfile(curr_fname):
                output_dinuc_shuffled_fasta(self.fasta_fname, curr_fname)
            fnames.append(curr_fname)
        end_time = time.time()
        print("Shuffling took %.2f seconds" % (end_time - start_time))
        self.shuffled_fasta_fnames = fnames
        return self.shuffled_fasta_fnames
def sanitize_splicegraph_events(genome, event_type,
                                splicegraph_dir, output_dir):
    """
    Sanitize and annotate old SpliceGraph events before
    merging with new events.

    - genome: genome label; names the subdirectory of splicegraph_dir
      that is read and the subdirectory of output_dir that is written
    - event_type: event type; the input file is
      '<event_type>.<genome>.gff3'
    - splicegraph_dir: directory with the SpliceGraph GFF files
    - output_dir: base output directory for the sanitized files

    Returns None. Nothing is done if the input GFF is missing or
    the sanitized file exists already. Raises Exception if the
    sanitize command exits with a non-zero status.
    """
    gff_fname = os.path.join(splicegraph_dir, genome,
                             "%s.%s.gff3" % (event_type, genome))
    if not os.path.isfile(gff_fname):
        print("Cannot find %s" % (gff_fname))
        return
    # Make output directory for sanitized files
    output_dir = os.path.join(output_dir, genome)
    utils.make_dir(output_dir)
    print("Sanitizing: %s" % (gff_fname))
    gff_label = os.path.basename(gff_fname)
    output_fname = os.path.join(output_dir, gff_label)
    if os.path.isfile(output_fname):
        print("%s already exists, skipping" % (output_fname))
        return
    sanitize_cmd = \
        "gffutils-cli sanitize %s > %s" % (gff_fname, output_fname)
    ret_val = os.system(sanitize_cmd)
    if ret_val != 0:
        # The shell redirect creates the output file even when the
        # command fails; remove it so that a later run is not
        # skipped because of a partial file
        if os.path.isfile(output_fname):
            os.remove(output_fname)
        raise Exception("Sanitize command failed.")
    # Now that it is sanitized, annotate it
    print("Annotating GFF...")
    gffutils_helpers.annotate_gff(output_fname, genome)
예제 #19
0
def run_homer(logger, bed_fname, genome, output_dir,
              params):
    """
    Call Homer on an input BED file.

    findMotifsGenome.pl <pos file> <genome> <output directory>

    - params: mapping of Homer option -> value; every pair is
      appended to the command as '<option> <value>'

    Homer is not rerun if output_dir contains a 'homerResults'
    directory already. Exits the program with status 1 if
    homer_path is None or if the Homer call fails.
    Returns output_dir.
    """
    if homer_path is None:
        logger.critical("Error: Cannot find or execute Homer program.")
        sys.exit(1)
    option_strs = ["%s %s" % (opt, params[opt]) for opt in params]
    params_str = " ".join(option_strs)
    utils.make_dir(output_dir)
    # Results of an earlier run are kept as they are
    results_dir = os.path.join(output_dir, "homerResults")
    if os.path.isdir(results_dir):
        logger.info("Found Homer results, skipping..")
        return output_dir
    homer_cmd = "%s %s %s %s %s" % (homer_path, bed_fname, genome,
                                    output_dir, params_str)
    logger.info("Calling Homer: ")
    logger.info("Executing: %s" % (homer_cmd))
    start_time = time.time()
    if os.system(homer_cmd) != 0:
        logger.critical("Error: Homer call failed.")
        sys.exit(1)
    elapsed_mins = (time.time() - start_time) / 60.
    logger.info("Homer completed in %.2f minutes" % (elapsed_mins))
    return output_dir
예제 #20
0
def output_dinuc_enriched_kmers(logger,
                                fasta_fname,
                                output_dir,
                                kmer_lens,
                                num_shuffles=100):
    """
    Write out the kmers that are enriched in a FASTA file
    compared with a dinucleotide shuffled version of it.

    One '<fasta basename>.<len>_kmers.counts' file is written to
    output_dir per kmer length in kmer_lens. Lengths whose counts
    file exists already are skipped.
    """
    logger.info("Output dinucleotide enriched Kmers..")
    logger.info("  - Input FASTA: %s" % (fasta_fname))
    logger.info("  - Output dir: %s" % (output_dir))
    utils.make_dir(output_dir)
    # Shuffled FASTA files get their own subdirectory
    shuffled_dir = os.path.join(output_dir, "shuffled_fasta")
    utils.make_dir(shuffled_dir)
    shuffled_fasta = ShuffledFasta(fasta_fname, shuffled_dir)
    for curr_len in kmer_lens:
        curr_kmers = Kmers(curr_len,
                           fasta_fname=fasta_fname,
                           shuffled_fasta=shuffled_fasta)
        counts_label = "%s.%d_kmers.counts" % (os.path.basename(fasta_fname),
                                               curr_len)
        counts_fname = os.path.join(output_dir, counts_label)
        logger.info("Outputting enriched Kmers to: %s" % (counts_fname))
        if os.path.isfile(counts_fname):
            # Counts were computed by an earlier run
            logger.info("Found %s, skipping.. " % (counts_fname))
            continue
        # Compute the enriched kmers, then write them out
        enriched = curr_kmers.get_enriched_kmers(output_dir,
                                                 num_shuffles=num_shuffles)
        curr_kmers.output_enriched_kmers(enriched, counts_fname)
예제 #21
0
def output_rpkm(sample,
                output_dir,
                settings_info,
                rna_base,
                logger):
    """
    Output RPKM tables for the sample.

    Takes as input:

    - sample: a sample object
    - output_dir: output directory
    - settings_info: settings information (its "readlen" entry
      is used as the read length)
    - rna_base: an RNABase object
    - logger: logger that progress is reported to

    One '<table_name>.rpkm' file is written to output_dir for each
    constitutive exons table in the RNA base; files that exist
    already are skipped. Exits the program with status 1 if RPKMs
    have to be computed and the sample has 0 mapped reads.

    Returns the RPKM filename of the last table visited, or None
    if the RNA base has no constitutive exons tables.
    NOTE(review): only the last filename is returned, not one per
    table -- confirm that this is what callers expect.
    """
    # Output RPKM information for all constitutive exon tables
    # in the RNA Base
    print("Outputting RPKM for: %s" % (sample.label))
    # Stays None when there are no tables; the final log/return
    # used to fail on an unbound name in that case
    rpkm_output_filename = None
    for table_name, const_exons in rna_base.tables_to_const_exons.iteritems():
        rpkm_output_filename = "%s.rpkm" % (os.path.join(output_dir,
                                                         table_name))
        if os.path.isfile(rpkm_output_filename):
            logger.info("  - Skipping RPKM output, found %s"
                        % (rpkm_output_filename))
            print("  - Skipping RPKM output, %s exists"
                  % (rpkm_output_filename))
            continue
        # Directory where BAM containing mapping to constitutive
        # exons be stored
        bam2gff_outdir = os.path.join(output_dir,
                                      "bam2gff_const_exons")
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons
        # Use the rRNA subtracted BAM file
        print("Using constitutive exons GFF -> %s"
              % (const_exons.gff_filename))
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample
        num_mapped = int(sample.qc.qc_results["num_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 mapped reads."
                            % (sample.label))
            print("Error: Cannot compute RPKMs since sample %s has 0 mapped reads."
                  % (sample.label))
            sys.exit(1)
        print("Sample %s has %s mapped reads" % (sample.label, num_mapped))
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)"
                    % (table_name))
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
    logger.info("Finished outputting RPKM for %s to %s" % (sample.label,
                                                          rpkm_output_filename))
    return rpkm_output_filename
예제 #22
0
 def __init__(self, sample, pipeline):
     """
     Set up the quality control state for a sample.

     - sample: sample object; its label names the logger and
       the QC output directory/file
     - pipeline: pipeline instance that the sample is attached to

     Creates the sample's QC and regions output directories and
     finishes by calling self.load_qc_from_file().
     """
     self.pipeline = pipeline
     self.sample = sample
     self.settings_info = pipeline.settings_info
     sample_label = sample.label
     # Logger for this sample, writing to the pipeline's logs dir
     logs_dir = self.pipeline.pipeline_outdirs["logs"]
     self.logger = utils.get_logger("QualityControl.%s" % (sample_label),
                                    logs_dir)
     # Order in which the QC fields are outputted: read counts,
     # then summary statistics, then per-region counts
     self.regions_header = ["num_ribo",
                            "num_exons",
                            "num_cds",
                            "num_introns",
                            "num_3p_utr",
                            "num_5p_utr",
                            "num_tRNAs",
                            "num_junctions"]
     self.qc_stats_header = ["percent_mapped",
                             "percent_unique",
                             "percent_ribo",
                             "percent_exons",
                             "percent_cds",
                             "percent_introns",
                             "percent_3p_utr",
                             "percent_5p_utr",
                             "percent_tRNAs",
                             "3p_to_cds",
                             "5p_to_cds",
                             "3p_to_5p",
                             "exon_intron_ratio"]
     read_counts_header = ["num_reads",
                           "num_mapped",
                           "num_ribosub_mapped",
                           "num_unique_mapped"]
     self.qc_header = \
         read_counts_header + self.qc_stats_header + self.regions_header
     # QC results; fields that were not set read as the NA value
     self.na_val = "NA"
     self.qc_results = defaultdict(lambda: self.na_val)
     # Output locations: <qc dir>/<sample label>, with a
     # 'regions' subdirectory and the sample's QC file inside
     self.qc_outdir = self.pipeline.pipeline_outdirs["qc"]
     self.sample_outdir = os.path.join(self.qc_outdir, sample_label)
     utils.make_dir(self.sample_outdir)
     self.regions_outdir = os.path.join(self.sample_outdir, "regions")
     utils.make_dir(self.regions_outdir)
     qc_basename = "%s.qc.txt" % (sample_label)
     self.qc_filename = os.path.join(self.sample_outdir, qc_basename)
     self.qc_loaded = False
     # The ensGene gene table is used for QC computations
     self.gene_table = self.pipeline.rna_base.gene_tables["ensGene"]
     # Pick up QC information from an earlier run, if any
     self.load_qc_from_file()
예제 #23
0
파일: PsiTable.py 프로젝트: 1eesh/rnaseqlib
 def output_filtered_comparisons(self, output_dir=None,
                                 sort_column="bayes_factor",
                                 columns_to_write=[#"event_name",
                                                   "gene_id",
                                                   "gene_symbol",
                                                   "sample1_posterior_mean",
                                                   "sample1_ci_low",
                                                   "sample1_ci_high",
                                                   "sample2_posterior_mean",
                                                   "sample2_ci_low",
                                                   "sample2_ci_high",
                                                   "diff",
                                                   "bayes_factor",
                                                   "isoforms",
                                                   "sample1_counts",
                                                   "sample1_assigned_counts",
                                                   "sample2_counts",
                                                   "sample2_assigned_counts",
                                                   "chrom",
                                                   "strand",
                                                   "mRNA_starts",
                                                   "mRNA_ends"]):
     """
     Output filtered comparisons table.

     - output_dir: base output directory; defaults to
       self.misowrap_obj.comparisons_dir
     - sort_column: column that each table is sorted by
       (in descending order)
     - columns_to_write: columns written to each file

     One file is written per event type and comparison label:
     <output_dir>/filtered_events/<event_type>/<label>/
       <label>.<event_type>.filtered.miso_bf
     """
     if output_dir is None:
         output_dir = self.misowrap_obj.comparisons_dir
     # Hand pandas a copy so that the shared default list of
     # columns cannot be altered between calls
     columns_to_write = list(columns_to_write)
     # Output each file by event type
     output_dir = os.path.join(output_dir, "filtered_events")
     print("Outputting filtered events...")
     print("  - Output dir: %s" % (output_dir))
     utils.make_dir(output_dir)
     for event_type, filtered_df in self.filtered_events.iteritems():
         curr_output_dir = os.path.join(output_dir, event_type)
         print("Event type: %s" % (event_type))
         # View by comparison
         comparison_labels = \
             utils.unique_list(filtered_df.index.get_level_values(0))
         print("Outputting %d comparisons" % (len(comparison_labels)))
         for label in comparison_labels:
             print("Comparison: %s" % (label))
             comparison_output_dir = os.path.join(curr_output_dir,
                                                  label)
             utils.make_dir(comparison_output_dir)
             output_filename = os.path.join(comparison_output_dir,
                                            "%s.%s.filtered.miso_bf"
                                            % (label,
                                               event_type))
             print("Outputting to: %s" % (output_filename))
             curr_df = filtered_df.ix[label].sort_index(by=sort_column,
                                                        ascending=False)
             curr_df.to_csv(output_filename,
                            sep=self.delimiter,
                            float_format="%.4f",
                            cols=columns_to_write)
예제 #24
0
def intersect_events_with_genes(events_gff_fname,
                                gene_tables_dir,
                                output_dir,
                                genes_source="ensGene",
                                na_val="NA"):
    """
    Intersect GFF events with a genes table (also in GFF format).

    Computes the outermost transcription start/end bounds for each
    gene and then intersects the GFF events with these bounds.

    Outputs a tab-separated mapping from event ID to the comma-joined
    gene IDs that it maps to. If the output file already exists, it
    is left untouched.

    Parameters:
    -----------
    events_gff_fname : GFF events filename
    gene_tables_dir : directory with gene tables (created by --init
      module of rnaseqlib)
    output_dir : output directory
    genes_source : source of genes table, e.g. ensGene or refGene.
      By default, assumes input is an Ensembl table.
    na_val : not used by this function; kept so that existing callers
      that pass it keep working

    Returns the filename of the events-to-genes mapping, whether it
    was found on disk or freshly written.
    """
    utils.make_dir(output_dir)
    events_basename = os.path.basename(events_gff_fname)
    events_to_genes_fname = \
        os.path.join(output_dir,
                     "%s_to_%s.txt" % (events_basename, genes_source))
    print("Outputting events to genes...")
    print("  - Output file: %s" % (events_to_genes_fname))
    if os.path.isfile(events_to_genes_fname):
        print("Found %s. Skipping.." % (events_to_genes_fname))
        return events_to_genes_fname
    # Load the gene table without parsing the individual genes
    gene_table = tables.GeneTable(gene_tables_dir, genes_source)
    # Create a BED file containing the most inclusive txStart/txEnd
    # for each gene in the table
    bed_coords_fname = output_inclusive_trans_coords(gene_table,
                                                     output_dir)
    # Intersect the GFF events with this BED file of coordinates
    # to determine what genes each event overlaps
    intersected_bed_fname = \
        intersect_events_with_bed(events_gff_fname,
                                  bed_coords_fname,
                                  output_dir)
    # Parse the resulting intersectBed results to get a mapping
    # from events to the genes they map to
    events_to_genes = get_events_to_genes(intersected_bed_fname)
    with open(events_to_genes_fname, "w") as events_to_genes_out:
        events_to_genes_out.write("event_id\tgene_id\n")
        for event, genes in events_to_genes.iteritems():
            events_to_genes_out.write("%s\t%s\n" % (event, ",".join(genes)))
    # Return the filename here too, matching the "already exists" path
    return events_to_genes_fname
예제 #25
0
def intersect_events_with_genes(events_gff_fname,
                                gene_tables_dir,
                                output_dir,
                                genes_source="ensGene",
                                na_val="NA"):
    """
    Intersect GFF events with a genes table (also in GFF format).

    Computes the outermost transcription start/end bounds for each
    gene and then intersects the GFF events with these bounds.

    Outputs a tab-separated mapping from event ID to the comma-joined
    gene IDs that it maps to. If the output file already exists, it
    is left untouched.

    Parameters:
    -----------
    events_gff_fname : GFF events filename
    gene_tables_dir : directory with gene tables (created by --init
      module of rnaseqlib)
    output_dir : output directory
    genes_source : source of genes table, e.g. ensGene or refGene.
      By default, assumes input is an Ensembl table.
    na_val : not used by this function; kept so that existing callers
      that pass it keep working

    Returns the filename of the events-to-genes mapping, whether it
    was found on disk or freshly written.
    """
    utils.make_dir(output_dir)
    mapping_basename = "%s_to_%s.txt" % (os.path.basename(events_gff_fname),
                                         genes_source)
    events_to_genes_fname = os.path.join(output_dir, mapping_basename)
    print("Outputting events to genes...")
    print("  - Output file: %s" % (events_to_genes_fname))
    if os.path.isfile(events_to_genes_fname):
        print("Found %s. Skipping.." % (events_to_genes_fname))
        return events_to_genes_fname
    # Load the gene table without parsing the individual genes
    gene_table = tables.GeneTable(gene_tables_dir, genes_source)
    # Create a BED file containing the most inclusive txStart/txEnd
    # for each gene in the table
    bed_coords_fname = output_inclusive_trans_coords(gene_table, output_dir)
    # Intersect the GFF events with this BED file of coordinates
    # to determine what genes each event overlaps
    intersected_bed_fname = intersect_events_with_bed(events_gff_fname,
                                                      bed_coords_fname,
                                                      output_dir)
    # Parse the resulting intersectBed results to get a mapping
    # from events to the genes they map to
    events_to_genes = get_events_to_genes(intersected_bed_fname)
    with open(events_to_genes_fname, "w") as events_to_genes_out:
        events_to_genes_out.write("event_id\tgene_id\n")
        for event, genes in events_to_genes.iteritems():
            genes_str = ",".join(genes)
            events_to_genes_out.write("%s\t%s\n" % (event, genes_str))
    # Previously this path fell off the end and returned None, unlike
    # the "already exists" path above
    return events_to_genes_fname
예제 #26
0
def output_rpkm(sample,
                output_dir,
                settings_info,
                rna_base,
                logger):
    """
    Output RPKM tables for the sample.

    For every constitutive exons table in the RNA base, maps the
    sample's rRNA-subtracted BAM to the table's GFF and writes an RPKM
    file named <output_dir>/<table_name>.rpkm. Tables whose RPKM file
    already exists are skipped.

    Parameters:
    -----------
    sample : a sample object (its label, ribosub_bam_filename and
      qc.qc_results are read)
    output_dir : output directory
    settings_info : settings information; "readlen" is read from it
    rna_base : an RNABase object (tables_to_const_exons is read)
    logger : logger used for progress messages

    Returns the RPKM filename of the last table visited, or None if
    the RNA base has no constitutive exon tables. Exits the process
    with status 1 if a table needs processing and the sample has 0
    ribosub-mapped reads.
    """
    print("Outputting RPKM for: %s" % (sample.label))
    # Stays None when there are no tables, so the final log message
    # and the return do not hit an unbound local
    rpkm_output_filename = None
    # Directory where BAMs mapped to constitutive exons are stored
    bam2gff_outdir = os.path.join(output_dir, "bam2gff_const_exons")
    for table_name, const_exons in rna_base.tables_to_const_exons.items():
        rpkm_output_filename = "%s.rpkm" % (os.path.join(output_dir,
                                                         table_name))
        if os.path.isfile(rpkm_output_filename):
            logger.info("  - Skipping RPKM output, found %s" %(rpkm_output_filename))
            continue
        utils.make_dir(bam2gff_outdir)
        # Map reads to GFF of constitutive exons
        # Use the rRNA subtracted BAM file
        exons_bam_fname = exon_utils.map_bam2gff(sample.ribosub_bam_filename,
                                                 const_exons.gff_filename,
                                                 bam2gff_outdir)
        # Compute RPKMs for sample: use number of ribosub mapped reads
        num_mapped = int(sample.qc.qc_results["num_ribosub_mapped"])
        if num_mapped == 0:
            logger.critical("Cannot compute RPKMs since sample %s has 0 " \
                            "mapped reads." %(sample.label))
            sys.exit(1)
        logger.info("Sample %s has %s mapped reads" %(sample.label, num_mapped))
        read_len = settings_info["readlen"]
        logger.info("Outputting RPKM from GFF aligned BAM (table %s)" \
                    %(table_name))
        output_rpkm_from_gff_aligned_bam(exons_bam_fname,
                                         num_mapped,
                                         read_len,
                                         const_exons,
                                         rpkm_output_filename)
    logger.info("Finished outputting RPKM for %s to %s" %(sample.label,
                                                          rpkm_output_filename))
    return rpkm_output_filename
예제 #27
0
def compare(settings, logs_outdir, delay=5, dry_run=False):
    """
    Run a MISO samples comparison between all pairs of samples.

    For each comparison group, every pair of samples is compared
    along each event type, either by launching a cluster job or by
    running the comparison command locally.

    Parameters:
    -----------
    settings : settings filename
    logs_outdir : directory handed to MISOWrap as its second argument
    delay : seconds to sleep after each cluster job submission
    dry_run : if True, the commands are only logged, not executed
    """
    settings_filename = utils.pathify(settings)
    misowrap_obj = mw.MISOWrap(settings_filename, logs_outdir,
                               logger_label="compare")
    miso_output_dir = misowrap_obj.miso_outdir
    comparison_groups = misowrap_obj.comparison_groups
    comparisons_dir = misowrap_obj.comparisons_dir
    utils.make_dir(comparisons_dir)
    misowrap_obj.logger.info("Running MISO comparisons...")
    ##
    ## Compute comparisons between all pairs
    ## in a sample group
    ##
    for comp_group in comparison_groups:
        sample_pairs = utils.get_pairwise_comparisons(comp_group)
        print("  - Total of %d comparisons" % (len(sample_pairs)))
        for sample1, sample2 in sample_pairs:
            # For each pair of samples, compare their output
            # along each event type
            misowrap_obj.logger.info("Comparing %s %s" % (sample1, sample2))
            # Directories for each sample
            sample1_dir = os.path.join(miso_output_dir, sample1)
            sample2_dir = os.path.join(miso_output_dir, sample2)
            for event_type in misowrap_obj.event_types:
                sample1_event_dir = os.path.join(sample1_dir, event_type)
                sample2_event_dir = os.path.join(sample2_dir, event_type)
                job_name = "compare_%s_%s_%s" % (sample1, sample2, event_type)
                event_comparisons_dir = os.path.join(comparisons_dir,
                                                     event_type)
                compare_cmd = \
                    "%s --compare-samples %s %s %s " \
                    "--comparison-labels %s %s" \
                    % (misowrap_obj.compare_miso_cmd,
                       sample1_event_dir,
                       sample2_event_dir,
                       event_comparisons_dir,
                       sample1,
                       sample2)
                misowrap_obj.logger.info("Executing: %s" % (compare_cmd))
                if dry_run:
                    # Dry run: the command is logged but never executed
                    continue
                if misowrap_obj.use_cluster:
                    misowrap_obj.my_cluster.launch_job(compare_cmd,
                                                       job_name,
                                                       ppn=1)
                    # Pause between cluster submissions
                    time.sleep(delay)
                else:
                    os.system(compare_cmd)
예제 #28
0
 def run_meme_on_enriched_kmers(self, output_dir,
                                fold_enriched_cutoff=2,
                                method="max",
                                len_to_output=None):
     """
     Run MEME on all enriched kmers.

     Writes the enriched kmers as a FASTA file under
     <output_dir>/seqs, then runs MEME on that file with results
     going to <output_dir>/meme_output. MEME is not run if that
     directory already contains anything.

     Parameters:
     -----------
     output_dir : base output directory
     fold_enriched_cutoff : kmers whose "rank" value is at least this
       are written to the FASTA file
     method : enrichment method label; only used in the log output
       and in the FASTA filename
     len_to_output : if given, only kmers of this length are written;
       by default kmers of all lengths are written
     """
     self.logger.info("Running MEME on enriched BindnSeq kmers...")
     self.logger.info("  - Output dir: %s" %(output_dir))
     self.logger.info("  - Fold enrichment cutoff: %.1f" %(fold_enriched_cutoff))
     self.logger.info("  - Enrichment method: %s" %(method))
     # Make directory for all the kmer sequences to be
     # processed by MEME
     self.seqs_dir = os.path.join(output_dir, "seqs")
     utils.make_dir(self.seqs_dir)
     # Output all enriched kmers to file
     if len_to_output is None:
         len_to_output = "all"
     self.seqs_fname = \
         os.path.join(self.seqs_dir,
                      "enriched_kmers.cutoff_%.1f.method_%s.%s_kmers.fasta" \
                      %(fold_enriched_cutoff, method, str(len_to_output)))
     self.logger.info("Outputting sequences as FASTA to: %s" %(self.seqs_fname))
     # Context manager so the FASTA handle is closed even if ranking
     # or writing raises
     with open(self.seqs_fname, "w") as seqs_out:
         # NOTE: kmer lengths are hard-coded here rather than taken
         # from self.kmer_lens
         for kmer_len in [4, 5, 6]:
             if len_to_output != "all" and len_to_output != kmer_len:
                 print("Skipping %d" % (kmer_len))
                 continue
             odds_ratios = self.odds_ratios[kmer_len]
             # Rank the odds ratios
             ranked_ratios = self.rank_enriched_kmers(odds_ratios)
             # Select only the kmers that meet the cutoff
             enriched_ratios = \
                 ranked_ratios[ranked_ratios["rank"] >= fold_enriched_cutoff]
             # Write those to file, using the kmer as its own header
             for kmer in enriched_ratios["kmer"].values:
                 seqs_out.write(">%s\n" % (kmer))
                 seqs_out.write("%s\n" % (kmer))
     # Run MEME on FASTA file with kmers
     output_dir = os.path.join(output_dir, "meme_output")
     utils.make_dir(output_dir)
     self.logger.info("Running MEME on enriched BindnSeq kmers...")
     self.logger.info("  - MEME output dir: %s" %(output_dir))
     if len(glob.glob(os.path.join(output_dir, "*"))) >= 1:
         self.logger.info("MEME output exists. Skipping...")
         return
     meme_utils.run_meme(self.logger, self.seqs_fname, output_dir)
예제 #29
0
 def find_motifs_homer(self, output_dir, homer_kmer_lens=[4, 5, 6, 7, 8]):
     """
     Find motifs with Homer.

     Runs Homer once on the experimental coordinates and once on the
     control coordinates, with output going to the "exp" and
     "control" subdirectories of <output_dir>/homer_output.

     Parameters:
     -----------
     output_dir : base output directory
     homer_kmer_lens : kmer lengths, passed to Homer as a
       comma-separated "-len" value
     """
     homer_dir = os.path.join(output_dir, "homer_output")
     utils.make_dir(homer_dir)
     lens_str = ",".join([str(kmer_len) for kmer_len in homer_kmer_lens])
     params = {"-rna": "",
               "-len": lens_str}
     # Experimental set
     exp_outdir = os.path.join(homer_dir, "exp")
     homer_utils.run_homer(self.logger, self.exp_coords_fname,
                           self.genome, exp_outdir, params)
     # Control set
     control_outdir = os.path.join(homer_dir, "control")
     homer_utils.run_homer(self.logger, self.control_coords_fname,
                           self.genome, control_outdir, params)
예제 #30
0
파일: tables.py 프로젝트: hjanime/rnaseqlib
 def init_dirs(self):
     """
     Call utils.make_dir on the exons, constitutive exons, introns
     and UTRs directories, in that order.
     """
     for dir_attr in ("exons_dir",
                      "const_exons_dir",
                      "introns_dir",
                      "utrs_dir"):
         utils.make_dir(getattr(self, dir_attr))
def merge_events(genome,
                 event_type,
                 splicegraph_events_dir,
                 new_events_dir,
                 output_dir):
    """
    Merge the old SpliceGraph events GFF with the new events GFF for
    one genome and event type.

    Returns without merging (after printing a message) if either
    input GFF cannot be found. Raises an Exception if the event type
    does not start with SE, MXE, A5SS, A3SS or RI.

    Parameters:
    -----------
    genome : genome label, used as subdirectory and in filenames
    event_type : event type; anything after the first "_" is dropped
      when looking up the new events file
    splicegraph_events_dir : directory of the old events
    new_events_dir : directory of the new events
    output_dir : output directory; the merged GFF is placed in its
      <genome> subdirectory
    """
    gff_basename = "%s.%s.gff3" % (event_type, genome)
    sg_gff_fname = os.path.join(splicegraph_events_dir, genome, gff_basename)
    if not os.path.isfile(sg_gff_fname):
        print("Cannot find %s" % (sg_gff_fname))
        return
    # split() gives back the whole string when there is no "_"
    new_event_type = event_type.split("_")[0]
    new_gff_fname = os.path.join(new_events_dir,
                                 genome,
                                 "commonshortest",
                                 "%s.%s.gff3" % (new_event_type, genome))
    if not os.path.isfile(new_gff_fname):
        print("Cannot find %s" % (new_gff_fname))
        return
    output_dir = os.path.join(output_dir, genome)
    utils.make_dir(output_dir)
    output_gff_fname = os.path.join(output_dir, gff_basename)
    print("Merging %s.." % (event_type))
    print("  - Old: %s" % (sg_gff_fname))
    print("  - New: %s" % (new_gff_fname))
    # Pick the merge function by event type prefix
    if event_type.startswith("SE"):
        merge_func = merge_se
    elif event_type.startswith("MXE"):
        merge_func = merge_mxe
    elif event_type.startswith("A5SS"):
        merge_func = merge_a5ss
    elif event_type.startswith("A3SS"):
        merge_func = merge_a3ss
    elif event_type.startswith("RI"):
        merge_func = merge_ri
    else:
        raise Exception("Unrecognized event type %s" % (event_type))
    # Make merge operation
    merge_func(sg_gff_fname, new_gff_fname, output_gff_fname, genome)
예제 #32
0
 def __init__(self, sample, pipeline):
     """
     Set up quality control state for a sample.

     Creates the per-sample QC directory and its "regions"
     subdirectory, then calls load_qc_from_file().

     Parameters:
     -----------
     sample : sample object; its label names the logger, the output
       directory and the QC file
     pipeline : pipeline instance that the sample is attached to
     """
     self.pipeline = pipeline
     self.sample = sample
     self.settings_info = pipeline.settings_info
     # Define logger
     logger_name = "QualityControl.%s" % (sample.label)
     self.logger = utils.get_logger(logger_name,
                                    self.pipeline.pipeline_outdirs["logs"])
     # QC header: order of QC fields to be outputted
     self.regions_header = ["num_ribo",
                            "num_exons",
                            "num_cds",
                            "num_introns",
                            "num_3p_utr",
                            "num_5p_utr",
                            "num_tRNAs",
                            "num_junctions"]
     self.qc_stats_header = ["percent_mapped",
                             "percent_unique",
                             "percent_ribo",
                             "percent_exons",
                             "percent_cds",
                             "percent_introns",
                             "percent_3p_utr",
                             "percent_5p_utr",
                             "percent_tRNAs",
                             "3p_to_cds",
                             "5p_to_cds",
                             "3p_to_5p",
                             "exon_intron_ratio"]
     # Read counts first, then the stats, then the per-region counts
     self.qc_header = ["num_reads",
                       "num_mapped",
                       "num_ribosub_mapped",
                       "num_unique_mapped"]
     self.qc_header.extend(self.qc_stats_header)
     self.qc_header.extend(self.regions_header)
     # QC results: fields that were never set read as the NA value
     self.na_val = "NA"
     self.qc_results = defaultdict(lambda: self.na_val)
     # QC output dir, with one subdirectory per sample
     self.qc_outdir = self.pipeline.pipeline_outdirs["qc"]
     self.sample_outdir = os.path.join(self.qc_outdir, self.sample.label)
     utils.make_dir(self.sample_outdir)
     # Regions output dir
     self.regions_outdir = os.path.join(self.sample_outdir, "regions")
     utils.make_dir(self.regions_outdir)
     # QC filename for this sample
     qc_basename = "%s.qc.txt" % (self.sample.label)
     self.qc_filename = os.path.join(self.sample_outdir, qc_basename)
     self.qc_loaded = False
     # use ensGene gene table for QC computations
     self.gene_table = self.pipeline.rna_base.gene_tables["ensGene"]
     # Load QC information if file corresponding to sample
     # already exists
     self.load_qc_from_file()
예제 #33
0
def trim_polyA_ends(fastq_filename,
                    output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Only reads that end in at least min_polyA_len As, and that are
    still at least min_read_len long after trimming, are written to
    the output; all other reads are left out of the output file.
    If the output file already exists, nothing is done.

    Parameters:
    -----------
    fastq_filename : input FASTQ filename
    output_dir : output directory
    compressed : not used by this function
    min_polyA_len : minimum number of trailing As for a read to be
      trimmed and kept
    min_read_len : minimum length of a read after trimming

    Returns the output filename, which is the input basename with its
    last extension replaced by ".trimmed_polyA.fastq.gz".
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension
    output_basename = \
        ".".join(os.path.basename(fastq_filename).split(".")[0:-1])
    output_basename = "%s.trimmed_polyA.fastq.gz" % (output_basename)
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print("  - Outputting trimmed sequences to: %s" % (output_filename))
    # NOTE(review): the input handle is not closed here; what
    # read_open_fastq returns is not visible from this function
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    output_file = fastq_utils.write_open_fastq(output_filename)
    t1 = time.time()
    try:
        for header, seq, header2, qual in fastq_utils.read_fastq(input_file):
            if not seq.endswith("A"):
                continue
            # Skip sequences that do not end with at least N
            # many As
            if seq[-min_polyA_len:] != ("A" * min_polyA_len):
                continue
            # Get sequence stripped of contiguous stretch of polyAs
            stripped_seq = rstrip_stretch(seq, "A")
            if len(stripped_seq) < min_read_len:
                # Skip altogether reads that are shorter than
                # the required length after trimming
                continue
            # Strip the quality scores to match trimmed sequence
            new_qual = qual[0:len(stripped_seq)]
            # Write the record with trimmed sequence back out to file
            fastq_utils.write_fastq(output_file,
                                    (header, stripped_seq, header2, new_qual))
        t2 = time.time()
        print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    finally:
        # Close the output even if reading or writing raised
        output_file.close()
    return output_filename
예제 #34
0
def trim_polyA_ends(fastq_filename,
                    output_dir,
                    compressed=False,
                    min_polyA_len=3,
                    min_read_len=22):
    """
    Trim polyA ends from reads.

    Only reads that end in at least min_polyA_len As, and that are
    still at least min_read_len long after trimming, are written to
    the output; all other reads are left out of the output file.
    If the output file already exists, nothing is done.

    Parameters:
    -----------
    fastq_filename : input FASTQ filename
    output_dir : output directory
    compressed : not used by this function
    min_polyA_len : minimum number of trailing As for a read to be
      trimmed and kept
    min_read_len : minimum length of a read after trimming

    Returns the output filename, which is the input basename with its
    last extension replaced by ".trimmed_polyA.fastq.gz".
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension
    name_parts = os.path.basename(fastq_filename).split(".")
    output_basename = "%s.trimmed_polyA.fastq.gz" % (".".join(name_parts[0:-1]))
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print("  - Outputting trimmed sequences to: %s" % (output_filename))
    # NOTE(review): the input handle is not closed here; what
    # read_open_fastq returns is not visible from this function
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    output_file = fastq_utils.write_open_fastq(output_filename)
    polyA_tail = "A" * min_polyA_len
    t1 = time.time()
    try:
        for header, seq, header2, qual in fastq_utils.read_fastq(input_file):
            if not seq.endswith("A"):
                continue
            # Skip sequences that do not end with at least N
            # many As
            if seq[-min_polyA_len:] != polyA_tail:
                continue
            # Get sequence stripped of contiguous stretch of polyAs
            stripped_seq = rstrip_stretch(seq, "A")
            if len(stripped_seq) < min_read_len:
                # Skip altogether reads that are shorter than
                # the required length after trimming
                continue
            # Strip the quality scores to match trimmed sequence
            new_rec = (header, stripped_seq, header2,
                       qual[0:len(stripped_seq)])
            # Write the record with trimmed sequence back out to file
            fastq_utils.write_fastq(output_file, new_rec)
        t2 = time.time()
        print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
    finally:
        # Close the output even if reading or writing raised
        output_file.close()
    return output_filename
예제 #35
0
 def output_filtered_comparisons(
     self,
     output_dir=None,
     sort_column="bayes_factor",
     columns_to_write=[  #"event_name",
         "gene_id", "gene_symbol", "sample1_posterior_mean",
         "sample1_ci_low", "sample1_ci_high", "sample2_posterior_mean",
         "sample2_ci_low", "sample2_ci_high", "diff", "bayes_factor",
         "isoforms", "sample1_counts", "sample1_assigned_counts",
         "sample2_counts", "sample2_assigned_counts", "chrom", "strand",
         "mRNA_starts", "mRNA_ends"
     ]):
     """
     Output filtered comparisons table.

     Writes one file per event type and comparison label, named
     <output_dir>/filtered_events/<event_type>/<label>/
     <label>.<event_type>.filtered.miso_bf, sorted by sort_column in
     descending order.

     Parameters:
     -----------
     output_dir : base output directory; defaults to the comparisons
       directory of self.misowrap_obj
     sort_column : column to sort each table by
     columns_to_write : columns to include in the output (the
       default list is only read, never modified)
     """
     if output_dir is None:
         output_dir = self.misowrap_obj.comparisons_dir
     # Output each file by event type
     output_dir = os.path.join(output_dir, "filtered_events")
     print("Outputting filtered events...")
     print("  - Output dir: %s" % (output_dir))
     utils.make_dir(output_dir)
     for event_type, filtered_df in self.filtered_events.iteritems():
         curr_output_dir = os.path.join(output_dir, event_type)
         print("Event type: %s" % (event_type))
         # View by comparison: labels come from the first index level
         comparison_labels = \
             utils.unique_list(filtered_df.index.get_level_values(0))
         print("Outputting %d comparisons" % (len(comparison_labels)))
         for label in comparison_labels:
             print("Comparison: %s" % (label))
             comparison_output_dir = os.path.join(curr_output_dir, label)
             utils.make_dir(comparison_output_dir)
             output_filename = os.path.join(comparison_output_dir,
                                            "%s.%s.filtered.miso_bf" \
                                            %(label,
                                              event_type))
             print("Outputting to: %s" % (output_filename))
             # NOTE(review): .ix, sort_index(by=...) and to_csv(cols=...)
             # are legacy pandas spellings; left unchanged since the
             # pandas version this file targets is not visible here
             curr_df = filtered_df.ix[label].sort_index(by=sort_column,
                                                        ascending=False)
             curr_df.to_csv(output_filename,
                            sep=self.delimiter,
                            float_format="%.4f",
                            cols=columns_to_write)
예제 #36
0
 def __init__(self, settings_filename, output_dir,
              logger_label=None):
     """
     Initialize the wrapper state, create the main output directory,
     then load the settings and the events-to-genes annotation.

     Parameters:
     -----------
     settings_filename : settings filename
     output_dir : main output directory
     logger_label : label stored on the instance as self.logger_label
       (presumably used when the logger is created -- verify against
       load_settings)
     """
     self.settings_filename = settings_filename
     self.settings_info = None
     # Keep the label the caller passed in; it used to be discarded
     # and replaced by None
     self.logger_label = logger_label
     # Main output directory
     self.output_dir = utils.pathify(output_dir)
     utils.make_dir(self.output_dir)
     # MISO output directory (where raw output is)
     self.miso_outdir = None
     # Comparisons output directory
     self.comparisons_outdir = None
     # BAM files to process
     self.bam_files = None
     # Sample labels
     self.sample_labels = None
     self.comparison_groups = None
     # Insert length directory (for paired-end samples)
     self.insert_lens_dir = None
     # Logs output directory
     self.logs_outdir = None
     # Logger object
     self.logger = None
     # Cluster submission object
     self.my_cluster = None
     # Event types to process
     self.event_types = None
     # Whether to submit jobs to cluster
     self.use_cluster = False
     # run_miso cmd
     self.run_miso_cmd = None
     # run_events_analysis cmd
     self.run_events_cmd = None
     # Constitutive exons GFF file: used to compute
     # the insert length distribution
     self.const_exons_gff = None
     # Load settings
     self.load_settings()
     ##
     ## Load annotation of events, like a map
     ## events to genes.
     ##
     self.events_to_genes = None
     self.load_events_to_genes()
예제 #37
0
def index_merged_events():
    """
    Run index_gff on the merged events GFF of every genome and event
    type, with output going to
    <MERGED_EVENTS_DIR>/pickled/<genome>/<event_type>.

    Missing GFF files are reported and skipped. Raises an Exception
    if an index_gff call returns a nonzero status.
    """
    genomes = ["mm9", "hg18", "hg19"]
    event_types = ["SE", "SE_shortest_noAceView", "MXE", "A3SS", "A5SS", "RI"]
    for genome in genomes:
        for event_type in event_types:
            gff_basename = "%s.%s.gff3" % (event_type, genome)
            gff_fname = os.path.join(MERGED_EVENTS_DIR, genome, gff_basename)
            output_dir = os.path.join(MERGED_EVENTS_DIR,
                                      "pickled",
                                      genome,
                                      event_type)
            # The output directory is made even when the GFF is missing
            if not os.path.isdir(output_dir):
                utils.make_dir(output_dir)
            if not os.path.isfile(gff_fname):
                print("Cannot find %s" % (gff_fname))
                continue
            index_cmd = "index_gff --index %s %s" % (gff_fname, output_dir)
            if os.system(index_cmd) != 0:
                raise Exception("Failed to index %s" % (gff_fname))
예제 #38
0
 def __init__(self, settings_filename, output_dir, logger_label=None):
     """
     Initialize the wrapper state, create the main output directory,
     then load the settings and the events-to-genes annotation.

     Parameters:
     -----------
     settings_filename : settings filename
     output_dir : main output directory
     logger_label : label stored on the instance as self.logger_label
       (presumably used when the logger is created -- verify against
       load_settings)
     """
     self.settings_filename = settings_filename
     self.settings_info = None
     # Keep the label the caller passed in; it used to be discarded
     # and replaced by None
     self.logger_label = logger_label
     # Main output directory
     self.output_dir = utils.pathify(output_dir)
     utils.make_dir(self.output_dir)
     # MISO output directory (where raw output is)
     self.miso_outdir = None
     # Comparisons output directory
     self.comparisons_outdir = None
     # BAM files to process
     self.bam_files = None
     # Sample labels
     self.sample_labels = None
     self.comparison_groups = None
     # Insert length directory (for paired-end samples)
     self.insert_lens_dir = None
     # Logs output directory
     self.logs_outdir = None
     # Logger object
     self.logger = None
     # Cluster submission object
     self.my_cluster = None
     # Event types to process
     self.event_types = None
     # Whether to submit jobs to cluster
     self.use_cluster = False
     # run_miso cmd
     self.run_miso_cmd = None
     # run_events_analysis cmd
     self.run_events_cmd = None
     # Constitutive exons GFF file: used to compute
     # the insert length distribution
     self.const_exons_gff = None
     # Load settings
     self.load_settings()
     ##
     ## Load annotation of events, like a map
     ## events to genes.
     ##
     self.events_to_genes = None
     self.load_events_to_genes()
예제 #39
0
def index_merged_events():
    """
    Run index_gff on the merged events GFF of every genome and event
    type, with output going to
    <MERGED_EVENTS_DIR>/pickled/<genome>/<event_type>.

    Missing GFF files are reported and skipped. Raises an Exception
    if an index_gff call returns a nonzero status.
    """
    for genome in ["mm9", "hg18", "hg19"]:
        for event_type in ["SE", "SE_shortest_noAceView",
                           "MXE", "A3SS", "A5SS", "RI"]:
            gff_fname = os.path.join(MERGED_EVENTS_DIR,
                                     genome,
                                     "%s.%s.gff3" % (event_type, genome))
            output_dir = os.path.join(MERGED_EVENTS_DIR, "pickled",
                                      genome, event_type)
            # The output directory is made even when the GFF is missing
            if not os.path.isdir(output_dir):
                utils.make_dir(output_dir)
            if not os.path.isfile(gff_fname):
                print("Cannot find %s" % (gff_fname))
                continue
            ret_val = os.system("index_gff --index %s %s" % (gff_fname,
                                                             output_dir))
            if ret_val != 0:
                raise Exception("Failed to index %s" % (gff_fname))
예제 #40
0
 def __init__(self, event_ids, label, input_seqs_fname,
              remove_repeats=False,
              entry_types=None,
              output_dir=None):
     """
     Store the event IDs and settings, make the output directory and
     call output_event_seqs_and_coords().

     Parameters:
     -----------
     event_ids : event IDs
     label : label
     input_seqs_fname : input sequences filename
     remove_repeats : whether to remove repeats or not from sequences
     entry_types : entry types
     output_dir : output directory, passed to utils.make_dir
     """
     self.event_ids = event_ids
     self.label = label
     self.input_seqs_fname = input_seqs_fname
     self.remove_repeats = remove_repeats
     self.entry_types = entry_types
     self.output_dir = output_dir
     utils.make_dir(self.output_dir)
     # Keyed by entry type: sequence filenames, BED filenames
     # and total length of sequences
     self.seqs_fnames = {}
     self.bed_fnames = {}
     self.total_lens = {}
     self.output_event_seqs_and_coords()
예제 #41
0
파일: RNABase.py 프로젝트: 1eesh/rnaseqlib
 def build_indices(self):
     """
     Build relevant genome indices for use with
     Bowtie/Tophat.

     Does nothing if self.with_index is false, if there are no FASTA
     files to build from, or if Bowtie 1 (*.ebwt) or Bowtie 2 (*.bt2)
     index files starting with the genome name are already present
     in <output_dir>/indices.

     Note that this changes the process working directory to the
     indices directory and does not change it back.
     """
     if not self.with_index:
         print("Not building indices.")
         return
     print("Building indices..")
     fasta_files = self.get_bowtie_index_fasta_files()
     num_files = len(fasta_files)
     if num_files == 0:
         print("WARNING: No FASTA files to build index from.")
         return
     self.indices_dir = os.path.join(self.output_dir, "indices")
     utils.make_dir(self.indices_dir)
     ##
     ## Check if the Bowtie index is already present, if so skip
     ##
     # Check for Bowtie 1 indices
     indices = glob.glob(os.path.join(self.indices_dir,
                                      "%s*.ebwt" %(self.genome)))
     # Check for Bowtie 2 indices. The genome has to be filled into
     # the pattern; a bare "%s*.bt2" only matches names that start
     # with a literal "%s"
     indices += glob.glob(os.path.join(self.indices_dir,
                                       "%s*.bt2" %(self.genome)))
     if len(indices) >= 1:
         print("Found Bowtie index files in %s. Skipping index build.." \
             %(self.indices_dir))
         return
     print("Building Bowtie index from %d files" %(num_files))
     for fasta_fname in fasta_files:
         print(" - %s" %(os.path.basename(fasta_fname)))
     fasta_str = ",".join(fasta_files)
     # Change to indices directory
     os.chdir(self.indices_dir)
     # Use the genome as basename for the bowtie index
     bowtie_build_cmd = "bowtie-build %s %s" %(fasta_str,
                                               self.genome)
     t1 = time.time()
     os.system(bowtie_build_cmd)
     t2 = time.time()
     print("Bowtie build took %.2f minutes" %((t2 - t1) / 60.))
예제 #42
0
def jf_count_kmers(fastx_fname, kmer_len,
                   output_dir,
                   hash_size=100000000):
    """
    Count kmers using jellyfish.

    Runs "count", merges the resulting db with jf_merge and then
    runs "dump" to write the counts. Output goes to the "jf_counts"
    subdirectory of output_dir; db and counts files left there by a
    previous run are removed first.

    Parameters:
    -----------
    fastx_fname : input fastx filename; the process exits with
      status 1 if it does not exist
    kmer_len : kmer length (-m)
    output_dir : output directory
    hash_size : hash size (-s)

    Returns the filename of the dumped counts. Raises an Exception if
    the count or the dump command returns a nonzero status.
    """
    if not os.path.isfile(fastx_fname):
        print("Error: fastx file %s not found." % (fastx_fname))
        sys.exit(1)
    # Count kmers, use temporary file for db
    fastx_basename = os.path.basename(fastx_fname)
    output_dir = os.path.join(output_dir, "jf_counts")
    utils.make_dir(output_dir)
    db_fname = "%s.jf" % (os.path.join(output_dir, fastx_basename))
    output_fname = "%s_counts" % (db_fname)
    # Remove leftovers of a previous run so they get overwritten
    for stale_fname in (db_fname, output_fname):
        if os.path.isfile(stale_fname):
            os.remove(stale_fname)
    count_cmd = "%s count -m %d -o %s -s %d %s" \
        % (jf_path,
           kmer_len,
           db_fname,
           hash_size,
           fastx_fname)
    if os.system(count_cmd) != 0:
        raise Exception("jellyfish count call failed.")
    # Merge db results
    merged_fname = jf_merge(db_fname)
    # Dump the kmer counts to the output file
    dump_cmd = "%s dump -o %s %s" \
        % (jf_path,
           output_fname,
           merged_fname)
    # A failed dump is reported the same way as a failed count,
    # rather than returning a filename with no usable counts
    if os.system(dump_cmd) != 0:
        raise Exception("jellyfish dump call failed.")
    return output_fname
def merge_events(genome, event_type, splicegraph_events_dir, new_events_dir,
                 output_dir):
    """
    Merge old SpliceGraph events with newly defined events.

    Looks for the old events in
    <splicegraph_events_dir>/<genome>/<event_type>.<genome>.gff3 and
    for the new events in
    <new_events_dir>/<genome>/commonshortest/<type>.<genome>.gff3,
    where <type> is event_type up to its first underscore.  The
    merge function selected by the prefix of event_type (SE, MXE,
    A5SS, A3SS or RI) is then called with the two input filenames,
    the output filename <output_dir>/<genome>/<event_type>.<genome>.gff3
    and the genome.

    Returns None.  If either input file is missing, a message is
    printed and nothing is merged (no output directory is created).

    Raises Exception if event_type has none of the known prefixes.
    """
    gff_basename = "%s.%s.gff3" % (event_type, genome)
    sg_gff_fname = os.path.join(splicegraph_events_dir, genome, gff_basename)
    if not os.path.isfile(sg_gff_fname):
        print("Cannot find %s" % (sg_gff_fname))
        return
    # Without an underscore, split() yields the event type unchanged
    new_event_type = event_type.split("_")[0]
    new_gff_fname = os.path.join(new_events_dir, genome, "commonshortest",
                                 "%s.%s.gff3" % (new_event_type, genome))
    if not os.path.isfile(new_gff_fname):
        print("Cannot find %s" % (new_gff_fname))
        return
    output_dir = os.path.join(output_dir, genome)
    utils.make_dir(output_dir)
    output_gff_fname = os.path.join(output_dir, gff_basename)
    print("Merging %s.." % (event_type))
    print("  - Old: %s" % (sg_gff_fname))
    print("  - New: %s" % (new_gff_fname))
    # Pick the merge routine from the event type prefix
    if event_type.startswith("SE"):
        merge_func = merge_se
    elif event_type.startswith("MXE"):
        merge_func = merge_mxe
    elif event_type.startswith("A5SS"):
        merge_func = merge_a5ss
    elif event_type.startswith("A3SS"):
        merge_func = merge_a3ss
    elif event_type.startswith("RI"):
        merge_func = merge_ri
    else:
        raise Exception("Unrecognized event type %s" % (event_type))
    # Make merge operation
    merge_func(sg_gff_fname, new_gff_fname, output_gff_fname, genome)
예제 #44
0
def download_genome_seq(genome,
                        output_dir):
    """
    Download genome sequence files from UCSC.

    Passes <UCSC_GOLDENPATH_FTP>/<genome>/chromosomes/* to
    download_utils.wget() from within <output_dir>/genome, deletes
    the *.fa.gz files whose basename contains an underscore (the
    random chromosome contigs) and gunzips the remaining *.gz files.

    If <output_dir>/genome already exists and contains files, the
    download is skipped.

    Note that this changes the working directory of the process to
    <output_dir>/genome.

    Returns None.

    Raises Exception if the gunzip command returns a non-zero status.
    """
    print("Downloading genome sequence files for %s" % (genome))
    print("  - Output dir: %s" % (output_dir))
    output_dir = os.path.join(output_dir, "genome")
    if os.path.isdir(output_dir) and os.listdir(output_dir):
        print("Directory %s exists and contains files; "
              "skipping download of genome..." % (output_dir))
        return None
    utils.make_dir(output_dir)
    # Change to output directory
    os.chdir(output_dir)
    ##
    ## Download the genome sequence files
    ##
    genome_url = "%s/%s/chromosomes/" % (UCSC_GOLDENPATH_FTP,
                                         genome)
    # Fetch all chromosome sequence files
    download_utils.wget(os.path.join(genome_url, "*"))
    # Remove random chromosome contigs
    for fname in glob.glob(os.path.join(output_dir, "*.fa.gz")):
        if "_" in os.path.basename(fname):
            print("Deleting: %s" % (fname))
            os.remove(fname)
    ##
    ## Uncompress the files
    ##
    print("Uncompressing files...")
    uncompress_cmd = "gunzip %s/*.gz" % (output_dir)
    t1 = time.time()
    ret_val = os.system(uncompress_cmd)
    if ret_val != 0:
        # Do not report success on a partially uncompressed genome
        raise Exception("Cannot uncompress files in %s" % (output_dir))
    t2 = time.time()
    print("Uncompressing took %.2f minutes" % ((t2 - t1) / 60.))
예제 #45
0
파일: MotifSet.py 프로젝트: 1eesh/rnaseqlib
 def find_motifs_homer(self, output_dir,
                       homer_kmer_lens=(4, 5, 6, 7, 8)):
     """
     Find motifs with Homer.

     Calls homer_utils.run_homer() twice, once with the experimental
     coordinates file (output in <output_dir>/homer_output/exp) and
     once with the control coordinates file (output in
     <output_dir>/homer_output/control).  Both runs receive the same
     parameters: "-rna" and "-len" set to the comma-separated kmer
     lengths.

     Parameters:
     -----------
     output_dir : directory in which 'homer_output' is created
     homer_kmer_lens : kmer lengths to pass as "-len"; the default is
       an immutable tuple so it cannot be altered between calls
     """
     output_dir = os.path.join(output_dir, "homer_output")
     utils.make_dir(output_dir)
     kmer_lens_str = ",".join(str(kmer_len) for kmer_len in homer_kmer_lens)
     params = {"-rna": "",
               "-len": kmer_lens_str}
     # Run on exp
     homer_utils.run_homer(self.logger,
                           self.exp_coords_fname,
                           self.genome,
                           os.path.join(output_dir, "exp"),
                           params)
     # Run on control
     homer_utils.run_homer(self.logger,
                           self.control_coords_fname,
                           self.genome,
                           os.path.join(output_dir, "control"),
                           params)
예제 #46
0
    def init_outdirs(self):
        """
        Set up the directory tree that the pipeline writes into.

        Each name in self.toplevel_dirs becomes a directory directly
        under self.output_dir, and its path is recorded in
        self.pipeline_outdirs.  Names that have entries in
        self.toplevel_subdirs also get those subdirectories; here
        "analysis" gets "rpkm" and "insert_lens".

        Intended layout:

        output_dir
          - rawdata: trimmed reads, etc.
          - mapping: mapped data files
          - qc: quality control output
          - analysis: analysis output
        """
        print("Initializing the pipeline output directories.")
        utils.make_dir(self.output_dir)
        # Toplevel directory name -> subdirectories to create inside it
        subdirs_by_toplevel = defaultdict(list)
        subdirs_by_toplevel["analysis"] = ["rpkm", "insert_lens"]
        self.toplevel_subdirs = subdirs_by_toplevel
        for toplevel_name in self.toplevel_dirs:
            toplevel_path = os.path.join(self.output_dir, toplevel_name)
            print(" - Creating: %s" % (toplevel_path))
            utils.make_dir(toplevel_path)
            self.pipeline_outdirs[toplevel_name] = toplevel_path
            for child_name in subdirs_by_toplevel[toplevel_name]:
                utils.make_dir(os.path.join(toplevel_path, child_name))
        # Shortcut to a commonly accessed directory
        analysis_dir = self.pipeline_outdirs["analysis"]
        self.rpkm_dir = os.path.join(analysis_dir, "rpkm")
예제 #47
0
def download_misc_seqs(genome, output_dir):
    """
    Download assorted sequences related to genome.

    The organism is chosen from the genome name ("hg*" is human,
    "mm*" is mouse); any other genome prints an error and exits with
    status 1.  For every (label, accession) pair in
    NCBI_MISC_SEQS[organism] with a non-None accession, the sequence
    is fetched with download_ncbi_fasta() into <output_dir>/ncbi and
    its first FASTA record is rewritten, under the header ">label",
    to <output_dir>/misc/<label>.fa.  Labels whose output file
    already exists are skipped.

    Parameters:
    -----------
    genome : genome name, e.g. one starting with "hg" or "mm"
    output_dir : directory in which 'ncbi' and 'misc' are created
    """
    # Mapping from sequence label (e.g. rRNA)
    # to accession numbers
    if genome.startswith("hg"):
        organism = "human"
    elif genome.startswith("mm"):
        organism = "mouse"
    else:
        print("Error: Unsupported genome.")
        sys.exit(1)
    # Fetch the accession numbers for the organism's
    # misc sequences and download them
    misc_seqs = NCBI_MISC_SEQS[organism]
    ncbi_outdir = os.path.join(output_dir, "ncbi")
    misc_outdir = os.path.join(output_dir, "misc")
    utils.make_dir(ncbi_outdir)
    utils.make_dir(misc_outdir)
    for seq_label, access_id in misc_seqs.items():
        if access_id is None:
            continue
        output_filename = os.path.join(misc_outdir, "%s.fa" % (seq_label))
        if os.path.isfile(output_filename):
            print("%s exists. Skipping download.." % (seq_label))
            continue
        print("Downloading: %s (NCBI: %s)" % (seq_label,
                                              access_id))
        url_filename = download_ncbi_fasta(access_id, ncbi_outdir)
        fasta_in = fasta_utils.read_fasta(url_filename)
        # Fetch first FASTA record; its original label is discarded
        _, fasta_seq = next(fasta_in)
        # Output it with the required label
        new_rec = (">%s" % (seq_label), fasta_seq)
        print("  - Writing to: %s" % (output_filename))
        # Open the output only once there is a record to write, so a
        # failed download cannot leave behind an empty file that
        # later runs would mistake for a finished one
        with open(output_filename, "w") as fasta_out:
            fasta_utils.write_fasta(fasta_out, [new_rec])
예제 #48
0
 def build_indices(self):
     """
     Build relevant genome indices for use with
     Bowtie/Tophat.

     Does nothing when self.with_index is false or when
     self.get_bowtie_index_fasta_files() returns no files.
     Otherwise sets self.indices_dir to <self.output_dir>/indices
     and, unless Bowtie 1 (*.ebwt) or Bowtie 2 (*.bt2) index files
     whose names start with self.genome are already there, runs
     bowtie-build on the FASTA files with self.genome as the index
     basename.

     Note that this changes the working directory of the process to
     the indices directory before running bowtie-build.

     Returns None.
     """
     if not self.with_index:
         print("Not building indices.")
         return
     print("Building indices..")
     fasta_files = self.get_bowtie_index_fasta_files()
     num_files = len(fasta_files)
     if num_files == 0:
         print("WARNING: No FASTA files to build index from.")
         return
     self.indices_dir = os.path.join(self.output_dir, "indices")
     utils.make_dir(self.indices_dir)
     ##
     ## Check if the Bowtie index is already present, if so skip
     ##
     indices = []
     # Bowtie 1 indices end in .ebwt, Bowtie 2 indices in .bt2; both
     # patterns must be formatted with the genome name
     for index_ext in ("ebwt", "bt2"):
         index_pattern = os.path.join(self.indices_dir,
                                      "%s*.%s" % (self.genome, index_ext))
         indices.extend(glob.glob(index_pattern))
     if indices:
         print("Found Bowtie index files in %s. Skipping index build.."
               % (self.indices_dir))
         return
     print("Building Bowtie index from %d files" % (num_files))
     for fasta_fname in fasta_files:
         print(" - %s" % (os.path.basename(fasta_fname)))
     fasta_str = ",".join(fasta_files)
     # Change to indices directory
     os.chdir(self.indices_dir)
     # Use the genome as basename for the bowtie index
     bowtie_build_cmd = "bowtie-build %s %s" % (fasta_str, self.genome)
     t1 = time.time()
     ret_val = os.system(bowtie_build_cmd)
     t2 = time.time()
     if ret_val != 0:
         # Report the failure instead of silently claiming success
         print("WARNING: bowtie-build returned non-zero status %d"
               % (ret_val))
     print("Bowtie build took %.2f minutes" % ((t2 - t1) / 60.))
예제 #49
0
def jf_count_kmers(fastx_fname, kmer_len, output_dir, hash_size=100000000):
    """
    Count kmers using jellyfish.

    Builds a jellyfish database for the input file at
    <output_dir>/jf_counts/<basename of fastx_fname>.jf with
    '<jf_path> count', hands it to jf_merge(), and writes the merged
    counts to '<database>_counts' with '<jf_path> dump'.  Existing
    files with those names are removed before counting.

    Parameters:
    -----------
    fastx_fname : input sequence file; if it does not exist an error
      is printed and the process exits with status 1
    kmer_len : kmer length (jellyfish -m option)
    output_dir : directory in which the 'jf_counts' directory is made
    hash_size : hash size (jellyfish -s option)

    Returns the filename of the dumped counts.

    Raises Exception if the jellyfish count or dump command returns
    a non-zero status.
    """
    if not os.path.isfile(fastx_fname):
        print("Error: fastx file %s not found." % (fastx_fname))
        sys.exit(1)
    output_dir = os.path.join(output_dir, "jf_counts")
    utils.make_dir(output_dir)
    db_fname = "%s.jf" % (os.path.join(output_dir,
                                       os.path.basename(fastx_fname)))
    output_fname = "%s_counts" % (db_fname)
    # Start from a clean slate: drop results of any earlier run
    for old_fname in (db_fname, output_fname):
        if os.path.isfile(old_fname):
            os.remove(old_fname)
    count_cmd = "%s count -m %d -o %s -s %d %s" \
        % (jf_path,
           kmer_len,
           db_fname,
           hash_size,
           fastx_fname)
    ret_val = os.system(count_cmd)
    if ret_val != 0:
        raise Exception("jellyfish count call failed.")
    # Merge db results
    merged_fname = jf_merge(db_fname)
    # Dump the merged kmer counts
    dump_cmd = "%s dump -o %s %s" \
        % (jf_path,
           output_fname,
           merged_fname)
    ret_val = os.system(dump_cmd)
    if ret_val != 0:
        # Without this check a failed dump would still hand back the
        # name of a counts file that does not exist
        raise Exception("jellyfish dump call failed.")
    return output_fname
예제 #50
0
def launchJob(cmd,
              job_name,
              scriptOptions,
              output_dir,
              verbose=False,
              test=False,
              ppn="4",
              queue_type="normal"):
    """
    Submits a job on the cluster which will run command 'cmd',
    with options 'scriptOptions'

    The job script is piped to 'bsub' on standard input.  Its output
    file is <outdir>/cluster_scripts/<jobname>.<pid>.out.

    Parameters:
    cmd: command to run; a string, or a list/tuple of strings that
         is joined with spaces
    job_name: default for the "jobname" script option
    scriptOptions: dict of script options.  It is modified in place:
         missing "workingdir", "ppn", "scriptuser", "jobname" and
         "outdir" entries are filled in, and "command" and "outf"
         are overwritten.
    output_dir: default for the "outdir" script option
    ppn: default for the "ppn" script option
    queue_type: currently unused (the queue option is disabled)

    Optionally:
    verbose: output the job script
    test: don't actually submit the job script
          (usually used in conjunction with verbose)

    Returns a job ID if the job was submitted properly, None if
    'test' is set.  Raises Exception if bsub does not report a
    submitted job.
    """
    # isinstance also accepts list/tuple subclasses as sequences
    if not isinstance(cmd, (list, tuple)):
        cmd = [cmd]

    scriptOptions.setdefault("workingdir", os.getcwd())
    scriptOptions.setdefault("ppn", str(ppn))
    scriptOptions.setdefault("scriptuser", getpass.getuser())
    scriptOptions.setdefault("jobname", job_name)
    # remove queue name option
    #scriptOptions.setdefault("queue", queue_type)
    scriptOptions.setdefault("outdir", output_dir)

    scriptOptions["command"] = " ".join(cmd)

    if verbose:
        print("==SUBMITTING TO CLUSTER==")
        print(cmd)
        print(scriptOptions)

    pid = os.getpid()
    outscriptName = "%s.%i" % (scriptOptions["jobname"], pid)

    script_outdir = os.path.join(scriptOptions["outdir"], "cluster_scripts")
    utils.make_dir(script_outdir)
    scriptOptions["outf"] = \
        os.path.abspath(os.path.join(script_outdir,
                                     outscriptName + ".out"))
    outtext = """#!/bin/sh

    #BSUB -n %(ppn)s 
    #BSUB -R "rusage[mem=800]"
    #BSUB -o %(outf)s 
    #BSUB -J %(jobname)s

    echo Working directory is %(workingdir)s
    cd %(workingdir)s

    echo "%(command)s"
    %(command)s
    echo "===== %(command)s finished =====" """ % scriptOptions

    if verbose:
        print(outscriptName)

    call = "bsub "

    if not test:
        try:
            qsub = subprocess.Popen(call,
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdin=subprocess.PIPE)
            print("Executing:  %s" % (scriptOptions["command"],))
            # Let communicate() feed the script to bsub: it writes
            # stdin and drains stdout/stderr together, which avoids
            # blocking on a full pipe
            output = qsub.communicate(outtext)
            if "is submitted to" in output[0]:
                # The job ID is the second word of bsub's reply,
                # wrapped in one delimiter character on each side
                jobID = int(output[0].strip().split()[1][1:-1])
                print("Process launched with job ID: %s" % (jobID,))
                return jobID
            else:
                raise Exception("Failed to launch job '%s': %s"
                                % (outscriptName, str(output)))
        except Exception:
            print("failing...")
            raise
    return None
예제 #51
0
def download_genome_seq(genome,
                        output_dir):
    """
    Fetch the chromosome sequences of a genome from UCSC and build a
    single indexed FASTA file out of them.

    Works inside <output_dir>/genome (and changes the working
    directory of the process to it):

    (1) If that directory is empty, passes
        <UCSC_GOLDENPATH_FTP>/<genome>/chromosomes/* to
        download_utils.wget(), deletes the *.fa.gz files whose
        basename contains an underscore and gunzips the rest.
        Otherwise the download is skipped.

    (2) If <genome>.fa does not exist there yet, concatenates all
        *.fa files into it and indexes it with 'samtools faidx'.

    Exits with status 1 if uncompressing, concatenating or indexing
    returns a non-zero status.
    """
    print("Downloading genome sequence files for %s" % (genome))
    print("  - Output dir: %s" % (output_dir))
    output_dir = utils.pathify(os.path.join(output_dir, "genome"))
    utils.make_dir(output_dir)
    existing_files = os.listdir(output_dir)
    # All of the work below happens inside the genome directory
    os.chdir(output_dir)
    genome_url = "%s/%s/chromosomes/" % (UCSC_GOLDENPATH_FTP, genome)
    if len(existing_files) < 1:
        # Nothing here yet: fetch all chromosome sequence files
        download_utils.wget(os.path.join(genome_url, "*"))
        # Drop the random chromosome contigs
        for gz_fname in glob.glob(os.path.join(output_dir, "*.fa.gz")):
            if "_" in os.path.basename(gz_fname):
                print("Deleting: %s" % (gz_fname))
                os.remove(gz_fname)
        print("Uncompressing files...")
        uncompress_cmd = "gunzip %s/*.gz" % (output_dir)
        print("  - Uncompress cmd: %s" % (uncompress_cmd))
        start_time = time.time()
        if os.system(uncompress_cmd) != 0:
            print("Error: Cannot uncompress files in %s" % (output_dir))
            sys.exit(1)
        end_time = time.time()
        print("Uncompressing took %.2f minutes"
              % ((end_time - start_time) / 60.))
    else:
        print("Directory %s exists and contains files; skipping download of genome..."
              % (output_dir))
    # The per-chromosome files are combined into one genome FASTA
    genome_output_fname = os.path.join(output_dir, "%s.fa" % (genome))
    if os.path.isfile(genome_output_fname):
        # Already concatenated (and indexed) by an earlier run
        return
    print("Concatenating genome chromosomes into one file...")
    print("  - Output file: %s" % (genome_output_fname))
    start_time = time.time()
    concat_chrom_cmd = "cat %s/*.fa > %s" % (output_dir,
                                             genome_output_fname)
    print("  - Concat cmd: %s" % (concat_chrom_cmd))
    if os.system(concat_chrom_cmd) != 0:
        print("Error: Could not concatenate genome chromosomes.")
        sys.exit(1)
    # Index the combined genome file
    print("Indexing genome file...")
    samtools_index_cmd = "samtools faidx %s" % (genome_output_fname)
    print("  - Index cmd: %s" % (samtools_index_cmd))
    if os.system(samtools_index_cmd) != 0:
        print("Error: Could not index genome file.")
        sys.exit(1)
    end_time = time.time()
    print("Concatenation and indexing took %.2f minutes"
          % ((end_time - start_time) / 60.))
예제 #52
0
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene",
                                           "mRNA",
                                           "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences.

    (2) FASTA file with the actual sequences.

    If asked, fetch the flanking intronic sequences.

    Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

             U           D

    [ U P ]-----[ S E ]-----[ D N ]

            a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             a < b

       c, d: positive ints, position relative to 3' splice site of SE
             c < d

    Parameters:
    -----------
    gff_fname : GFF file with the events
    fasta_fname : passed on to output_fasta_seqs_from_gff() together
      with the GFF of coordinates and the output FASTA filename
    output_dir : output directory; the GFF of coordinates goes into
      its 'gff_coords' subdirectory
    with_flanking_introns : if True, also write records for the
      up/downstream introns of each event
    flanking_introns_coords : optional sequence (a, b, c, d), see above
    overwrite : if False and the output FASTA exists, skip the work
    entries_to_include : not used by this function

    Returns the filename of the output FASTA file.

    Raises Exception if flanking introns are requested but their
    coordinates cannot be found for a gene.
    """
    # Load GFF genes
    # NOTE(review): gff_db is never read below; the call is kept in
    # case constructing the database has effects callers rely on.
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_fname,
                                        reverse_recs=True)
    file_basename = re.sub(r"\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" % (file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            % (output_basename,
               flanking_introns_coords[0],
               flanking_introns_coords[1],
               flanking_introns_coords[2],
               flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir, "%s.gff" % (output_basename))
    fasta_output_fname = os.path.join(output_dir, "%s.fa" % (output_basename))
    if not overwrite and os.path.isfile(fasta_output_fname):
        print("Output file %s exists. Skipping..." % (fasta_output_fname))
        return fasta_output_fname
    print("Outputting GFF coordinates to: %s" % (gff_output_fname))
    if os.path.isfile(gff_output_fname):
        print("  - Overwriting existing file")
    print("Outputting sequences to: %s" % (fasta_output_fname))
    if os.path.isfile(fasta_output_fname):
        print("  - Overwriting existing file")
    genes = gene_utils.load_genes_from_gff(gff_fname)
    gff_out_file = open(gff_output_fname, "w")
    # Make sure the GFF is closed even if a gene cannot be processed
    try:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
            # Skip genes without event records; this has to be
            # checked before anything is looked up in event_recs
            if event_recs is None:
                continue
            # GFF records to write for the current gene:
            # the up, se, and dn exons
            recs_to_write = [event_recs["up_exon"]["record"],
                             event_recs["se_exon"]["record"],
                             event_recs["dn_exon"]["record"]]
            if with_flanking_introns:
                long_mRNA_id = event_recs["long_mRNA"].get_id()
                introns_coords = \
                    get_flanking_introns_coords(gene_obj)
                if introns_coords is None:
                    raise Exception(
                        "Cannot find flanking introns coordinates.")
                # Upstream intron coordinates
                up_intron_start, up_intron_end = \
                    introns_coords["up_intron"]
                up_intron_len = up_intron_end - up_intron_start + 1
                # Downstream intron coordinates
                dn_intron_start, dn_intron_end = \
                    introns_coords["dn_intron"]
                dn_intron_len = dn_intron_end - dn_intron_start + 1
                # If given custom coordinates, use them instead of
                # entire up/down flanking intronic coordinates.
                se_exon_rec = event_recs["se_exon"]["record"]
                if flanking_introns_coords is not None:
                    a, b = \
                        int(flanking_introns_coords[0]), \
                        int(flanking_introns_coords[1])
                    c, d = \
                        int(flanking_introns_coords[2]), \
                        int(flanking_introns_coords[3])
                    a, b, c, d = \
                        error_check_intronic_coords(a, b, c, d,
                                                    up_intron_len,
                                                    dn_intron_len)
                    # a and b are offsets from the start of the SE
                    # exon record, c and d from its end
                    up_intron_start = se_exon_rec.start + a
                    up_intron_end = se_exon_rec.start + b
                    dn_intron_start = se_exon_rec.end + c
                    dn_intron_end = se_exon_rec.end + d
                # Make GFF records for up/dn intronic sequences
                chrom = se_exon_rec.seqid
                source = se_exon_rec.source
                strand = se_exon_rec.strand
                up_intron_str = "%s.up_intron" % (long_mRNA_id)
                up_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       up_intron_start, up_intron_end,
                                       strand=strand,
                                       attributes={"ID": [up_intron_str],
                                                   "Parent": [gene_obj.label]})
                dn_intron_str = "%s.dn_intron" % (long_mRNA_id)
                dn_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       dn_intron_start, dn_intron_end,
                                       strand=strand,
                                       attributes={"ID": [dn_intron_str],
                                                   "Parent": [gene_obj.label]})
                recs_to_write.append(up_intron_rec)
                recs_to_write.append(dn_intron_rec)
            # Write out records to GFF
            for rec in recs_to_write:
                gff_out.write(rec)
    finally:
        gff_out_file.close()
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname
예제 #53
0
def filter_comparisons(fname, output_dir,
                       event_type=None,
                       atleast_inc=None,
                       atleast_exc=None,
                       atleast_sum=None,
                       gene_table=None,
                       gene_id_cols=["ensg_id", "gsymbol"],
                       dry_run=False):
    """
    Filter a MISO comparison file (*.miso_bf).

    The visible part of this function prepares the filtering:
    it creates the output directory (<output_dir>/<event_type> when
    an event type is given), resolves the read count thresholds and
    checks the input file.

    Parameters:
    -----------
    fname : MISO comparisons file; the process exits with status 1
      if it does not exist, and a warning is printed if its name
      does not end in ".miso_bf"
    output_dir : output directory
    event_type : event type label (or None).  It selects the default
      thresholds by substring: "UTR", "SE", "AFE", "ALE" and "RI" use
      the corresponding module-level *_atleast_* settings; any
      other value, including None, uses 0.
    atleast_inc, atleast_exc, atleast_sum : read count thresholds;
      when None, the default for the event type is used
    gene_table, gene_id_cols, dry_run : not used in the part of the
      function shown here
    """
    fname = utils.pathify(fname)
    output_dir = utils.pathify(output_dir)
    print("Filtering MISO comparisons file...")
    print("  - MISO comparisons: %s" % (fname))
    print("  - Event type: %s" % (event_type))
    if event_type is not None:
        output_dir = os.path.join(output_dir, event_type)
    utils.make_dir(output_dir)
    print(" - Output dir: %s" % (output_dir))
    # event_type defaults to None, and a substring test against None
    # raises TypeError; an empty label matches no event type and so
    # falls through to the zero defaults
    event_label = event_type if event_type is not None else ""
    if "UTR" in event_label:
        def_atleast_inc = tandemutr_atleast_inc
        def_atleast_exc = tandemutr_atleast_exc
        def_atleast_sum = tandemutr_atleast_sum
    elif "SE" in event_label:
        def_atleast_inc = se_atleast_inc
        def_atleast_exc = se_atleast_exc
        def_atleast_sum = se_atleast_sum
    elif "AFE" in event_label:
        def_atleast_inc = afe_atleast_inc
        def_atleast_exc = afe_atleast_exc
        def_atleast_sum = afe_atleast_sum
    elif "ALE" in event_label:
        def_atleast_inc = ale_atleast_inc
        def_atleast_exc = ale_atleast_exc
        def_atleast_sum = ale_atleast_sum
    elif "RI" in event_label:
        def_atleast_inc = ri_atleast_inc
        def_atleast_exc = ri_atleast_exc
        def_atleast_sum = ri_atleast_sum
    else:
        def_atleast_inc = 0
        def_atleast_exc = 0
        def_atleast_sum = 0
    # If read count filters are not given, use the default
    if atleast_inc is None:
        atleast_inc = def_atleast_inc
    if atleast_exc is None:
        atleast_exc = def_atleast_exc
    if atleast_sum is None:
        atleast_sum = def_atleast_sum
    # Filter the events file
    if not os.path.isfile(fname):
        print("Error: Cannot find MISO comparisons file %s" % (fname))
        sys.exit(1)
    if not fname.endswith(".miso_bf"):
        print("Warning: MISO comparisons file %s does not end in "
              ".miso_bf.  Are you sure it is a comparisons file?"
              % (fname))
    # Filter comparisons
    # ...
    filtered_df = None
            comparison_counts = \
                self.load_comparisons_counts_from_df(comparisons_df[event_type])
            # Get counts for each read class for sample 1 and sample 2
            comparison_counts = \
                miso_utils.get_counts_by_class("sample1_counts_int",
                                               "sample1",
                                               comparison_counts)
            comparison_counts = \
                miso_utils.get_counts_by_class("sample2_counts_int",
                                               "sample2",
                                               comparison_counts)
            filtered_df = comparison_counts
            # Filter exclusion reads
            # Only apply this to events other than TandemUTRs!
            if "TandemUTR" in event_type:
                atleast_exc = 0
                atleast_const = 5
            # Filter inclusion reads
            filtered_df = \
                filtered_df[filtered_df["sample1_inc_counts"] \
                            | filtered_df["sample2_inc_counts"] \
                            >= atleast_inc]
            # Filter exclusion reads
            filtered_df = \
                filtered_df[filtered_df["sample1_exc_counts"] \
                            | filtered_df["sample2_exc_counts"] \
                            >= atleast_exc]
            # Filter the sum of inclusion and exclusion reads
            sample1_sum = \
                filtered_df["sample1_inc_counts"] + \
                filtered_df["sample1_exc_counts"]
            sample2_sum = \
                filtered_df["sample2_inc_counts"] + \
                filtered_df["sample2_exc_counts"]
            filtered_df = \
                filtered_df[sample1_sum | sample2_sum >= atleast_sum]
            # Filter constitutive reads
            filtered_df = \
                filtered_df[filtered_df["sample1_const_counts"] \
                            | filtered_df["sample2_const_counts"] \
                            >= atleast_const]
            self.filtered_events[event_type] = filtered_df
예제 #54
0
    ],
    "hg19": ["SE", "TandemUTR", "A3SS", "A5SS", "ALE", "AFE", "MXE", "RI"]
}

# Gene tables indexed by genome
gene_tables = {
    "mm9": "/home/yarden/jaen/pipeline_init/mm9/ucsc/",
    "mm10": "/home/yarden/jaen/pipeline_init/mm9/ucsc/",
    "hg18": "/home/yarden/jaen/pipeline_init/hg18/ucsc/",
    "hg19": "/home/yarden/jaen/pipeline_init/hg19/ucsc/"
}

intersect_events = "intersect_events.py"
events_dir = "/home/yarden/jaen/gff-events"
events_outdir = os.path.join(events_dir, "annotated_events")
utils.make_dir(events_outdir)

for genome, events in genomes_to_events.iteritems():
    print "Processing genome %s" % (genome)
    curr_outdir = os.path.join(events_outdir, genome)
    print "  - Output dir: %s" % (curr_outdir)
    for event in events:
        if ("AceView" in event) or ("3pseq" in event):
            continue
        print "Intersecting %s.." % (event)
        events_fname = os.path.join(events_dir, genome,
                                    "%s.%s.gff3" % (event, genome))
        if not os.path.isfile(events_fname):
            raise Exception, "%s does not exist." % (events_fname)
        print "  - Events file: %s" % (events_fname)
        cmd = "%s --intersect %s %s --output-dir %s" \
예제 #55
0
 def load_settings(self):
     """
     Load settings for misowrap.

     Reads the settings file self.settings_filename with
     misowrap_settings.load_misowrap_settings() and copies its
     values onto this object: read and overhang lengths, MISO
     paths, BAM files, sample labels, comparison groups, cluster
     options and the prefilter flag.  Also creates the
     'misowrap_logs' directory under self.output_dir, sets up the
     logger, loads the event types and derives the paths of the
     MISO commands and of the gene tables.

     Exits with status 1 if the gene tables directory or the
     constitutive exons GFF does not exist.
     """
     settings_info, _parsed_settings = \
         misowrap_settings.load_misowrap_settings(self.settings_filename)
     self.settings_info = settings_info
     settings = self.settings_info["settings"]
     # Load basic settings about data
     self.read_len = settings["readlen"]
     self.overhang_len = settings["overhanglen"]
     self.miso_bin_dir = utils.pathify(settings["miso_bin_dir"])
     self.miso_settings_filename = \
         utils.pathify(settings["miso_settings_filename"])
     self.miso_events_dir = utils.pathify(settings["miso_events_dir"])
     self.miso_outdir = utils.pathify(settings["miso_output_dir"])
     # Load data-related parameters
     data = self.settings_info["data"]
     self.bam_files = data["bam_files"]
     if "insert_lens_dir" in data:
         self.insert_lens_dir = utils.pathify(data["insert_lens_dir"])
     # Sample labels
     self.sample_labels = data["sample_labels"]
     # Set output directories
     self.comparisons_dir = os.path.join(self.output_dir, "comparisons")
     self.comparison_groups = data["comparison_groups"]
     self.logs_outdir = os.path.join(self.output_dir, "misowrap_logs")
     # Create necessary directories
     utils.make_dir(self.logs_outdir)
     if "cluster_type" in settings:
         self.use_cluster = True
         self.cluster_type = settings["cluster_type"]
         self.chunk_jobs = settings["chunk_jobs"]
     # NOTE(review): use_cluster is only assigned above when a
     # cluster type is configured; presumably it is given a default
     # elsewhere (e.g. in __init__) - confirm.
     if self.use_cluster:
         print("Loading cluster information.")
         # Load cluster object if given a cluster type
         self.load_cluster()
     # Create a logger object
     if self.logger_label is None:
         self.logger_label = "misowrap"
     else:
         # The label is an attribute of this object; there is no
         # local variable of that name in this method
         self.logger_label = "misowrap_%s" % (self.logger_label)
     self.logger = utils.get_logger(self.logger_label, self.logs_outdir)
     # Whether to prefilter MISO events
     # Set general default settings
     if "prefilter_miso" not in settings:
         # By default, set it so that MISO events are not
         # prefiltered
         settings["prefilter_miso"] = False
     self.prefilter_miso = settings["prefilter_miso"]
     # Load event types
     self.load_event_types()
     # Set path to MISO scripts
     self.compare_miso_cmd = os.path.join(self.miso_bin_dir, "compare_miso")
     self.summarize_miso_cmd = os.path.join(self.miso_bin_dir,
                                            "summarize_miso")
     self.run_events_cmd = os.path.join(self.miso_bin_dir, "miso")
     self.pe_utils_cmd = os.path.join(self.miso_bin_dir, "pe_utils")
     # Files related to gene tables
     self.tables_dir = \
         os.path.join(self.settings_info["pipeline-files"]["init_dir"],
                      "ucsc")
     if not os.path.isdir(self.tables_dir):
         print("Error: %s directory does not exist."
               % (self.tables_dir))
         sys.exit(1)
     self.const_exons_gff = os.path.join(self.tables_dir, "exons",
                                         "const_exons",
                                         "ensGene.const_exons.gff")
     if not os.path.isfile(self.const_exons_gff):
         print("Error: Const. exons GFF %s does not exist."
               % (self.const_exons_gff))
         sys.exit(1)