예제 #1
0
def main(args):
    """Count RiboSeq reads for every transcript and build the HTML/zip report.

    For each transcript in the transcriptome FASTA that has reads in the
    RiboSeq BAM file, a CSV file with per-position read counts (one column
    per frame) is written. If at least one transcript produced counts, the
    CSV files, an ``index.html`` rendered from a packaged template and the
    packaged ``css``/``js`` assets are archived as ``ribocount_output.zip``
    inside ``output_path``, and a summary page is written to ``html_file``.

    Args:
        args: Parsed argument namespace. The attributes read here are
            ``ribo_file``, ``transcriptome_fasta``, ``read_lengths``,
            ``read_offsets``, ``count_five``, ``count_three``,
            ``output_path`` and ``html_file``.

    Raises:
        Any exception raised by the ``ribocore`` validation helpers or by
        file system operations propagates to the caller unchanged.
    """
    ribo_file = args.ribo_file
    transcriptome_fasta = args.transcriptome_fasta
    read_lengths = args.read_lengths
    read_offsets = args.read_offsets
    count_five = args.count_five
    count_three = args.count_three
    output_path = args.output_path
    html_file = args.html_file

    log.debug('Supplied arguments\n{}'.format(
        '\n'.join(['{:<20}: {}'.format(k, v) for k, v in vars(args).items()])))

    # error messages (simple format) are written to html file
    fh = logging.FileHandler(html_file)
    fh.setLevel(logging.ERROR)
    fh.setFormatter(ErrorLogFormatter('%(message)s'))
    log.addHandler(fh)

    log.info('Checking if required arguments are valid...')
    ribocore.check_required_arguments(ribo_file=ribo_file, transcriptome_fasta=transcriptome_fasta)

    log.info('Checking read lengths...')
    ribocore.check_read_lengths(ribo_file=ribo_file, read_lengths=read_lengths)
    log.info('Done')

    log.info('Checking read offsets...')
    ribocore.check_read_offsets(read_offsets=read_offsets)
    log.info('Done')

    log.info('Checking if each read length has a corresponding offset...')
    ribocore.check_read_lengths_offsets(read_lengths=read_lengths, read_offsets=read_offsets)
    log.info('Done')

    # 5' takes precedence over 3' when both flags are set.
    prime = None
    if count_five:
        log.info('Only 5\' read counts requested')
        prime = '5'
    elif count_three:
        log.info('Only 3\' read counts requested')
        prime = '3'

    # zip_dir contents will be written here and a zip archive will be created
    # from this directory. makedirs also creates output_path (and any missing
    # parents), which a chain of os.mkdir calls could not do for nested paths.
    zip_dir = os.path.join(output_path, 'ribocount_output')
    csv_dir = os.path.join(zip_dir, 'csv')
    if not os.path.isdir(csv_dir):
        os.makedirs(csv_dir)

    # Total valid transcript count (ones with reads)
    count = 0
    # HTML table rows, joined once at the end instead of repeated string +=
    table_rows = []

    with ribocore.open_pysam_file(fname=ribo_file, ftype='bam') as bam, \
            ribocore.open_pysam_file(fname=transcriptome_fasta, ftype='fasta') as fasta:
        log.info('Get RiboSeq read counts for all transcripts in FASTA')
        for transcript in fasta.references:
            ribo_counts, ribo_reads = ribocore.get_ribo_counts(
                ribo_fileobj=bam, transcript_name=transcript,
                read_lengths=read_lengths, read_offsets=read_offsets)
            if not ribo_reads:  # no reads for this transcript. skip.
                continue

            transcript_sequence = fasta[transcript]
            # By default, all counts will be written (ribo_counts)
            # If 5' or 3' counts requested, filter and use
            # those counts for printing instead
            write_counts = ribo_counts
            region_reads = None
            log.debug('Total read counts {}'.format(ribo_reads))

            # find longest ORF and filter counts based on whether 5' or 3' is
            # requested
            if prime:
                # use default start and stop codons and find ORFs in all 3
                # frames (+)
                orfs = ribocore.get_three_frame_orfs(sequence=transcript_sequence)
                if not len(orfs):
                    log.debug('No ORFs for transcript {0}'.format(transcript))
                    continue
                longest_orf = ribocore.get_longest_orf(orfs=orfs)
                orf_start, orf_stop = longest_orf['start'], longest_orf['stop']
                log.info('Transcript: {0} Longest ORF Start: {1}, Stop: {2}'.format(
                    transcript, orf_start, orf_stop))

                if count_five:
                    write_counts, region_reads = ribocore.filter_ribo_counts(
                        counts=ribo_counts, orf_start=orf_start)
                    log.debug('5\' region read counts: {}'.format(region_reads))
                else:
                    write_counts, region_reads = ribocore.filter_ribo_counts(
                        counts=ribo_counts, orf_stop=orf_stop)
                    log.debug('3\' region read counts: {}'.format(region_reads))

            if not len(write_counts):
                # no counts for transcript
                continue

            log.debug('Writing counts to CSV file for transcript {}'.format(transcript))
            count += 1
            csv_file = 'RiboCounts{}.csv'.format(count)
            with open(os.path.join(csv_dir, csv_file), 'w') as cw:
                cw.write('"Position","Nucleotide","Frame 1","Frame 2","Frame 3"\n')
                # positions in the CSV are 1-based
                for pos in range(1, len(transcript_sequence) + 1):
                    nucleotide = transcript_sequence[pos - 1]
                    if pos in write_counts:
                        frames = write_counts[pos]
                        cw.write('{0},{1},{2},{3},{4}\n'.format(
                            pos, nucleotide, frames[1], frames[2], frames[3]))
                    else:
                        cw.write('{0},{1},{2},{3},{4}\n'.format(pos, nucleotide, 0, 0, 0))

            # HTML table
            row = '<tr><td>{0}</td><td>{1}</td>'.format(transcript, ribo_reads)
            if prime:
                row += '<td>{0}</td>'.format(region_reads)
            row += '<td><a href="csv/{0}">{0}</a></td></tr>'.format(csv_file)
            table_rows.append(row)

    table_body = ''.join(table_rows) + '</tbody>'

    # only for display in HTML
    valid_lengths = ['{}'.format(item) for item in read_lengths]
    if len(valid_lengths) == 1 and valid_lengths[0] == '0':
        valid_lengths = ['All']
    lengths_display = ', '.join(valid_lengths)

    if not count:
        if valid_lengths:
            log.info('No transcripts found for read lengths: {}'.format(lengths_display))
        else:
            log.info('No transcripts found')
    else:
        template = 'ribocount_prime.html' if prime else 'ribocount.html'
        with open(os.path.join(CONFIG.PKG_DATA_DIR, template)) as src, \
                open(os.path.join(zip_dir, 'index.html'), 'w') as dst:
            dst.write(src.read().format(count=count, length=lengths_display,
                                        prime=prime, table_body=table_body))

        for asset in ('css', 'js'):
            asset_dir = os.path.join(zip_dir, asset)
            if not os.path.exists(asset_dir):
                os.mkdir(asset_dir)
            asset_data_dir = os.path.join(CONFIG.PKG_DATA_DIR, asset)
            for fname in os.listdir(asset_data_dir):
                shutil.copy(os.path.join(asset_data_dir, fname),
                            os.path.join(asset_dir, fname))

        log.info('Creating zip file')
        # Build the archive from explicit paths rather than chdir()-ing into
        # output_path and back with '../': that sequence does not restore the
        # original working directory, so a relative html_file would be
        # written somewhere other than where the error log handler above
        # opened it. Member names stay relative to output_path
        # ('ribocount_output/...'), exactly as before.
        zip_path = os.path.join(output_path, 'ribocount_output.zip')
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for root, _dirs, files in os.walk(zip_dir):
                for name in files:
                    member = os.path.join(root, name)
                    zipf.write(member, os.path.relpath(member, output_path))
        shutil.rmtree(zip_dir)
        log.debug('Writing HTML report')

        with open(os.path.join(CONFIG.PKG_DATA_DIR, 'ribocount_index.html')) as src, \
                open(html_file, 'w') as dst:
            dst.write(src.read().format(count=count, read_length=lengths_display))
    log.info('Finished')
예제 #2
0
def main(args):
    """Write read counts and generate the RiboSeq plot for one transcript.

    Validates the supplied arguments, fetches the transcript sequence from
    the transcriptome FASTA, collects RiboSeq read counts (and RNA-Seq counts
    when ``rna_file`` is given), writes ``RiboCounts.csv`` into
    ``output_path`` and finally calls ``plot_profile``.

    Args:
        args: Parsed argument namespace. The attributes read here are
            ``ribo_file``, ``rna_file``, ``transcript_name``,
            ``transcriptome_fasta``, ``read_lengths``, ``read_offsets``,
            ``output_path``, ``html_file`` and ``color_scheme``.

    Raises:
        ribocore.RiboPlotError: If the transcript has no RiboSeq read counts.
        OSError: Re-raised (after being logged) when ``get_rna_counts`` fails.
    """
    ribo_file = args.ribo_file
    rna_file = args.rna_file
    transcript_name = args.transcript_name
    transcriptome_fasta = args.transcriptome_fasta
    read_lengths = args.read_lengths
    read_offsets = args.read_offsets
    output_path = args.output_path
    html_file = args.html_file

    # error messages (simple format) are written to html file
    fh = logging.FileHandler(html_file)
    fh.setLevel(logging.ERROR)
    fh.setFormatter(ErrorLogFormatter("%(message)s"))
    log.addHandler(fh)

    log.debug("Supplied arguments\n{}".format("\n".join(["{:<20}: {}".format(k, v) for k, v in vars(args).items()])))
    log.info("Checking if required arguments are valid...")
    ribocore.check_required_arguments(
        ribo_file=ribo_file, transcriptome_fasta=transcriptome_fasta, transcript_name=transcript_name
    )
    log.info("Done")

    if rna_file:
        log.info("Checking if RNA-Seq file is valid...")
        ribocore.check_rna_file(rna_file=rna_file)
        log.info("Done")

    log.info("Checking read lengths...")
    ribocore.check_read_lengths(ribo_file=ribo_file, read_lengths=read_lengths)
    log.info("Done")

    log.info("Checking read offsets...")
    ribocore.check_read_offsets(read_offsets=read_offsets)
    log.info("Done")

    log.info("Checking if each read length has a corresponding offset")
    ribocore.check_read_lengths_offsets(read_lengths=read_lengths, read_offsets=read_offsets)
    log.info("Done")

    log.info("Get sequence and length of the given transcript from FASTA file...")
    record = ribocore.get_fasta_record(transcriptome_fasta, transcript_name)
    transcript_sequence = record[transcript_name]
    transcript_length = len(transcript_sequence)

    log.info("Get ribo-seq read counts and total reads in Ribo-Seq...")
    with ribocore.open_pysam_file(fname=ribo_file, ftype="bam") as bam_fileobj:
        # the total read count is returned as well but is not needed here
        ribo_counts, _total_reads = ribocore.get_ribo_counts(
            ribo_fileobj=bam_fileobj,
            transcript_name=transcript_name,
            read_lengths=read_lengths,
            read_offsets=read_offsets,
        )

    # Nothing to plot without RiboSeq counts: log the error (which also goes
    # to the html file through the handler above) and abort.
    if not ribo_counts:
        msg = "No RiboSeq read counts for transcript {}. No plot will be generated!".format(transcript_name)
        log.error(msg)
        raise ribocore.RiboPlotError(msg)

    log.info("Get RNA counts for the given transcript...")
    mrna_counts = {}
    if rna_file:
        try:
            mrna_counts = get_rna_counts(rna_file, transcript_name)
        except OSError as e:
            log.error(e)
            raise

        if not mrna_counts:
            # Logger.warn is a deprecated alias; warning() is the supported name
            log.warning(
                "No RNA counts for this transcript from the given RNA Seq file. "
                "RNA-Seq coverage will not be generated"
            )
    else:
        log.debug("No RNA-Seq data provided. Not generating coverage")

    log.info("Get start/stop positions in transcript sequence (3 frames)...")
    codon_positions = get_start_stops(transcript_sequence)

    # makedirs (unlike mkdir) also creates missing parent directories
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    log.info("Writing RiboSeq read counts for {}".format(transcript_name))
    with open(os.path.join(output_path, "RiboCounts.csv"), "w") as f:
        f.write('"Position","Nucleotide","Frame 1","Frame 2","Frame 3"\n')

        # positions in the CSV are 1-based; positions without counts get zeros
        for pos in range(1, transcript_length + 1):
            nucleotide = transcript_sequence[pos - 1]
            if pos in ribo_counts:
                frames = ribo_counts[pos]
                f.write("{0},{1},{2},{3},{4}\n".format(pos, nucleotide, frames[1], frames[2], frames[3]))
            else:
                f.write("{0},{1},{2},{3},{4}\n".format(pos, nucleotide, 0, 0, 0))

    log.info("Generating RiboPlot...")
    plot_profile(
        ribo_counts,
        transcript_name,
        transcript_length,
        codon_positions,
        read_lengths,
        read_offsets,
        mrna_counts,
        color_scheme=args.color_scheme,
        html_file=args.html_file,
        output_path=args.output_path,
    )
    log.info("Finished!")
예제 #3
0
def main(args):
    """Write read counts and generate the RiboSeq plot for one transcript.

    The supplied arguments are validated first. The transcript sequence is
    then read from the transcriptome FASTA, RiboSeq read counts are collected
    from the BAM file (plus RNA-Seq counts when ``rna_file`` is given),
    ``RiboCounts.csv`` is written into ``output_path`` and ``plot_profile``
    is called to produce the plot.

    Args:
        args: Parsed argument namespace. The attributes read here are
            ``ribo_file``, ``rna_file``, ``transcript_name``,
            ``transcriptome_fasta``, ``read_lengths``, ``read_offsets``,
            ``output_path``, ``html_file`` and ``color_scheme``.

    Raises:
        ribocore.RiboPlotError: If the transcript has no RiboSeq read counts.
        OSError: Re-raised (after being logged) when ``get_rna_counts`` fails.
    """
    (ribo_file, rna_file, transcript_name, transcriptome_fasta, read_lengths,
     read_offsets, output_path,
     html_file) = (args.ribo_file, args.rna_file, args.transcript_name,
                   args.transcriptome_fasta, args.read_lengths,
                   args.read_offsets, args.output_path, args.html_file)

    # error messages (simple format) are written to html file
    fh = logging.FileHandler(html_file)
    fh.setLevel(logging.ERROR)
    fh.setFormatter(ErrorLogFormatter('%(message)s'))
    log.addHandler(fh)

    log.debug('Supplied arguments\n{}'.format('\n'.join(
        ['{:<20}: {}'.format(k, v) for k, v in vars(args).items()])))
    log.info('Checking if required arguments are valid...')
    ribocore.check_required_arguments(ribo_file=ribo_file,
                                      transcriptome_fasta=transcriptome_fasta,
                                      transcript_name=transcript_name)
    log.info('Done')

    if rna_file:
        log.info('Checking if RNA-Seq file is valid...')
        ribocore.check_rna_file(rna_file=rna_file)
        log.info('Done')

    log.info('Checking read lengths...')
    ribocore.check_read_lengths(ribo_file=ribo_file, read_lengths=read_lengths)
    log.info('Done')

    log.info('Checking read offsets...')
    ribocore.check_read_offsets(read_offsets=read_offsets)
    log.info('Done')

    log.info('Checking if each read length has a corresponding offset')
    ribocore.check_read_lengths_offsets(read_lengths=read_lengths,
                                        read_offsets=read_offsets)
    log.info('Done')

    log.info(
        'Get sequence and length of the given transcript from FASTA file...')
    record = ribocore.get_fasta_record(transcriptome_fasta, transcript_name)
    transcript_sequence = record[transcript_name]
    transcript_length = len(transcript_sequence)

    log.info('Get ribo-seq read counts and total reads in Ribo-Seq...')
    with ribocore.open_pysam_file(fname=ribo_file, ftype='bam') as bam_fileobj:
        # second element (total reads) is returned too but is unused here
        ribo_counts = ribocore.get_ribo_counts(
            ribo_fileobj=bam_fileobj,
            transcript_name=transcript_name,
            read_lengths=read_lengths,
            read_offsets=read_offsets)[0]

    # Without RiboSeq counts there is nothing to plot. The error is logged
    # (and so also lands in the html file via the handler above) and raised.
    if not ribo_counts:
        msg = ('No RiboSeq read counts for transcript {}. No plot will be '
               'generated!'.format(transcript_name))
        log.error(msg)
        raise ribocore.RiboPlotError(msg)

    log.info('Get RNA counts for the given transcript...')
    mrna_counts = {}
    if not rna_file:
        log.debug('No RNA-Seq data provided. Not generating coverage')
    else:
        try:
            mrna_counts = get_rna_counts(rna_file, transcript_name)
        except OSError as e:
            log.error(e)
            raise

        if not mrna_counts:
            # warning() replaces the deprecated Logger.warn alias
            log.warning(
                'No RNA counts for this transcript from the given RNA Seq file. '
                'RNA-Seq coverage will not be generated')

    log.info(
        'Get start/stop positions in transcript sequence (3 frames)...')
    codon_positions = get_start_stops(transcript_sequence)

    # os.makedirs creates missing parents as well, so a nested output_path
    # no longer fails the way a single os.mkdir would
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    log.info('Writing RiboSeq read counts for {}'.format(transcript_name))
    row_format = '{0},{1},{2},{3},{4}\n'
    with open(os.path.join(output_path, 'RiboCounts.csv'), 'w') as f:
        f.write('"Position","Nucleotide","Frame 1","Frame 2","Frame 3"\n')

        # CSV positions are 1-based; positions without counts get zeros
        for pos in range(1, transcript_length + 1):
            if pos in ribo_counts:
                f.write(row_format.format(
                    pos, transcript_sequence[pos - 1], ribo_counts[pos][1],
                    ribo_counts[pos][2], ribo_counts[pos][3]))
            else:
                f.write(row_format.format(
                    pos, transcript_sequence[pos - 1], 0, 0, 0))

    log.info('Generating RiboPlot...')
    plot_profile(ribo_counts,
                 transcript_name,
                 transcript_length,
                 codon_positions,
                 read_lengths,
                 read_offsets,
                 mrna_counts,
                 color_scheme=args.color_scheme,
                 html_file=args.html_file,
                 output_path=args.output_path)
    log.info('Finished!')