Example #1
def extract_mature_mirna_location(args):
    from utils import read_gff, GFFRecord
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict, defaultdict

    logger.info('read input GFF file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    logger.info('open output BED file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    # key: precursor_id, value: precursor record
    precursors = OrderedDict()
    # key: precursor_id, value: list of mature records
    matures = defaultdict(list)
    # read features from GFF file
    for record in read_gff(fin):
        if record.feature == 'miRNA_primary_transcript':
            precursors[record.attr['ID']] = record
        elif record.feature == 'miRNA':
            matures[record.attr['Derives_from']].append(record)
    # get locations of mature miRNAs relative to their precursors
    # (GFF coordinates are 1-based inclusive; BED is 0-based half-open)
    for precursor_id, precursor in precursors.items():
        for mature in matures[precursor_id]:
            if mature.strand == '+':
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'], mature.start - precursor.start,
                    mature.end - precursor.start + 1, mature.attr['Name']))
            else:
                # minus-strand matures are flipped onto the precursor's own
                # orientation, so the output strand is still '+'
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'], precursor.end - mature.end,
                    precursor.end - mature.start + 1, mature.attr['Name']))
    fin.close()
    fout.close()
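
`read_gff` and `GFFRecord` come from a local `utils` module that is not shown here. A minimal sketch of what they could look like, assuming GFF3 input with `key=value` attribute pairs (as in the miRBase annotation this function targets):

from collections import namedtuple

GFFRecord = namedtuple('GFFRecord', [
    'seqid', 'source', 'feature', 'start', 'end',
    'score', 'strand', 'frame', 'attr'])

def read_gff(fin):
    for line in fin:
        if line.startswith('#'):
            continue
        c = line.strip().split('\t')
        # GFF3 attributes are semicolon-separated key=value pairs
        attr = dict(a.split('=', 1) for a in c[8].split(';') if a)
        yield GFFRecord(c[0], c[1], c[2], int(c[3]), int(c[4]),
                        c[5], c[6], c[7], attr)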
Example #2
def chrom_sizes(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        # one line per sequence: sequence ID and its length
        for record in SeqIO.parse(fin, 'fasta'):
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
    fout.close()
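
Every example relies on `open_file_or_stdin`/`open_file_or_stdout` from a local `ioutils` module and on a module-level `logger`, neither of which is shown. A plausible minimal implementation, assuming the usual convention that `-` (or a missing argument) means standard input/output:

import sys
import logging

logger = logging.getLogger(__name__)  # the module-level logger the examples use

def open_file_or_stdin(filename):
    if filename is None or filename == '-':
        return sys.stdin
    return open(filename, 'r')

def open_file_or_stdout(filename):
    if filename is None or filename == '-':
        return sys.stdout
    return open(filename, 'w')

Note that using these in a `with` block closes the standard streams on exit; the real module may guard against that.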
Example #3
def extract_longest_transcript(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import defaultdict
    from functools import partial

    feature = args.feature
    genes = defaultdict(partial(defaultdict, int))
    lines = []
    logger.info('read gtf file: ' + args.input_file)
    with open_file_or_stdin(args.input_file) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                lines.append(('#other#', line))
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id not found in GTF file at line {}'.format(
                        lineno))
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError(
                    'gene_id not found in GTF file at line {}'.format(lineno))
            lines.append((transcript_id, line))
            genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1
    # for each gene, keep only the transcript with the largest total length
    kept_transcripts = set()
    kept_transcripts.add('#other#')
    for gene_id, gene in genes.items():
        max_length = 0
        max_transcript = None
        for transcript_id, length in gene.items():
            if length > max_length:
                max_length = length
                max_transcript = transcript_id
        kept_transcripts.add(max_transcript)

    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('number of transcripts: {}'.format(
        sum(map(len, genes.values()))))
    logger.info(
        'number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
    logger.info('write output gtf file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id in kept_transcripts:
                fout.write(line)
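
All of these subcommands take an argparse-style namespace; a hypothetical invocation of this one (file names are placeholders):

from argparse import Namespace

extract_longest_transcript(Namespace(
    input_file='annotation.gtf',
    output_file='longest_transcripts.gtf',
    feature='exon'))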
Example #4
def gtf_to_transcript_table(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict

    feature = args.feature
    default_transcript_type = args.transcript_type
    default_gene_type = args.gene_type

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        transcripts = OrderedDict()
        for line in fin:
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            if 'transcript_name' not in attrs:
                attrs['transcript_name'] = attrs['transcript_id']
            if 'gene_name' not in attrs:
                attrs['gene_name'] = attrs['gene_id']
            if default_transcript_type is not None:
                attrs['transcript_type'] = default_transcript_type
            else:
                if 'transcript_type' not in attrs:
                    attrs['transcript_type'] = 'unknown'
            if default_gene_type is not None:
                attrs['gene_type'] = default_gene_type
            else:
                if 'gene_type' not in attrs:
                    attrs['gene_type'] = 'unknown'
            exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
                attrs['gene_id'], attrs['transcript_id'], 
                attrs['gene_name'], attrs['transcript_name'],
                attrs['gene_type'], attrs['transcript_type'], c[1]]
            transcript = transcripts.get(attrs['transcript_id'])
            if transcript is None:
                transcripts[attrs['transcript_id']] = exon
            else:
                if c[2] == 'exon':
                    transcript[1] = min(transcript[1], exon[1])
                    transcript[2] = max(transcript[2], exon[2])
        header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
            'gene_id', 'transcript_id', 
            'gene_name', 'transcript_name',
            'gene_type', 'transcript_type', 'source'
        ]
        print('\t'.join(header), file=fout)
        for transcript in transcripts.values():
            print('\t'.join(str(a) for a in transcript), file=fout)
    fout.close()
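
The attribute-parsing loop in the last two examples (and in Example #11 below) assumes GTF2-style `key "value";` pairs and drops anything after the final semicolon; extracted as a standalone helper with a quick self-check:

def parse_gtf_attrs(s):
    # e.g. 'gene_id "G1"; transcript_id "T1";'
    attrs = {}
    for a in s.split(';')[:-1]:
        a = a.strip()
        i = a.find(' ')
        attrs[a[:i]] = a[(i + 1):].strip('"')
    return attrs

assert parse_gtf_attrs('gene_id "G1"; transcript_id "T1";') == \
    {'gene_id': 'G1', 'transcript_id': 'T1'}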
Example #5
def normalize(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    import pandas as pd

    with open_file_or_stdin(args.input_file) as f:
        matrix = pd.read_table(f, sep='\t', index_col=0)
    if args.method == 'cpm':
        # counts per million: scale each column (sample) to sum to 1e6
        matrix = 1e6 * matrix.astype('float') / matrix.sum(axis=0)
    with open_file_or_stdout(args.output_file) as f:
        matrix.to_csv(f, sep='\t', header=True, index=True, na_rep='NA')
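
Since `matrix.sum(axis=0)` sums columns, rows are genes and columns are samples; a small numeric check of the CPM step:

import pandas as pd

m = pd.DataFrame({'sample1': [2, 8], 'sample2': [5, 15]},
                 index=['geneA', 'geneB'])
cpm = 1e6 * m.astype('float') / m.sum(axis=0)
# each column now sums to one million
assert (cpm.sum(axis=0) == 1e6).all()
print(cpm)
#          sample1   sample2
# geneA   200000.0  250000.0
# geneB   800000.0  750000.0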
Example #6
def calculate_gene_length(args):
    import HTSeq
    from collections import defaultdict
    from functools import partial
    import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from tqdm import tqdm

    fin = open_file_or_stdin(args.input_file)
    gff = HTSeq.GFF_Reader(fin)
    # sum exon lengths per transcript, grouped by gene
    exons = defaultdict(partial(defaultdict, int))
    for feature in tqdm(gff, unit='feature'):
        if feature.type == 'exon':
            exons[feature.attr['gene_id']][
                feature.attr['transcript_id']] += feature.iv.length
    # the original ends here without output; writing the median transcript
    # length per gene is an assumed completion
    with open_file_or_stdout(args.output_file) as fout:
        for gene_id, transcripts in exons.items():
            fout.write('{}\t{}\n'.format(
                gene_id, int(np.median(list(transcripts.values())))))
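
`feature.iv.length` uses HTSeq's 0-based, half-open convention, so summed exon lengths need no +1 correction:

import HTSeq

iv = HTSeq.GenomicInterval('chr1', 100, 200, '+')
assert iv.length == 100  # end - start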
Example #7
def print_fasta(args):
    from ioutils import open_file_or_stdin
    from Bio import SeqIO

    with open_file_or_stdin(args.input_file) as f:
        for record in SeqIO.parse(f, 'fasta'):
            # record IDs are expected in the form: seq_id,label,start,end
            seq_id, label, start, end = record.id.split(',')
            seq = str(record.seq)
            start = int(start)
            end = int(end)
            print('>{}'.format(record.id))
            if label == '1':
                print('{}\x1B[1;31m{}\x1B[0m{}'.format(seq[:start], seq[start:end], seq[end:]))
            else:
                print(seq)
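
`\x1B[1;31m` and `\x1B[0m` are the ANSI escapes for bold red and reset; the same pattern as a small, hypothetical helper:

def ansi_highlight(seq, start, end, code='1;31'):
    # wrap seq[start:end] in ANSI SGR escapes (default: bold red)
    return '{}\x1B[{}m{}\x1B[0m{}'.format(
        seq[:start], code, seq[start:end], seq[end:])

print(ansi_highlight('ACGTACGTAC', 3, 7))  # 'TACG' rendered in bold red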
Example #8
def filter_circrna_reads(args):
    import pysam
    from ioutils import open_file_or_stdout, open_file_or_stdin

    logger.info('read input SAM file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam_in = pysam.AlignmentFile(fin, "r")
    if sam_in.header is None:
        raise ValueError('requires SAM header to get junction positions')
    # get junction positions (middle of the sequences)
    junction_positions = {}
    for sq in sam_in.header['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2

    logger.info('create output SAM file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)

    sam_filtered = None
    if args.filtered_file is not None:
        logger.info('create filtered SAM file: ' + args.filtered_file)
        sam_filtered = pysam.AlignmentFile(args.filtered_file,
                                           'w',
                                           template=sam_in)

    for read in sam_in:
        filtered = False
        if read.is_unmapped:
            filtered = True
        elif read.is_reverse:
            filtered = True
        else:
            pos = junction_positions[read.reference_name]
            if not (read.reference_start < pos <= read.reference_end):
                filtered = True
        if not filtered:
            sam_out.write(read)
        elif sam_filtered is not None:
            sam_filtered.write(read)

    fin.close()
    fout.close()
    if sam_filtered is not None:
        sam_filtered.close()
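
Each reference here is a junction sequence from `extract_circrna_junction`, so `LN // 2` is the back-splice point. With pysam's 0-based, half-open coordinates (`reference_end` is one past the last aligned base), the keep-condition behaves as follows at the boundaries:

def covers_junction(start, end, pos):
    # the condition used above: the alignment [start, end) must reach
    # at least up to the junction coordinate pos
    return start < pos <= end

assert covers_junction(45, 55, 50)      # clearly spans the junction
assert covers_junction(40, 50, 50)      # ends exactly at the junction: kept
assert not covers_junction(50, 60, 50)  # starts at the junction: filtered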
Example #9
def flagstat(args):
    import pysam
    from ioutils import open_file_or_stdin, open_file_or_stdout

    logger.info('read input file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam = pysam.AlignmentFile(fin, 'rb')
    counts = [0] * 4096  # the SAM flag is a 12-bit field (2**12 values)
    for read in sam:
        counts[read.flag] += 1
    sam.close()

    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('flag\tcounts\n')
        for flag, count in enumerate(counts):
            if count > 0:
                fout.write('{}\t{}\n'.format(flag, count))
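
To interpret the output table, each flag decomposes into the bit names defined by the SAM specification:

SAM_FLAG_BITS = [
    (0x1, 'paired'), (0x2, 'proper_pair'), (0x4, 'unmapped'),
    (0x8, 'mate_unmapped'), (0x10, 'reverse'), (0x20, 'mate_reverse'),
    (0x40, 'read1'), (0x80, 'read2'), (0x100, 'secondary'),
    (0x200, 'qcfail'), (0x400, 'duplicate'), (0x800, 'supplementary'),
]

def decode_flag(flag):
    return [name for bit, name in SAM_FLAG_BITS if flag & bit]

print(decode_flag(99))  # ['paired', 'proper_pair', 'mate_reverse', 'read1']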
Example #10
def extract_circrna_junction(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    anchor_size = args.anchor_size
    logger.info('read sequence file: ' + args.input_file)
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            seq = str(record.seq)
            if len(seq) < args.min_length:
                continue
            # junction sequence: the last s bases joined to the first s bases
            s = min(len(seq), anchor_size)
            seq_id = record.id.split('|')[0]
            fout.write('>{}\n'.format(seq_id))
            fout.write(seq[-s:] + seq[:s])
            fout.write('\n')
    fout.close()
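
On a toy sequence, the construction looks like this:

seq = 'AAAACCCCGGGGTTTT'
s = min(len(seq), 4)
assert seq[-s:] + seq[:s] == 'TTTTAAAA'  # transcript end joined to its start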
Example #11
def read_gtf(filename):
    from ioutils import open_file_or_stdin

    with open_file_or_stdin(filename) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
            yield (c, attrs, line)
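
`read_gtf` yields the split columns, the parsed attributes, and the raw line for every feature, so downstream code can pick what it needs; a hypothetical usage that tallies feature types:

from collections import Counter

feature_counts = Counter()
for c, attrs, line in read_gtf('annotation.gtf'):  # placeholder file name
    feature_counts[c[2]] += 1
print(feature_counts.most_common())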
Example #12
def calc_rpkm(args):
    import pandas as pd
    import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout

    matrix = pd.read_table(open_file_or_stdin(args.input_file),
                           index_col=0,
                           sep='\t')
    feature_info = matrix.index.to_series().str.split('|', expand=True)
    feature_info.columns = [
        'gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id',
        'start', 'end'
    ]
    feature_info['start'] = feature_info['start'].astype('int')
    feature_info['end'] = feature_info['end'].astype('int')
    feature_info['length'] = feature_info['end'] - feature_info['start']
    # scale to reads per kilobase; note this does not divide by library
    # size, so the input is presumably depth-normalized upstream
    matrix = 1000.0 * matrix.div(feature_info['length'], axis=0)
    matrix.to_csv(open_file_or_stdout(args.output_file),
                  index=True,
                  header=True,
                  sep='\t',
                  na_rep='NA')
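
If the input were raw counts instead, full RPKM would add a per-million library-size factor; a sketch of that variant (an assumption about intent, not the original behavior):

# reads per kilobase of feature per million mapped reads, from raw counts
rpk = 1000.0 * matrix.div(feature_info['length'], axis=0)
rpkm = 1e6 * rpk / matrix.sum(axis=0)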