Example #1
from gfftools import GFFParser

## get_label_regions and true_ss_seq_fetch are helper functions assumed to be defined elsewhere in this module

def check_consensus(gfname, faname):
    """
    check the consensus sequence around the specified signal sites (splice sites and translation initiation sites)

    @args gfname: genome annotation in gtf/gff file
    @type gfname: str  
    @args faname: genome sequence in fasta file 
    @type faname: str  
    """

    ## extract genome annotation from gtf file 
    gtf_file_content = GFFParser.Parse(gfname)
    print 'processed annotation file'

    ## signals to consider
    for signal in ['splice', 'tis']:
        
        gtf_db, feature_cnt = get_label_regions(gtf_file_content, signal)
        print 'extracted %d %s signal regions' % (feature_cnt, signal) 

        if signal == 'splice':

            don_true_seq, acc_true_seq, don_fal_seq, acc_fal_seq = true_ss_seq_fetch(faname, gtf_db, boundary=100)
            print 'summary of', signal, 'signal consensus'

            print 'don site cons %.2f non-cons %.2f' % (float(don_true_seq)/feature_cnt*100, float(don_fal_seq)/feature_cnt*100)
            print 'acc site cons %.2f non-cons %.2f' % (float(acc_true_seq)/feature_cnt*100, float(acc_fal_seq)/feature_cnt*100)

            break ## check only one signal here
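
A minimal usage sketch for the function above; the input file names are hypothetical placeholders:

## hypothetical inputs: annotation in GTF/GFF and the matching genome FASTA
check_consensus("ensembl_genes.gff", "genome.fasta")
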
Example #2
from gfftools import GFFParser

def get_features(gtf_file, fa_file):
    """
    count genes and transcripts from the annotation source:
    number of coding and non-coding transcripts,
    transcripts with 3' and/or 5' UTRs and other classes
    (the fa_file argument is currently unused)
    """

    anno_db = GFFParser.Parse(gtf_file) 
    total_genes = len(anno_db) 

    non_coding = 0
    cds_coding = 0
    total_transcripts = 0 
    utr = 0 
    utr5 = 0
    utr3 = 0
    no_utr = 0 

    for features in anno_db:
        total_transcripts += len(features['transcripts'])

        for trans in features['cds_exons']:
            if trans.any():
                cds_coding += 1
            else:
                non_coding += 1

        for idx, trans in enumerate(features['transcripts']):
            if features['utr3_exons'][idx].any() and features['utr5_exons'][idx].any():
                utr +=1
            elif features['utr3_exons'][idx].any():
                utr3 +=1
            elif features['utr5_exons'][idx].any():
                utr5 += 1
            else:
                no_utr +=1 
    
    print 'total genes: ', total_genes
    print 'total transcripts: ', total_transcripts
    print 'coding transcripts: ', cds_coding
    print 'noncoding transcripts: ', non_coding
    print '---------------'
    print 'both utrs', utr 
    print '3 utr ', utr3
    print '5 utr ', utr5
    print 'no utr ', no_utr
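
The counters above index each parsed record by field name; here is a sketch of the record layout as inferred from that usage (an assumption about GFFParser's output, which may carry more fields):

import numpy as np

## hypothetical single-gene record, shaped like the fields accessed above
rec = {
    'name': 'gene_1',
    'chr': 'chr1',
    'strand': '+',
    'start': 1000,
    'stop': 5000,
    'transcripts': [['trans_1']],                         # one entry per transcript
    'exons': [np.array([[1000, 1200], [1500, 5000]])],    # per-transcript exon coordinates
    'cds_exons': [np.array([[1100, 1200], [1500, 4800]])],
    'utr5_exons': [np.array([[1000, 1099]])],
    'utr3_exons': [np.array([[4801, 5000]])],
    'transcript_info': [[25.0]],                          # e.g. read coverage value
}
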
Example #3
import os
import re
import sys
import shutil
import subprocess

from gfftools import helper, GFFParser

def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    TODO: check that the fasta and gtf files are uncompressed; STAR works only with uncompressed files in this step. 

    @args fasta_file: reference genome sequence file .fasta format 
    @type fasta_file: str 
    @args out_dir: genome index binary file storage place  
    @type out_dir: str 
    @args genome_anno: genome annotation file (optional) 
    @type genome_anno: str 
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int 
    @args onematelength: read mate length, used for --sjdbOverhang (default value = 100) 
    @type onematelength: int 
    """

    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        exit("Please make sure that the `STAR` binary is in your $PATH")
    
    if not genome_anno:
        cli_cmd = 'STAR \
        --runMode genomeGenerate \
        --genomeDir %s \
        --genomeFastaFiles %s \
        --runThreadN %d' % (out_dir, fasta_file, num_workers) 
    else:
        ## check for the file type  
        gff_hand = helper.open_file(genome_anno)
    
        for rec in gff_hand:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and comment lines
            if not rec or rec[0] in  ['#', '>']:
                continue
            # skip the genome sequence 
            if not re.search('\t', rec):
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break 

        gff_hand.close() 

        ## according to the file type 
        if ftype:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFtagExonParentTranscript Parent \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength) 
        else:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFfeatureExon exon \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength) 

    ## create the output directory if it does not exist 
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            print "error: cannot create the directory %s." % out_dir
            sys.exit(0)
    else:  ## clean up any old index files left in the folder 
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception, e:
                print e 
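
A minimal usage sketch; the paths are hypothetical placeholders:

## build an annotation-guided STAR index with 4 threads
create_star_genome_index("genome.fasta", "star_index/", genome_anno="genes.gff", num_workers=4, onematelength=100)
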
Example #4
import os
import re
import sys
import subprocess

def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program 

    @args org_db: a python dictionary with all details about a single organism 
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str 
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000; currently unused by this wrapper)
    @type max_mates_gap_length: int 
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int 
    """
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']## genome indices and annotation file
    gtf_db = org_db['gtf']

    if gtf_db is not None: 
        ## check for the annotation file type gff or gtf 
        gff_hand = helper.open_file(gtf_db)
        for rec in gff_hand:
            rec = rec.strip('\n\r')
            # skip empty lines, fasta identifiers and comment lines
            if not rec or rec[0] in  ['#', '>']:
                continue
            # skip the genome sequence 
            if not re.search('\t', rec):
                continue
            parts = rec.split('\t')
            assert len(parts) >= 8, rec
            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break 
        gff_hand.close() 

    ## library type 
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0], org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])
    
    ## getting the command to uncompress the read file
    zip_type = {".gz" : "gzip -d -c", ".bz2" : "bzip2 -d -c"} 
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information 
    max_length_intron = org_db['max_intron_len']

    ## according to the file type 
    if gtf_db is None:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 4 \
        --sjdbScore 1 \
        --sjdbOverhang 5 \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, 
            zip_type[ext], out_prefix, num_cpus)
    elif ftype:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 4 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFtagExonParentTranscript Parent \
        --sjdbScore 1 \
        --sjdbOverhang 5 \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, 
            zip_type[ext], out_prefix, num_cpus, max_length_intron, gtf_db)
    else:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 4 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFfeatureExon exon \
        --sjdbScore 1 \
        --sjdbOverhang 5 \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, 
            zip_type[ext], out_prefix, num_cpus, max_length_intron, gtf_db)

    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)
    try:
        process = subprocess.Popen(make_star_run, shell=True) 
        returncode = process.wait()

        if returncode !=0:
            raise Exception, "Exit status return code = %i" % returncode

        sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
    except Exception, e:
        sys.exit("Error running STAR.\n%s" %  str( e ))
Example #5
import numpy as np
import pandas as pd

from gfftools import GFFParser

def trsk_gene_len_dist(gtf_file, out_file="hist_cds_len.pdf"):
    """
    plot histograms of gene and CDS lengths
    """
    import matplotlib.pyplot as plt

    anno_db = GFFParser.Parse(gtf_file)

    cds_idx = []  # deleting the empty cds lines
    for idp, feat in enumerate(anno_db):
        if not feat['cds_exons'][0].any():
            cds_idx.append(idp)

    anno_db = np.delete(anno_db, cds_idx)

    trans_len = np.zeros((len(anno_db), 2))
    genes = []

    for idx, feat in enumerate(anno_db):
        cds_len = 0
        for exc in feat['cds_exons'][0]:
            cds_len += exc[1] - exc[0]

        trans_len[idx, 0] = feat['stop'] - feat['start']
        trans_len[idx, 1] = cds_len
        genes.append(feat['name'])

    ## gene, cds length information
    df_len_dis_genes = pd.DataFrame(trans_len,
                                    columns=['gene_len', 'cds_len'],
                                    index=genes)

    ## plotting the gene length based on the bins of gene length
    gene_length = trans_len[:, 0]  ## gene length from the matrix

    freq, bins = np.histogram(gene_length, bins=10)
    bins = np.delete(bins, 10)  ## drop the last bin edge so the index matches freq

    df_gene_len_bin = pd.DataFrame(freq,
                                   columns=['gene_frequency'],
                                   index=bins)
    plt.figure()
    df_gene_len_bin.plot(kind="bar")
    #plt.savefig()  ## note: only the CDS histogram below is written to out_file

    ## plotting the cds length distribution
    cds_length = trans_len[:, 1]  ## cds length distribution
    freq, bins = np.histogram(cds_length, bins=10)
    bins = np.delete(bins, 10)  ## drop the last bin edge so the index matches freq
    df_cds_len_bin = pd.DataFrame(freq, columns=['cds_frequency'], index=bins)
    plt.figure()
    df_cds_len_bin.plot(kind="bar")
    plt.savefig(out_file)
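
A minimal usage sketch; the annotation path is a hypothetical placeholder:

trsk_gene_len_dist("trsk_genes.gff", out_file="hist_cds_len.pdf")
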
Example #6
import sys
import filecmp

import numpy as np

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from gfftools import helper, GFFParser

import seqlab  ## local module assumed to provide chrom_name_consistency

def translate_trsk_genes(gtf_file, fas_file, out_seq_fname):
    """
    translate the trsk genes to protein sequence 

    @args gtf_file: genome annotation file 
    @type gtf_file: str 
    @args fas_file: genome sequence file
    @type fas_file: str
    @args out_seq_fname: output file in fasta format 
    @type out_seq_fname: str
    """

    if filecmp.cmp(gtf_file, fas_file):
        exit("error: the two input files appear to be identical, please check!")

    ## reading the TSkim file to get the features
    sys.stdout.write('reading genome features from %s\n' % gtf_file)
    anno_db = GFFParser.Parse(gtf_file)
    total_genes = len(anno_db)

    ## genome sequence file reading
    sys.stdout.write('reading genome sequence from %s\n' % fas_file)
    seqlab.chrom_name_consistency(fas_file, anno_db)

    cds_idx = []  # collect indices of records with an empty cds
    for idp, feat in enumerate(anno_db):
        # TSkim annotation expects only a single transcript from a region
        if not feat['cds_exons'][0].any():
            cds_idx.append(idp)
    anno_db = np.delete(anno_db, cds_idx)
    genes_with_cds = len(anno_db)

    fasFH = helper.open_file(fas_file)
    out_seq_fh = open(out_seq_fname, "w")
    for rec in SeqIO.parse(fasFH, "fasta"):
        for idx, feature in enumerate(anno_db):
            if rec.id == feature['chr']:
                ## iterate over cds_exons
                cds_seq = ''
                for ex in feature['cds_exons'][0]:  ## single transcript by TSkim
                    cds_seq += rec.seq[ex[0] - 1:ex[1]]

                if feature['strand'] == '-':
                    cds_seq = cds_seq.reverse_complement()
                #sys.stdout.write(str(cds_seq.translate()) + "\n")

                ## fasta output
                if cds_seq:
                    prt_seq = SeqRecord(cds_seq.translate(),
                                        id=feature['name'],
                                        description='protein sequence')
                    out_seq_fh.write(prt_seq.format("fasta"))

        # FIXME need an efficient way to translate multiple gene
        # iterate over chromosome

    fasFH.close()
    out_seq_fh.close()

    sys.stdout.write('total genes fetched: %d\n' % total_genes)
    sys.stdout.write('total genes translated: %d\n' % genes_with_cds)
    sys.stdout.write('protein sequence stored at %s\n' % out_seq_fname)
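
A minimal usage sketch; the file names are hypothetical placeholders:

translate_trsk_genes("trsk_genes.gff", "genome.fasta", "proteins.fa")
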
Example #7
import os
import re
import shutil
import subprocess

from gfftools import helper, GFFParser

def create_star_genome_index(fasta_file,
                             out_dir,
                             genome_anno=None,
                             num_workers=1,
                             onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    @args fasta_file: reference genome sequence file .fasta format 
    @type fasta_file: str 
    @args out_dir: genome index binary file storage place  
    @type out_dir: str 
    @args genome_anno: genome annotation file (optional) 
    @type genome_anno: str 
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int 
    @args onematelength: One Mate Length (default value=100) 
    @type onematelength: int 
    """

    try:
        subprocess.call(["STAR"],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)
    except OSError:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    file_prefx, ext = os.path.splitext(fasta_file)
    if ext in [".bz2", ".gz", ".lzma"]:  ## check for a compressed form of the file extension
        exit("error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s." % fasta_file)

    if not genome_anno:
        cli_cmd = 'STAR \
        --runMode genomeGenerate \
        --genomeDir %s \
        --genomeFastaFiles %s \
        --runThreadN %d' % (out_dir, fasta_file, num_workers)
    else:
        file_prefx, ext = os.path.splitext(genome_anno)
        if ext in [".bz2", ".gz", ".lzma"]:
            exit("error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s." % genome_anno)

        ## check for the file type
        gff_hand = helper.open_file(genome_anno)
        for rec in gff_hand:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and comment lines
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue
            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

        ## according to the file type
        if ftype:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFtagExonParentTranscript Parent \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers,
                                  genome_anno, onematelength)
        else:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFfeatureExon exon \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers,
                                  genome_anno, onematelength)

    ## create the output directory if it does not exist
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            exit("error: cannot create the directory %s." % out_dir)
    else:  ## clean up any old index files left in the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception, e:
                print(e)
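
A minimal usage sketch for the annotation-free case; the paths are hypothetical placeholders:

## build a plain STAR index without annotation, single thread
create_star_genome_index("genome.fasta", "star_index/")
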
Example #8
import sys

import numpy as np

from gfftools import GFFParser

def make_anno_db(gff_file):
    """
    extract the features from a gtf/gff file and store efficiently to query 

    @args gff_file: genome annotation file
    @type gff_file: str 
    """

    gff_cont = GFFParser.Parse(gff_file)

    intron_size = dict()
    exon_size = dict()

    for rec in gff_cont:
        for idx, tid in enumerate(rec['transcripts']):

            if not rec['exons'][idx].any():
                continue

            try:  ## some records are parsed as array(nan); skip those
                if np.isnan(rec['exons'][idx]):
                    continue
            except (TypeError, ValueError):
                pass

            try:
                exon_cnt = len(rec['exons'][idx])
            except TypeError:
                continue

            if exon_cnt > 1:
                intron_start = 0

                for xq, excod in enumerate(rec['exons'][idx]):

                    if xq > 0:
                        #print intron_start, excod[0]-1
                        if excod[0] - intron_start == 1:
                            intron_start = excod[1] + 1
                            exon_size[intron_start - excod[0]] = 1
                            continue

                        intron_size[excod[0] - intron_start] = 1
                        #print excod[0]-intron_start

                    intron_start = excod[1] + 1
                    exon_size[intron_start - excod[0]] = 1
                    #print intron_start-excod[0]

    feat_db = dict()
    if intron_size:
        keys_int = sorted(intron_size)
        keys_ex = sorted(exon_size)
        #print 'MaxIntronLength %d %d %d'  %(keys_int[-1], keys_int[-2], keys_int[-3])
        feat_db['min_intron'] = int(keys_int[0])
        feat_db['max_intron'] = int(keys_int[-3])

        feat_db['min_exon'] = int(keys_ex[0])
        feat_db['max_exon'] = int(keys_ex[-3])
        #print 'MaxExonLength %d %d %d'  %(keys_ex[-1], keys_ex[-2], keys_ex[-3])

        return feat_db
    else:
        print "Error in feature mapping in file %s, please check the source of parent child features" % gff_file
        sys.exit(-1)
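
A minimal usage sketch; the returned dictionary carries the intron/exon length bounds:

feat_db = make_anno_db("genes.gff")   ## hypothetical annotation file
print(feat_db['max_intron'])          ## e.g. feeds --alignIntronMax in the STAR wrappers
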
Example #9
import os
import re
import sys
import subprocess

def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program 

    @args org_db: a python dictionary with all details about a single organism 
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str 
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000; currently unused by this wrapper)
    @type max_mates_gap_length: int 
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int 
    """
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']## genome indices and annotation file
    gtf_db = org_db['gtf']

    if gtf_db is not None: 
        ## check for the annotation file type gff or gtf 
        gff_hand = helper.open_file(gtf_db)
        for rec in gff_hand:
            rec = rec.strip('\n\r')
            # skip empty lines, fasta identifiers and comment lines
            if not rec or rec[0] in  ['#', '>']:
                continue
            # skip the genome sequence 
            if not re.search('\t', rec):
                continue
            parts = rec.split('\t')
            assert len(parts) >= 8, rec
            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break 
        gff_hand.close() 

    ## library type 
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0], org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])
    
    ## getting the command to uncompress the read file
    zip_type = {".gz" : "gzip -d -c", ".bz2" : "bzip2 -d -c"} 
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information 
    max_length_intron = org_db['max_intron_len']
    
    ##sjdbOverhang 
    mate_len = org_db['mate_length']

    ## according to the file type 
    if gtf_db is None:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 3 \
        --sjdbScore 1 \
        --sjdbOverhang %d \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, 
            zip_type[ext], out_prefix, num_cpus, mate_len)
    elif ftype:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 3 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFtagExonParentTranscript Parent \
        --sjdbScore 1 \
        --sjdbOverhang %d \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, 
            zip_type[ext], out_prefix, num_cpus, max_length_intron, gtf_db, mate_len)
    else:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 3 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFfeatureExon exon \
        --sjdbScore 1 \
        --sjdbOverhang %d \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, 
            zip_type[ext], out_prefix, num_cpus, max_length_intron, gtf_db, mate_len)

    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)
    try:
        process = subprocess.Popen(make_star_run, shell=True) 
        returncode = process.wait()

        if returncode !=0:
            raise Exception, "Exit status return code = %i" % returncode

        sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
    except Exception, e:
        sys.exit("Error running STAR.\n%s" %  str( e ))
Example #10
import sys
from collections import defaultdict

import numpy

from gfftools import GFFParser

def filter_gene_models(gff_name, fas_file, outFile):
    """
    check the sequence consistency/quality of predicted fragment

    @args gff_name: result file gff format from TranscriptSkimmer
    @type gff_name: str
    @args fas_file: genome sequence in fasta format
    @type fas_file: str 
    @args outFile: filtered gene output file 
    @type outFile: str 
    """

    sys.stdout.write('using genome sequence file %s\n' % fas_file)
    sys.stdout.write('using genome annotation file %s\n' % gff_name)

    sys.stdout.write("parsing genome annotation file...\n")
    gff_content = GFFParser.Parse(gff_name)  ## getting the genome annotation from GFF file
    sys.stdout.write(" ...done\n")

    sys.stdout.write("screening for spliced transcripts...\n")
    orf_short = 0
    spliced_cand = 0
    sing_exon_gen = 0
    transcript_cov = 0
    min_orf_length = 400

    transcripts_region = defaultdict(list)
    for gene_recd in gff_content:  ## screening the spliced transcripts
        spliced_transcript = defaultdict(list)

        for idx, sub_rec in enumerate(gene_recd['transcripts']):
            try:
                exon_cnt = len(gene_recd['exons'][idx])
            except:
                continue

            if exon_cnt > 1:  ## skipping the single-exon transcripts
                if gene_recd['transcript_info'][idx]:  ## discard the transcript based on its read coverage value
                    ## read coverage threshold to consider
                    if float(numpy.atleast_1d(gene_recd['transcript_info'][idx])[0]) < 10:
                        transcript_cov += 1
                        continue

                orf_length = 0
                for idk, ex in enumerate(gene_recd['exons'][idx]):
                    orf_length += ex[1] - (ex[0] - 1)

                    if idk == 0:
                        ## first exon: mask the outer boundary coordinate
                        spliced_transcript[(gene_recd['name'], sub_rec[0], gene_recd['strand'])].append((None, ex[1]))
                    elif exon_cnt - 1 == idk:
                        ## last exon: mask the outer boundary coordinate
                        spliced_transcript[(gene_recd['name'], sub_rec[0], gene_recd['strand'])].append((ex[0], None))
                    else:
                        spliced_transcript[(gene_recd['name'], sub_rec[0], gene_recd['strand'])].append((ex[0], ex[1]))

                if orf_length < min_orf_length:  ## minimum orf length for the transcripts
                    ## clear that transcript's details
                    del spliced_transcript[(gene_recd['name'], sub_rec[0], gene_recd['strand'])]
                    orf_short += 1
                    continue

                spliced_cand += 1
            else:
                sing_exon_gen += 1
                #TODO orf length of the single exon gene will be good
                # to look, some histone genes are long enough to have
                # strong TSS region
                """
                single_exon_len = 0 
                for idk, ex in enumerate(gene_recd['exons'][idx]):
                    single_exon_len = ex[1]-(ex[0]-1)

                if single_exon_len > 1600:
                    spliced_transcript[(gene_recd['name'], sub_rec[0], gene_recd['strand'])].append((ex[0], ex[1]))
                """

        if spliced_transcript:
            transcripts_region[gene_recd['chr']].append(spliced_transcript)

    sys.stdout.write("...considering %d spliced transcripts\n" % spliced_cand)
    sys.stdout.write("discarding transcripts...\n\t%d transcripts with single exon\n" % sing_exon_gen)
    sys.stdout.write("\t%d transcripts with read coverage value less than 10\n" % transcript_cov)
    sys.stdout.write("\t%d transcripts with orf region less than 400 nucleotides\n" % orf_short)

    genemodels = check_splice_site_consensus(fas_file, transcripts_region)

    write_filter_gene_models(gff_content, genemodels, outFile)
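
A minimal usage sketch; the file names are hypothetical placeholders, and check_splice_site_consensus / write_filter_gene_models are companion helpers assumed to live in the same module:

filter_gene_models("trsk_predictions.gff", "genome.fasta", "filtered_genes.gff")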