filter_novel_lincRNA.py

#!/usr/bin/python
#-*-coding : utf-8-*-
#Copyright(c) 2014 - SunLiang <sunliang@bioinfo.ac.cn>
from optparse import OptionParser
import sys
import math
import time
import os
from Gtf import Gtf
from Table import Table
from sets import Set


def make_dir(filepath):
    """Create directory `filepath` (including missing parents) if needed.

    An already existing path is left untouched and is not an error.

    Raises:
        OSError: if the directory cannot be created and the path still does
            not exist afterwards (e.g. empty path, permission denied).
    """
    # Attempt the creation first instead of checking os.path.exists()
    # beforehand: the check-then-create sequence fails if something else
    # creates the directory between the two calls.
    try:
        os.makedirs(filepath)
    except OSError:
        # Only swallow the error when the path is there now (created earlier
        # or concurrently); any other failure is propagated unchanged.
        if not os.path.exists(filepath):
            raise

def index_transform(row_data):
    """Flatten one raw index row into a list of plain field values.

    Field 0 has its first character dropped, field 1 is passed through
    unchanged, and fields 2-5 are treated as 'label:value' strings of which
    the piece following the first ':' is kept.

    Returns:
        [id, cnc_type, score, start, end, length], all as strings.

    Raises:
        IndexError: if the row has fewer than six fields or one of
            fields 2-5 contains no ':'.
    """
    transcript_id = row_data[0][1:]
    cnc_type = row_data[1]
    # score, start, end, length -- in that order.
    values = [row_data[pos].split(':')[1] for pos in (2, 3, 4, 5)]
    return [transcript_id, cnc_type] + values

def check_lncRNA(data):
    """Tell whether one table row passes the lincRNA thresholds.

    The row is read by position: data[2] is parsed as the score (float),
    data[5] as the length (int) and data[7] as the exon number (int).
    The thresholds are the module-level names TH_score, TH_length and
    exon_num, which must be set before this function is called.

    Returns:
        True when score < TH_score, length > TH_length (strictly) and
        exon number >= exon_num; False otherwise.
    """
    cnci_score = float(data[2])
    transcript_length = int(data[5])
    n_exons = int(data[7])
    return (cnci_score < TH_score
            and transcript_length > TH_length
            and n_exons >= exon_num)

def sub_array(A,B):
    """Return the distinct elements of A that do not occur in B.

    Elements must be hashable. The result is a list without duplicates;
    its order is unspecified.
    """
    # Builtin set replaces the deprecated sets.Set class (the `sets` module
    # is deprecated since Python 2.6 and absent from Python 3).
    return list(set(A).difference(B))

def intersect_array(A,B):
    """Return the distinct elements that occur in both A and B.

    Elements must be hashable. The result is a list without duplicates;
    its order is unspecified.
    """
    # Builtin set replaces the deprecated sets.Set class (the `sets` module
    # is deprecated since Python 2.6 and absent from Python 3).
    return list(set(A).intersection(B))


def union_array(A,B):
    """Return the distinct elements that occur in A, in B, or in both.

    Elements must be hashable. The result is a list without duplicates;
    its order is unspecified.
    """
    # Builtin set replaces the deprecated sets.Set class (the `sets` module
    # is deprecated since Python 2.6 and absent from Python 3).
    return list(set(A).union(B))

def de_redundency(A):
    """Return the distinct elements of A as a list.

    Elements must be hashable. The order of the result is unspecified.
    """
    # Builtin set replaces the deprecated sets.Set class (the `sets` module
    # is deprecated since Python 2.6 and absent from Python 3).
    return list(set(A))


def classify(C,lnc,AM,discard):
    """Build a two-column Table assigning a class label to every gene id.

    Args:
        C: gene ids labelled 'novel_coding'.
        lnc: gene ids labelled 'novel_lincRNA'.
        AM: gene ids labelled 'ambiguous_genes'.
        discard: gene ids labelled 'filter_out_noncoding'.

    Returns:
        A Table with key set to 1 and columns ['gid', 'class']. Rows are
        appended group by group in the argument order above, and each gene
        id is registered in row_names with the size row_names had just
        before that registration.
    """
    labelled_groups = (
        (C, 'novel_coding'),
        (lnc, 'novel_lincRNA'),
        (AM, 'ambiguous_genes'),
        (discard, 'filter_out_noncoding'),
    )
    table = Table()
    table.key = 1
    table.col_names = ['gid', 'class']
    for gene_ids, label in labelled_groups:
        for gene_id in gene_ids:
            table.data.append([gene_id, label])
            table.row_names[gene_id] = len(table.row_names)
    return table

usage="""

    filter_novel_lincRNA.py: filter the potentially novel lincRNAs to further pick out novel lincRNA genes
    Usage: filter_novel_lincRNA.py [-h] [-s 0] [-l 200] [-e 2] -i cnci_index -g unannotated_gtf -o out_dir

"""
# Command-line interface. -i, -g and -o are required (optparse cannot enforce
# that itself, so their presence is checked after parsing); -s, -l and -e are
# the filtering thresholds. The numeric options declare a type so that
# optparse rejects malformed values with a usage error instead of letting the
# later float()/int() conversion fail with a traceback.
parser=OptionParser(usage=usage)
parser.add_option('-i','--index',dest='index',action='store',help='(Required.) '
                 +'The path of coding/noncoding index file. '
                 +'This file is the output file of CNCI.py, named as CNCI.index under CNCI.py outdir')
parser.add_option('-g','--gtf',dest='gtf',action='store',help='(Required.) '
                 +'The path of potentially novel lincRNAs gtf file. '
                 +'This file is generated by compare.py, named as potentially_novel.gtf under compare.py outdir')
parser.add_option('-s','--score',dest='score',action='store',type='float',default=0,help='(Optional.) '
                 +'Threshold of CNCI score. RNAs with score less than SCORE will be '
                 +'classified as noncoding. The Default is 0 .')
# The length test in check_lncRNA is strict (length > threshold), so the help
# text says "> LENGTH" rather than ">= LENGTH".
parser.add_option('-l','--length',dest='length',action='store',type='int',default=200,help='(Optional.) '
                 +'Length threshold of lincRNA. lincRNA with length > LENGTH will be kept. '
                 +'The Default is 200. ')
parser.add_option('-e','--exon_num',dest='exon_num',action='store',type='int',default=2,help='(Optional.) '
                 +'Minimal exon number of lincRNA. lincRNA with exon number >= EXON_NUM '
                 +'will be kept.  The Default is 2.')
parser.add_option('-o','--out_dir',dest='out_dir',action='store',help='(Required.) Output directory of the results.')
(options,args) = parser.parse_args()

# optparse has no notion of a required option, so each one is checked here.
# On a missing option the full help is shown and the script terminates with
# the error message on stderr and a non-zero exit status.
# print_help() writes the help itself and returns None, so it must be called
# as a plain statement: printing its return value emitted a stray "None".
# sys.exit is used rather than the exit() helper, which is only injected by
# the site module.
if options.index is None:
    parser.print_help()
    sys.exit("Error: Coding/noncoding index file is required!")
index_file=options.index

if options.gtf is None:
    parser.print_help()
    sys.exit("Error:  Unannotated gtf file is required!")
gtf_input=options.gtf

if options.out_dir is None:
    parser.print_help()
    sys.exit("Error: Output directory is required!")
# Trailing slashes are removed because the output paths are built later by
# appending "/<file name>" to out_dir.
out_dir=options.out_dir.rstrip('/')

# Filtering thresholds; check_lncRNA reads these three module-level names.
TH_score = float(options.score)
TH_length = int(options.length)
exon_num = int(options.exon_num)

# Wall-clock reference used for the elapsed-time report at the end of the run.
start_time = time.time()
print("Run start:")

# Every result file is written directly into out_dir.
make_dir(out_dir)
snc_gtf = '/'.join([out_dir, 'filter_out_noncoding.gtf'])
lnc_gtf = '/'.join([out_dir, 'novel_lincRNA.gtf'])
coding_gtf = '/'.join([out_dir, 'novel_coding.gtf'])
cnc_gtf = '/'.join([out_dir, 'ambiguous_genes.gtf'])
Gene_Info = '/'.join([out_dir, 'compare_2_infor.txt'])
###################################
# Load the candidate GTF and obtain its tid/gid table.
# NOTE(review): presumably one row per transcript id with its gene id --
# confirm against Gtf.get_tid_gid.
gtf=Gtf(gtf_input)
tid_gid=gtf.get_tid_gid()


# Disabled legacy path that parsed the raw index through index_transform;
# the index file is now read directly as a Table (see below).
#index_content=Table(index_file,0,False)
#index_content.write_to_file('index_content')
#transform_index_content=index_content.cal(index_transform,'row')
#transform_index=Table.build_table(1,['tid','cnc_type','score','start','end','length'],transform_index_content)
# NOTE(review): the arguments 1 and True are presumably the key column and a
# header flag -- confirm against Table.__init__.
transform_index=Table(index_file,1,True)

# The join result is not assigned, so left_join is relied on to modify
# transform_index in place; the 'gid' column read below is expected to come
# from this join.
transform_index.left_join(1,tid_gid,1)


# Reduce to the ('gid', 'index') columns, drop duplicate rows and re-key.
# NOTE(review): key_by presumably collapses the 'index' labels of each gene
# into one comma-separated value (the selections below match
# 'coding,noncoding' and 'noncoding,coding') -- confirm against Table.key_by.
gid_cnctype=transform_index.get_col('gid','index')

gid_cnctype=gid_cnctype.de_redundency()

gid_cnctype=gid_cnctype.key_by('gid','index')


# Split the genes by their 'index' value: coding only, noncoding only, or
# both labels (in either order).
coding=gid_cnctype.eget('union','_index=coding')

noncoding=gid_cnctype.eget('union','_index=noncoding')

ambiguity=gid_cnctype.eget('union','_index=coding,noncoding','_index=noncoding,coding')


# Attach the exon information to every index row, then keep only the rows
# accepted by check_lncRNA. check_lncRNA reads its fields by position
# (2, 5 and 7), so it depends on the column order produced by the two joins.
exon_number=gtf.getExon()
transform_index.left_join(1,exon_number,1)

transform_index=transform_index.get_row_by_func(check_lncRNA)


# Derive the four gene-id lists.
all_gids=de_redundency(tid_gid.getCol('gid'))
C=coding.getCol('gid')
print "Novel coding genes:",len(C)
N=noncoding.getCol('gid')
AM=ambiguity.getCol('gid')
print "Ambiguous genes:",len(AM)
# A gene is a novel lincRNA when it is noncoding-only and at least one of its
# rows passed check_lncRNA.
lnc=de_redundency(transform_index.getCol('gid'))
lnc=intersect_array(N,lnc)
print "Novel lincRNA genes:",len(lnc)
# Everything that is neither coding, ambiguous nor an accepted lincRNA.
discard=sub_array(all_gids,C)
discard=sub_array(discard,AM)
discard=sub_array(discard,lnc)
print "Filter out noncoding genes:",len(discard)

# Write the gene -> class table. classify() names the columns
# ['gid', 'class']; set_colnames is then called with 'gene_id', 'class'
# before the table is written to Gene_Info.
gene_class=classify(C,lnc,AM,discard)
gene_class.set_colnames('gene_id','class')
gene_class.write_to_file(Gene_Info)

# One GTF per class, restricted to the transcripts of the genes in that class.
gtf.sub_gtf(gtf.get_tid(discard)).write_to_file(snc_gtf)
gtf.sub_gtf(gtf.get_tid(lnc)).write_to_file(lnc_gtf)
gtf.sub_gtf(gtf.get_tid(C)).write_to_file(coding_gtf)
gtf.sub_gtf(gtf.get_tid(AM)).write_to_file(cnc_gtf)

# Report the elapsed wall-clock time (whole seconds) and finish.
run_time=int(time.time() - start_time)
# Calling exit() with a string writes it to stderr and terminates with exit
# status 1, which made every successful run look like a failure to the
# calling shell or pipeline. Print the message on stdout (like "Run start:")
# and exit with status 0 instead.
print("Run complete: "+"%d seconds elapsed " % run_time)
sys.exit(0)