Exemplo n.º 1
0
def parse():
    list_obj = []
    for line in open(ifile,'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            obj = classGene.GFF3(line)
        
            if obj.types() == 'mRNA':
                obj_gene = classGene.LongestCodingIsoform(line, obj)
                list_obj.append(obj_gene)
            if obj.types() == 'exon':
                obj_gene.addExon(line, obj)
            if obj.types() == 'CDS':
                obj_gene.addCDS(line, obj)
                
    hash_geneID = {}
    for obj in list_obj:
        id, parent, exon_length, cds_length = obj.getLongestIsoform()
        
        if parent in hash_geneID:
            if hash_geneID[parent][1] < cds_length:
                hash_geneID[parent][1] = cds_length
                hash_geneID[parent][2] = exon_length
                hash_geneID[parent][0] = id
            elif hash_geneID[parent][1] == cds_length:
                if hash_geneID[parent][2] < exon_length:
                    hash_geneID[parent][2] = exon_length
                    hash_geneID[parent][0] = id
        else:
            hash_geneID[parent] = [id, cds_length, exon_length]
            
    
    hash_tid = {}
    for item in hash_geneID:
        hash_tid[hash_geneID[item][0]] = ''
        
    
    ### process the GFF3 file
    print_flag = False
    for line in open(ifile,'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            obj = classGene.GFF3(line)
            if obj.types() == 'gene':
                print line
            elif obj.types() == 'mRNA':
                if str(obj) in hash_tid:
                    print line
                    print_flag = True
                else:
                    print_flag = False
            elif print_flag == True:
                print line
Exemplo n.º 2
0
def hash_combined(hash_combine_ID):
    hash_combine = {}
    count = 0
    for line in open(combined_gff3, 'r'):
        line = line.strip()
        count += 1
        if re.search('ID=', line):
            if len(line) > 0 and not line.startswith('#'):
                if count % 100000 == 0:
                    print 'Hashing combined: ', '{:9,.0f}'.format(count)
                obj = classGene.GFF3(line)
                if obj.types() == 'mRNA':
                    ID = str(obj)
                    if ID in hash_combine_ID:
                        hash_combine[ID] = {}
                        hash_combine[ID]['mRNA'] = line
                        hash_combine[ID]['exon'] = ''
                        hash_combine[ID]['CDS'] = ''
                        hash_lines = True
                    else:
                        hash_lines = False
                elif obj.types() == 'exon':
                    if hash_lines == True:
                        hash_combine[ID]['exon'] += (',' + line)
                elif obj.types() == 'CDS':
                    if hash_lines == True:
                        hash_combine[ID]['CDS'] += (',' + line)
    return hash_combine
Exemplo n.º 3
0
def hash_last_gff3_CDS(chromosome, start_min, start_max):
    CDS = {}
    count = 0
    exons = {}
    hash_flag = False
    for line in open(last_gff3, 'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            ### print the lines processed
            count += 1
            if count % 100000 == 0:
                print 'Hashing CDS: ', '{:9,.0f}'.format(count)
            ### check if it is gff3 format line
            if re.search('ID=', line):
                obj = classGene.GFF3(line)
                if obj.types() == 'mRNA':
                    if obj.seqids() == chromosome and (start_min <= int(
                            obj.starts()) < start_max):
                        ID = str(obj)
                        CDS[ID] = 0
                        exons[ID] = []
                        hash_flag = True
                    else:
                        hash_flag = False
                if obj.types() == 'exon' and obj.seqids(
                ) == chromosome and hash_flag == True:
                    exons[ID].append((int(obj.starts()), int(obj.ends())))
                if obj.types() == 'CDS' and obj.seqids(
                ) == chromosome and hash_flag == True:
                    if ID in CDS:
                        CDS[ID] += (int(obj.ends()) - int(obj.starts()) + 1)

    return exons, CDS
def hash_last_gff3_CDS(chromosome):
    CDS = {}
    count = 0
    exons = {}
    last_gff3_temp = last_gff3 + '.temp'
    os.system('grep ' + chromosome + "'\t' " + last_gff3 + ' > ' +
              last_gff3_temp)
    for line in open(last_gff3_temp, 'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            ### print the lines processed
            count += 1
            if count % 100000 == 0:
                print 'Hashing CDS: ', '{:9,.0f}'.format(count)
            ### check if it is gff3 format line
            if re.search('ID=', line):
                obj = classGene.GFF3(line)
                if obj.types() == 'mRNA' and obj.seqids() == chromosome:
                    ID = str(obj)
                    CDS[ID] = 0
                    exons[ID] = []
                if obj.types() == 'exon' and obj.seqids() == chromosome:
                    exons[ID].append((int(obj.starts()), int(obj.ends())))
                if obj.types() == 'CDS' and obj.seqids() == chromosome:
                    if ID in CDS:
                        CDS[ID] += (int(obj.ends()) - int(obj.starts()) + 1)

    os.system('rm ' + last_gff3_temp)
    return exons, CDS
Exemplo n.º 5
0
def printOut(Hash):

    for line in open(gff3, 'r'):
        if len(line) > 0 and not line.startswith('#'):
            obj = classGene.GFF3(line)
            if obj.types() == FeatureType:
                if obj.seqids() in Hash:
                    print obj
Exemplo n.º 6
0
def parseGFF3(EC, Function, Name, GeneComment, ProductType, GO, SYNONYM,
              header, pf_out, g):
    """Write the per-transcript annotation records for one sequence.

    Reads the module-level ``gff3`` file and, for every mRNA located on
    ``header``, writes one record to ``pf_out``. Field labels come from the
    module-level ``attributes`` list, presumably
    ``['ID', 'NAME', 'STARTBASE', 'ENDBASE', 'PRODUCT-TYPE', 'SYNONYM',
    'GENE-COMMENT', 'FUNCTION', 'EC', 'GO', 'DBLINK', '//']`` (taken from the
    commented-out line in the original; confirm against the module).
    The genetic-element entry for ``header`` is written to ``g`` once, when
    the first matching mRNA is seen.

    Args:
        EC, Function, Name, GeneComment, ProductType, GO, SYNONYM: mappings
            keyed by transcript ID (``str(obj)``). ``EC`` and ``GO`` values
            are sequences of strings; each GO entry must contain at least one
            ``'|'``.
        header: sequence id whose transcripts are written.
        pf_out: writable file object for the annotation records. It is
            closed by this function.
        g: writable file object for the genetic-elements entry.
    """
    genetic_elem_write = True
    with open(gff3, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if not (obj.types() == 'mRNA' and obj.seqids() == header):
                continue
            t_id = str(obj)
            if genetic_elem_write:
                g.write('ID\t' + str(header) + '\n')
                g.write('Name\t' + str(header) + '\n')
                g.write('TYPE\t:CHRSM\n')
                g.write('CIRCULAR?\tN\n')
                g.write('ANNOT-FILE\t' + header + '.pf\n')
                g.write('SEQ-FILE\t' + header + '.fa\n')
                g.write('//' + '\n')
                genetic_elem_write = False

            pf_out.write('ID\t' + str(t_id) + '\n')
            # The name is the transcript ID without its last dot-suffix.
            pf_out.write(attributes[1] + '\t' +
                         '.'.join(t_id.split('.')[:-1]) + '\n')
            pf_out.write(attributes[2] + '\t' + str(obj.starts()) + '\n')
            pf_out.write(attributes[3] + '\t' + str(obj.ends()) + '\n')
            if t_id in ProductType:
                pf_out.write(attributes[4] + '\t' + str(ProductType[t_id]) +
                             '\n')
            else:
                # Default product type when none is supplied.
                pf_out.write(attributes[4] + '\t' + str('P') + '\n')
            if t_id in SYNONYM:
                pf_out.write(attributes[5] + '\t' + str(SYNONYM[t_id]) + '\n')
            # Comment, function and EC numbers are only written for
            # transcripts that also have a Name entry.
            if t_id in Name:
                if t_id in GeneComment:
                    pf_out.write(attributes[6] + '\t' +
                                 str(GeneComment[t_id]) + '\n')
                if t_id in Function:
                    pf_out.write(attributes[7] + '\t' + str(Function[t_id]) +
                                 '\n')
                if t_id in EC:
                    for ec_entry in EC[t_id]:
                        ec_no = ec_entry.replace('EC-', '')
                        # Pad three-level EC numbers to four levels.
                        if len(ec_no.split('.')) == 3:
                            ec_no += '.-'
                        pf_out.write(attributes[8] + '\t' + str(ec_no + '\n'))
            if t_id in GO:
                for go_entry in GO[t_id]:
                    pf_out.write(attributes[9] + '\t' + str(go_entry) + '\n')
                    pf_out.write(attributes[10] + '\tGO:' +
                                 str(go_entry).split('|')[1] + '\n')
            pf_out.write(attributes[11] + '\n')
    pf_out.close()
Exemplo n.º 7
0
def hashGFF3(chrom):
    """Return every base position covered by a CDS on ``chrom``.

    Reads the module-level ``ifile``.

    Args:
        chrom: sequence id to restrict to.

    Returns:
        dict whose keys are the int positions (start and end inclusive) of
        all CDS records on ``chrom``; every value is ``''``.
    """
    cds_coords = {}
    with open(ifile, 'r') as handle:
        for line in handle:
            line = line.strip()
            # Cheap prefix test first; it avoids parsing unrelated lines.
            if len(line) <= 1 or not line.startswith(chrom):
                continue
            obj = classGene.GFF3(line)
            # The prefix test alone lets 'chr1' also select 'chr10',
            # 'chr11', ...; require an exact sequence-id match.
            if obj.seqids() != chrom or obj.types() != 'CDS':
                continue
            for i in range(int(obj.starts()), int(obj.ends()) + 1):
                cds_coords[i] = ''
    return cds_coords
Exemplo n.º 8
0
def parse(hash_anno):
    
    for line in open(ifile,'r'):
        line = line.strip()
        if len(line)>0 and not line.startswith('#'):
            obj = classGene.GFF3(line)
            
            if obj.types() == 'mRNA':
                print line + ';'+'Annotation="'+hash_anno[str(obj)].replace(',','')+'"'
            else:
                print line
def parse():
    print_flag = False
    for line in open(infile, 'r'):
        line = line.strip()
        obj = classGene.GFF3(line)
        if obj.types() == 'gene':
            if re.search("Name=CUFF", line):
                print_flag = True
            else:
                print_flag = False
        if print_flag == True:
            print line
Exemplo n.º 10
0
def parse():
    print_flag = False
    for line in open(infile, 'r'):
        line = line.strip()
        obj = classGene.GFF3(line)
        if obj.types() == 'gene':
            token = line.split('\t')
            if token[1] == "CUFFLINKS":
                print_flag = True
            else:
                print_flag = False
        if print_flag == True:
            print line
Exemplo n.º 11
0
def hash_evidences(chromosome, exons, CDS, start_min, start_max):
    evidences = {}  ### hash exonic co-ordinates by evidence
    CDS_transcript = {}
    exons_transcript = {}
    count = 0
    hash_flag = False
    for line in open(evidences_gff3, 'r'):
        line = line.strip()
        count += 1
        if len(line) > 0 and not line.startswith('#'):
            if count % 100000 == 0:
                print 'Hashing evidence: ', '{:9,.0f}'.format(count)
            ### check if it is gff3 format line
            if re.search('ID=', line):
                obj = classGene.GFF3(line)
                if obj.types() == 'mRNA':
                    if obj.seqids() == chromosome and (start_min <= int(
                            obj.starts()) < start_max):
                        ID = str(obj)
                        CDS_transcript[ID] = 0
                        exons_transcript[ID] = []
                        hash_flag = True
                        ### print the lines processed
                        if obj.sources() not in evidences:
                            evidences[obj.sources()] = {}
                        for i in range(
                                int(obj.starts()) - int(gene_size_difference) -
                                1,
                                int(obj.starts()) + int(gene_size_difference) +
                                1):
                            evidences[obj.sources()][i] = str(obj)
                        for i in range(
                                int(obj.ends()) - int(gene_size_difference) -
                                1,
                                int(obj.ends()) + int(gene_size_difference) +
                                1):
                            evidences[obj.sources()][i] = str(obj)
                    else:
                        hash_flag = False
                if obj.types() == 'exon' and obj.seqids() == chromosome:
                    if hash_flag == True:
                        exons_transcript[ID].append(
                            (int(obj.starts()), int(obj.ends())))
                if obj.types() == 'CDS' and obj.seqids() == chromosome:
                    if hash_flag == True:
                        if ID in CDS_transcript:
                            CDS_transcript[ID] += (int(obj.ends()) -
                                                   int(obj.starts()) + 1)

    find_gene_overlaps(evidences, exons_transcript, CDS_transcript, exons, CDS,
                       chromosome, start_min, start_max)
Exemplo n.º 12
0
def print_intron(file):
    last_parent_ID = ''
    last_end = ''
    
    for line in open(file,'r'):
        line = line.strip()
        if len(line) > 1 and not line.startswith('#'):
            obj = classGene.GFF3(line)
            
            if obj.types() == 'exon':
                if  last_parent_ID == get_PARENT(line):
                    print abs(last_end - int(obj.starts()))
                last_parent_ID = get_PARENT(line)
                last_end = int(obj.ends())
Exemplo n.º 13
0
def hash_gff3():
    """Map gene IDs to the fifth ``|``-separated field of their annotation.

    Reads the module-level ``GFF3`` file. For each mRNA line carrying an
    ``Annotation="..."`` attribute, the parent ID returned by ``get_PARENT``
    (with ``clover_`` replaced by ``occidentale_``) is mapped to the fifth
    ``|``-separated field of the annotation text. mRNA lines without a usable
    annotation are skipped.

    Returns:
        dict mapping the rewritten parent ID to the annotation field.
    """
    HASH_GFF3 = {}
    with open(GFF3, 'r') as handle:
        for line in handle:
            line = line.strip()
            obj = classGene.GFF3(line)
            if obj.types() != 'mRNA':
                continue
            try:
                anno = re.search(r'Annotation=".+"',
                                 line).group(0).split('"')[1]
                g_id = (get_PARENT(line)).replace('clover_', 'occidentale_')
                HASH_GFF3[g_id] = anno.split('|')[4]
            except (AttributeError, IndexError):
                # AttributeError: no Annotation attribute (re.search gave
                # None); IndexError: fewer than five '|' fields. The former
                # bare 'except' also swallowed KeyboardInterrupt/SystemExit.
                continue
    return HASH_GFF3
Exemplo n.º 14
0
def hash_gff3(chromosome):
    """Map every base covered by an annotated mRNA on a chromosome to its annotation.

    Reads the module-level ``GFF3`` file. For each mRNA on ``chromosome``
    that carries an ``Annotation="..."`` attribute, every position from
    start to end (both inclusive) that is not yet present is mapped to the
    annotation text, so the first mRNA covering a position wins.

    Args:
        chromosome: sequence id to restrict to.

    Returns:
        dict mapping int position to annotation text.
    """
    HASH_GFF3 = {}
    with open(GFF3, 'r') as handle:
        for line in handle:
            line = line.strip()
            obj = classGene.GFF3(line)
            if not (obj.types() == 'mRNA' and obj.seqids() == chromosome):
                continue
            match = re.search(r'Annotation=".+"', line)
            if match is None:
                # mRNA without an annotation attribute.
                continue
            anno = match.group(0).split('"')[1]
            try:
                start = int(obj.starts())
                end = int(obj.ends())
            except (TypeError, ValueError):
                # Unparseable coordinates: skip, as the former bare
                # 'except' did, but without hiding unrelated errors.
                continue
            # GFF3 end coordinates are inclusive; the original range()
            # stopped one base short.
            for i in range(start, end + 1):
                if i not in HASH_GFF3:
                    HASH_GFF3[i] = anno
    return HASH_GFF3
Exemplo n.º 15
0
def change_gff3(prior_n):
    for line in open(infile, 'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            obj = classGene.GFF3(line)

            string = obj.seqids() + '\t' + \
            str(prior_n) + '\t' + \
            obj.types() + '\t' + \
            obj.starts() + '\t' + \
            obj.ends() + '\t' + \
            obj.scores() + '\t' + \
            obj.strands() + '\t' + \
            obj.phases() + '\t' + \
            obj.attributes()

            print string
Exemplo n.º 16
0
def parse(gff3):
                 
    gene_objs = []  
    for line in open(gff3,'r'):
        line = line.strip()    
        obj = classGene.GFF3(line)
        if obj.types()=="gene":
            LIobj = classGene.LongestIsoform(obj)
            gene_objs.append(LIobj)
        elif obj.types()=="mRNA":
            LIobj.add_mRNA(line, obj)
        else:
            LIobj.add_feature(line)
            
    for obj in gene_objs:
        print str(obj)
        print obj.features
Exemplo n.º 17
0
def get_exon_fraction(chrom, hash_call):
    HEADER = 'Lj30_ID\tExonLength\tCDSLength\tCallableExon\tCallableCDS'
    obj_list = []
    for line in open(ifile, 'r'):
        if len(line) > 1 and not line.startswith('#'):
            line = line.strip()
            obj = classGene.GFF3(line)
            if obj.types() == "mRNA":
                obj_mRNA = classmRNA.mRNA(line, obj)
                obj_list.append(obj_mRNA)
            if obj.types() == "mRNA" or obj.types() == "exon" or obj.types(
            ) == "CDS":
                obj_mRNA.AddData(line, obj)
    print HEADER
    for obj_mRNA in obj_list:
        print str(obj_mRNA), obj_mRNA.GetExonLength(), obj_mRNA.GetCDSLength(
        ), obj_mRNA.GetExonicOverlap(hash_call), obj_mRNA.GetCDSOverlap(
            hash_call)
def find_high_gene_density(chromosome, count, avg_gd):
    o_frag = open(infile + '.' + chromosome + '.frags.temp', 'w')
    last_start = -100000
    new_block = True
    correct_chro = False
    for line in open(infile, 'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            obj = classGene.GFF3(line)

            if obj.seqids() == chromosome:
                correct_chro = True
                if obj.types() == 'gene':
                    start = int(obj.starts())

                    if start - last_start < max_dist:
                        if max_size > int(obj.ends()) - int(obj.starts()):
                            region += int(obj.ends()) - int(obj.starts())
                            gene_count += 1
                            gene_id.append(str(obj))
                            new_block = False
                    else:

                        if new_block == False:
                            if gene_count * 1000 / float(
                                    gene_count
                            ) >= 5 * avg_gd and gene_count > 2:
                                print chromosome, block_start
                                for g_id in gene_id:
                                    o_frag.write(g_id + '\n')

                        gene_count = 0
                        region = int(obj.ends()) - int(obj.starts())
                        new_block = True
                        gene_id = []
                        if max_size > int(obj.ends()) - int(obj.starts()):
                            gene_id.append(str(obj))
                            gene_count += 1
                        block_start = int(obj.starts())

                    last_start = start
            if correct_chro == True and obj.seqids() != chromosome:
                break
    o_frag.close()
def hash_evidences(chromosome, exons, CDS):
    evidences = {}  ### hash exonic co-ordinates by evidence
    CDS_transcript = {}
    exons_transcript = {}
    count = 0
    evidences_gff3_temp = evidences_gff3 + '.temp'
    os.system('grep ' + chromosome + "'\t' " + evidences_gff3 + ' > ' +
              evidences_gff3_temp)
    for line in open(evidences_gff3_temp, 'r'):
        line = line.strip()
        count += 1
        if len(line) > 0 and not line.startswith('#'):
            if count % 100000 == 0:
                print 'Hashing evidence: ', '{:9,.0f}'.format(count)
            ### check if it is gff3 format line
            if re.search('ID=', line):
                obj = classGene.GFF3(line)
                if obj.types() == 'mRNA' and obj.seqids() == chromosome:
                    ID = str(obj)
                    CDS_transcript[ID] = 0
                    exons_transcript[ID] = []
                    ### print the lines processed
                    if obj.sources() not in evidences:
                        evidences[obj.sources()] = {}
                    for i in range(
                            int(obj.starts()) - int(gene_size_difference) - 1,
                            int(obj.starts()) + int(gene_size_difference) + 1):
                        evidences[obj.sources()][i] = str(obj)
                    for i in range(
                            int(obj.ends()) - int(gene_size_difference) - 1,
                            int(obj.ends()) + int(gene_size_difference) + 1):
                        evidences[obj.sources()][i] = str(obj)
                if obj.types() == 'exon' and obj.seqids() == chromosome:
                    exons_transcript[ID].append(
                        (int(obj.starts()), int(obj.ends())))
                if obj.types() == 'CDS' and obj.seqids() == chromosome:
                    if ID in CDS_transcript:
                        CDS_transcript[ID] += (int(obj.ends()) -
                                               int(obj.starts()) + 1)

    find_gene_overlaps(evidences, exons_transcript, CDS_transcript, exons, CDS,
                       chromosome)
    os.system('rm ' + evidences_gff3_temp)
Exemplo n.º 20
0
def hash_GFF3(chromosome):

    count = 0
    coords = {}
    for line in open(gff3, 'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            obj = classGene.GFF3(line)
            if obj.seqids() == chromosome:
                count += 1
                if count % 1000 == 0:
                    print 'Number of lines processed: ', chromosome, '{:9,.0f}'.format(
                        count)
                for i in range(int(obj.starts()), int(obj.ends()) + 1):
                    if obj.types() == 'mRNA':
                        coords[i] = 'mRNA'
                    if obj.types() == 'exon':
                        coords[i] = 'exon'
                    if obj.types() == 'CDS':
                        coords[i] = 'CDS'
    return coords
def find_avg_gene_density(chromosome, count, gene_count_hash):
    correct_chro = False
    gene_count = 0
    for line in open(infile, 'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            obj = classGene.GFF3(line)

            count += 1
            if count % 10000 == 0:
                print 'Number of lines processed: ', chromosome, '{:9,.0f}'.format(
                    count)

            if obj.seqids() == chromosome:
                correct_chro = True
                if obj.types() == 'gene':
                    gene_count += 1
            if correct_chro == True and obj.seqids() != chromosome:
                break

    gene_count_hash[chromosome] = gene_count
Exemplo n.º 22
0
def parseGFF3(EC, Function, Name, GeneComment, ProductType, GO, SYNONYM, pf_out, g):
    # Writes one genetic-element entry to ``g`` for every distinct sequence id
    # found in the module-level ``gff3`` file.
    # NOTE(review): this definition is truncated in the visible source (it
    # ends inside an unterminated triple-quoted string); the annotation
    # arguments and ``pf_out`` are not used in the visible part.
    ### Attributes
    #attributes = ['ID', 'NAME', 'STARTBASE', 'ENDBASE', 'PRODUCT-TYPE','SYNONYM','GENE-COMMENT','FUNCTION','EC','GO','DBLINK','//']
    # Sequence ids already written to ``g`` (used as a set; values are '').
    seq = {}
    for line in open(gff3, 'r'):
        line = line.strip()
        # Skip blank lines and comments.
        if len(line)>0 and not line.startswith('#'):
            obj = classGene.GFF3(line)                
            # First record of a sequence: emit its genetic-element entry.
            if obj.seqids() not in seq:
                header = obj.seqids()
                seq[obj.seqids()] = ''
                # Not used in the visible part of the function.
                t_id = str(obj)
                g.write('ID\t'+str(header)+'\n')
                g.write('Name\t'+str(header)+'\n')
                g.write('TYPE\t:CHRSM\n')
                g.write('CIRCULAR?\tN\n')
                g.write('ANNOT-FILE\t'+header+'.pf\n')
                g.write('SEQ-FILE\t'+header+'.fsa\n')
                # '//' terminates the entry.
                g.write('//'+'\n')
                        
                '''
Exemplo n.º 23
0
def hashGFF3(chrom):
    """Hash the CDS positions of the candidate transcript and align them.

    Reads the module-level ``ifile`` and collects every CDS record on lines
    starting with ``chrom`` whose parent equals the module-level
    ``candidate``. If any position was collected, ``hashAlignment`` is called
    with the covered positions and the coding start: the smallest CDS
    boundary on the ``+`` strand, otherwise the largest.

    Args:
        chrom: sequence id (matched as a line prefix) to restrict to.
    """
    cds_coords = {}
    cds_bound = {}
    # Strand of the candidate's own CDS records. The original read the
    # strand from whatever line was parsed last, which may belong to a
    # different gene on the opposite strand.
    strand = None
    with open(ifile, 'r') as handle:
        for line in handle:
            line = line.strip()
            if len(line) <= 1 or not line.startswith(chrom):
                continue
            obj = classGene.GFF3(line)
            if obj.types() == 'CDS' and obj.get_parent() == candidate:
                strand = obj.strands()
                cds_start = int(obj.starts())
                cds_end = int(obj.ends())
                cds_bound[cds_start] = ''
                cds_bound[cds_end] = ''
                for i in range(cds_start, cds_end + 1):
                    cds_coords[i] = ''

    if cds_coords:
        if strand == '+':
            start = min(cds_bound)
        else:
            start = max(cds_bound)
        hashAlignment(chrom, cds_coords, start)
Exemplo n.º 24
0
def get_exon_fraction(chrom, hash_call):
    first_transcript = True
    hash_exon = {}
    hash_cds = {}
    HEADER = 'Lj30_ID\tExonLength\tCDSLength\tCallableExon\tCallableCDS'
    print HEADER
    for line in open(ifile, 'r'):
        if len(line) > 1 and not line.startswith('#'):
            line = line.strip()
            obj = classGene.GFF3(line)
            if obj.seqids() == chrom:
                if obj.types() == "mRNA":
                    if first_transcript == False:
                        exon_len = len(hash_exon)
                        cds_len = len(hash_cds)
                        exon_call_len = 0
                        cds_call_len = 0
                        for i in hash_exon:
                            if i in hash_call:
                                exon_call_len += 1
                        for i in hash_cds:
                            if i in hash_call:
                                cds_call_len += 1
                        print id + '\t' + str(exon_len) + '\t' + str(
                            exon_call_len) + '\t' + str(cds_len) + '\t' + str(
                                cds_call_len)
                    first_transcript = False
                    hash_exon = {}
                    hash_cds = {}
                    id = str(obj)
                elif obj.types() == "exon":
                    for i in range(int(obj.starts()), int(obj.ends())):
                        hash_exon[i] = ''
                elif obj.types() == "CDS":
                    for i in range(int(obj.starts()), int(obj.ends())):
                        hash_cds[i] = ''
    print id + '\t' + str(exon_len) + '\t' + str(exon_call_len) + '\t' + str(
        cds_len) + '\t' + str(cds_call_len)
Exemplo n.º 25
0
def hash_coords(file, chr):
    """Feed the exon boundaries of every transcript on a chromosome to ``hash_coords_mRNA``.

    For each mRNA on ``chr`` the start and end of its exons are collected in
    a flat list and passed, together with the transcript's strand, to
    ``hash_coords_mRNA``, which returns the updated result tables.

    Fixes over the original:

    * a transcript was flushed with the strand of the *next* mRNA, because
      the strand was overwritten before the flush;
    * with no mRNA on ``chr`` the final flush raised ``NameError``; now the
      empty tables are returned.

    Args:
        file: path of the GFF3 file to read.
        chr: sequence id to restrict to.

    Returns:
        ``(coords_dis, coords_mRNA_len)`` as produced by
        ``hash_coords_mRNA``.
    """
    coords_dis = {}
    coords_mRNA_len = {}
    exons = []
    strand = None
    seen_mRNA = False
    with open(file, 'r') as handle:
        for line in handle:
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chr:
                continue
            feature = obj.types()
            if feature == "mRNA":
                if seen_mRNA:
                    # Flush the previous transcript with its own strand
                    # before switching to the new one.
                    coords_dis, coords_mRNA_len = hash_coords_mRNA(
                        coords_dis, exons, strand, coords_mRNA_len)
                    exons = []
                strand = obj.strands()
                seen_mRNA = True
            elif feature == "exon":
                exons.append(int(obj.starts()))
                exons.append(int(obj.ends()))

    ### for last mRNA
    if seen_mRNA:
        coords_dis, coords_mRNA_len = hash_coords_mRNA(
            coords_dis, exons, strand, coords_mRNA_len)

    return coords_dis, coords_mRNA_len
Exemplo n.º 26
0
def hashGFF3(chrom, align_hash):
    """Call ``processTranscript`` with the CDS positions of every transcript on ``chrom``.

    Reads the module-level ``ifile``. The CDS positions (start and end
    inclusive) following each mRNA line are collected and handed to
    ``processTranscript(chrom, cds_coords, tid, align_hash)`` when the next
    mRNA on the chromosome, or the end of the file, is reached.

    Fixes over the original:

    * the last transcript was only processed when the last line of the file
      happened to start with ``chrom``, so it was dropped for every
      chromosome that is not last in the file;
    * the line-prefix test alone let ``chr1`` also select ``chr10``; the
      parsed sequence id must now match exactly.

    Args:
        chrom: sequence id to restrict to.
        align_hash: forwarded unchanged to ``processTranscript``.
    """
    cds_coords = {}
    tid = None
    with open(ifile, 'r') as handle:
        for line in handle:
            line = line.strip()
            # Cheap prefix test first; it avoids parsing unrelated lines.
            if len(line) <= 1 or not line.startswith(chrom):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chrom:
                continue
            feature = obj.types()
            if feature == 'CDS':
                for i in range(int(obj.starts()), int(obj.ends()) + 1):
                    cds_coords[i] = ''
            elif feature == 'mRNA':
                # A new transcript starts: flush the previous one.
                if tid is not None:
                    processTranscript(chrom, cds_coords, tid, align_hash)
                tid = str(obj)
                cds_coords = {}

    # Flush the last transcript, if there was one.
    if tid is not None:
        processTranscript(chrom, cds_coords, tid, align_hash)
def hash_annotations(gff3, chro):
    """Build the exon/CDS/UTR/intron coordinate tables of one chromosome.

    For every mRNA on ``chro`` a ``classGeneStructure.GeneStructure`` is
    created and filled with the mRNA and its following exon and CDS records.
    Each completed structure is merged into the result tables by
    ``addCoords``.

    An exon or CDS record that precedes the first mRNA on ``chro`` raises
    ``NameError`` (unchanged). With no mRNA on ``chro`` the empty tables are
    returned; the original raised ``NameError`` in that case.

    Args:
        gff3: path of the GFF3 file to read.
        chro: sequence id to restrict to.

    Returns:
        ``(exons, cds, utr, intron, mRNA_type)`` as produced by
        ``addCoords``.
    """
    exons = {}
    cds = {}
    utr = {}
    intron = {}
    mRNA_type = {}
    seen_mRNA = False

    with open(gff3, 'r') as handle:
        for line in handle:
            line = line.strip()
            if len(line) <= 1 or line.startswith('#'):
                continue
            obj = classGene.GFF3(line)
            if obj.seqids() != chro:
                continue
            feature = obj.types()
            if feature == 'mRNA':
                # A new transcript starts: merge the previous one.
                if seen_mRNA:
                    exons, cds, utr, intron, mRNA_type = addCoords(
                        gs, exons, cds, utr, intron, mRNA_type)
                seen_mRNA = True
                gs = classGeneStructure.GeneStructure(obj)
                gs.addmRNA(obj)
            elif feature == 'exon':
                gs.addexon(obj)
            elif feature == 'CDS':
                gs.addcds(obj)

    # Merge the last transcript, if there was one.
    if seen_mRNA:
        exons, cds, utr, intron, mRNA_type = addCoords(
            gs, exons, cds, utr, intron, mRNA_type)

    return exons, cds, utr, intron, mRNA_type
Exemplo n.º 28
0
def find_gene_overlaps(evidences, exons_transcript, CDS_transcript, exons, CDS,
                       chromosome, start_min, start_max):
    count = 0
    for line in open(last_gff3, 'r'):
        line = line.strip()
        if len(line) > 0 and not line.startswith('#'):
            count += 1
            if count % 10000 == 0:
                print 'find_gene_overlaps: ', '{:9,.0f}'.format(count)
            obj = classGene.GFF3(line)
            if obj.types() == 'mRNA' and obj.seqids() == chromosome and (
                    start_min <= int(obj.starts()) < start_max):
                ID = str(obj)
                ### find the best replacement
                delta_CDS = []
                evi_list = []
                larger_found = False
                for i in evidences:
                    if int(obj.starts()) in evidences[i] and int(
                            obj.ends()) in evidences[i]:
                        if ID != evidences[i][int(
                                obj.starts())] and ID != evidences[i][int(
                                    obj.ends())]:
                            evidence_id = evidences[i][int(obj.starts())]
                            if CDS[ID] + float(min_coding_differences) * CDS[
                                    ID] < CDS_transcript[evidence_id] and CDS[
                                        ID] >= 0:
                                ### make sure that two IDs are isoformorms
                                if '.'.join(ID.split('.')[:-1]) != '.'.join(
                                        evidence_id.split('.')[:-1]):
                                    ### check the exonic difference
                                    set_exons_transcript = {}
                                    for (start,
                                         end) in exons_transcript[evidence_id]:
                                        for j in range(start, end + 1):
                                            set_exons_transcript[j] = ''
                                    set_exons_transcript = set(
                                        set_exons_transcript)
                                    set_exons = {}
                                    for start, end in exons[ID]:
                                        for j in range(start, end + 1):
                                            set_exons[j] = ''
                                    set_exons = set(set_exons)
                                    if abs(
                                            int(
                                                len(set_exons_transcript -
                                                    set_exons))
                                    ) < int(max_exonic_differences):
                                        delta_CDS.append(
                                            CDS_transcript[evidence_id] -
                                            CDS[ID])
                                        evi_list.append(
                                            ID + '\t' + evidence_id + '\t' +
                                            i + '\t' + str(obj.seqids()) +
                                            '\t' + str(obj.starts()) + '\t' +
                                            str(CDS_transcript[evidence_id] -
                                                CDS[ID]) + '\t' + str(
                                                    len(set_exons_transcript -
                                                        set_exons)))
                                        larger_found = True
                if larger_found == True:
                    outfile.write(evi_list[delta_CDS.index(min(delta_CDS))] +
                                  '\n')
Exemplo n.º 29
0
def _print_fixed_gene(gene, gene_start, gene_end):
    """Print a buffered gene with corrected coordinates, then its children.

    gene       -- buffered GFF3 lines; gene[0] is the gene record itself
    gene_start -- start coordinate to write into the gene record
    gene_end   -- end coordinate to write into the gene record
    """
    obj = classGene.GFF3(gene[0])
    print('\t'.join([
        obj.seqids(),
        obj.sources(),
        obj.types(),
        str(gene_start),
        str(gene_end),
        obj.scores(),
        obj.strands(),
        str(obj.phases()),
        obj.attributes(),
    ]))
    for child in gene[1:]:
        print(child)


def FixBoundries():
    """Make gene and child-feature coordinates consistent with their mRNAs.

    Reads the GFF3 file named by the module-level ``infile`` and prints the
    corrected records to standard output:

    * every gene record is widened so that it covers all of its mRNAs;
    * every other feature (anything that is neither ``gene`` nor ``mRNA``)
      whose start and/or end lies outside the most recently seen mRNA gets
      the offending coordinate(s) replaced by that mRNA's boundary.

    Blank lines and lines starting with ``#`` are skipped and not echoed.
    The input is expected to list each gene before its mRNAs and each mRNA
    before its child features.

    Prints nothing when the file contains no records.
    """
    gene = []  # buffered lines of the gene currently being processed
    with open(infile, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            obj = classGene.GFF3(line)
            feature_type = obj.types()

            if feature_type == 'gene':
                # A new gene begins: the previous one is complete, flush it.
                if gene:
                    _print_fixed_gene(gene, gene_start, gene_end)
                gene = [line]
                gene_start = int(obj.starts())
                gene_end = int(obj.ends())

            elif feature_type == 'mRNA':
                gene.append(line)
                mRNA_start = int(obj.starts())
                mRNA_end = int(obj.ends())
                # Widen the gene so it spans this transcript.
                gene_start = min(gene_start, mRNA_start)
                gene_end = max(gene_end, mRNA_end)

            else:
                start_text = obj.starts()
                end_text = obj.ends()
                fix_start = not (mRNA_start <= int(start_text) <= mRNA_end)
                fix_end = not (mRNA_start <= int(end_text) <= mRNA_end)
                if fix_start or fix_end:
                    # Correct both coordinates before rebuilding the line so
                    # that fixing the end cannot undo the fix of the start.
                    if fix_start:
                        start_text = str(mRNA_start)
                    if fix_end:
                        end_text = str(mRNA_end)
                    line = '\t'.join([
                        obj.seqids(),
                        obj.sources(),
                        obj.types(),
                        start_text,
                        end_text,
                        obj.scores(),
                        obj.strands(),
                        str(obj.phases()),
                        obj.attributes(),
                    ])
                gene.append(line)

    # Flush the last gene; nothing to do for an input without records.
    if gene:
        _print_fixed_gene(gene, gene_start, gene_end)
Exemplo n.º 30
0
def make_gff3(reaplcement_IDs, hash_combine):
    """Write ``<last_gff3>.replaced`` with selected transcripts swapped out.

    Copies the GFF3 file named by the module-level ``last_gff3`` to
    ``last_gff3 + '.replaced'``.  Blank lines and lines starting with ``#``
    are not copied.  Gene records are always copied unchanged.

    reaplcement_IDs -- dict mapping the string form of an mRNA record
                       (``str(obj)``, presumably its ID) to a key of
                       ``hash_combine``
    hash_combine    -- dict of dicts; each entry provides ``'mRNA'`` (one
                       tab-delimited GFF3 line) and ``'exon'`` / ``'CDS'``
                       (comma-separated tab-delimited GFF3 lines whose first
                       comma-separated element is ignored)

    For an mRNA listed in ``reaplcement_IDs`` whose replacement lies on the
    same seqid, the replacement mRNA/exon/CDS records are written instead,
    renamed to the original mRNA ID, given the current gene's source and
    re-parented to the current gene; the original child features of that
    mRNA are dropped.  Every other mRNA -- including one whose replacement
    lies on a different seqid -- is copied unchanged together with its
    child features.
    """
    # Whether child features of the current transcript are copied through.
    # Starts as True and is reset at every gene so that features that do not
    # follow a replaced mRNA are never suppressed by a stale value.
    print_flag = True

    # The output is opened first, as before, so it is created/truncated even
    # when the input turns out to be unreadable.
    with open(last_gff3 + '.replaced', 'w') as out, \
            open(last_gff3, 'r') as gff3_in:
        for count, line in enumerate(gff3_in, 1):
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            if count % 100000 == 0:
                print('Printing final GFF3:  ' + '{:9,.0f}'.format(count))

            obj = classGene.GFF3(line)
            feature_type = obj.types()

            if feature_type == "gene":
                out.write(line + '\n')
                source = obj.sources()
                g_id = str(obj)
                print_flag = True
                continue

            if feature_type != "mRNA":
                if print_flag:
                    out.write(line + '\n')
                continue

            ID = str(obj)
            use_replacement = False
            if ID in reaplcement_IDs:
                old_id = reaplcement_IDs[ID]
                model = hash_combine[old_id]
                mrna = model['mRNA'].split('\t')
                # A replacement on another sequence cannot be applied.
                use_replacement = obj.seqids() == mrna[0]

            if not use_replacement:
                # Keep the original transcript and its children.
                print_flag = True
                out.write(line + '\n')
                continue

            # The original children are superseded by the replacement model.
            print_flag = False

            ### print new mRNA
            mrna_attributes = (mrna[8].split("Parent=")[0]).replace(
                old_id, ID) + "Parent=" + g_id + ";Name=" + ID
            out.write('\t'.join([mrna[0], source] + mrna[2:8] +
                                [mrna_attributes]) + '\n')

            ### print exon lines of the replacement model
            for exon_line in model['exon'].split(',')[1:]:
                token = exon_line.replace(old_id, ID).split('\t')
                out.write(token[0] + '\t' + source + '\t' +
                          '\t'.join(token[2:]) + '\n')

            ### print CDS lines of the replacement model
            for CDS_count, cds_line in enumerate(
                    model['CDS'].split(',')[1:], 1):
                token = cds_line.replace(old_id, ID).split('\t')
                cds_attributes = (token[8].replace(old_id, ID)).split(
                    "ID=")[0] + "ID=" + ID + '.CDS.' + str(CDS_count) + \
                    ";Parent=" + ID
                out.write('\t'.join([token[0], source] + token[2:8] +
                                    [cds_attributes]) + '\n')