예제 #1
0
def assign_utrornongene3p_inwpcbgs(inwpcbgs,verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get most likely first & final inwpCBG pointer in inwpcbgs list
    posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)

    # return variable list
    noncoding_inwpcbg_list = []

    # get data of most likely first inwpCBG
    max_cntAnnot        = max([inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs ])
    finalInwpCBG        = inwpcbgs[posFinal]
    final_cnt_is_final  = finalInwpCBG.count_orfs_labeled_as_final_exon()
    final_identityscore = finalInwpCBG.get_identityscore()
    final_prjtls_aadif  = finalInwpCBG.get_projected_tailing_stop_aa_difference()
    final_prjtls_nonad  = finalInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()

    # range of inwpCBGs which are checked for deletion
    range_3p_test = range(posFinal+1,len(inwpcbgs))

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0,len(inwpcbgs)):
        if pos not in range_3p_test: continue

        # get this inwpCBG and get statistics
        inwpCBG = inwpcbgs[pos]
        cntFinal = inwpCBG.count_orfs_labeled_as_first_exon()
        # calculated differntly as for *the* firstCBG
        cntAnnot = float(inwpCBG.organism_set_size())

        # break when to putatively first is reached
        if cntFinal == final_cnt_is_final: break

        # remove poorly covered inwpCBGs with low identityscore and not
        # having a likely stop codon
        if cntAnnot/max_cntAnnot < 0.80 and\
        inwpCBG.get_identityscore() / final_identityscore <=\
        NONCODINGNONGENE_3p_INWPCBG_MAX_IDENTITYRATIO and\
        inwpCBG.organism_set_size() < final_cnt_is_final and\
        final_prjtls_aadif < inwpCBG.get_projected_tailing_stop_aa_difference() and\
        final_prjtls_nonad < inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference():
            # not contribution to the gene structure at all....
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # check relative position towards the current finalInwpCBG
        # position is measured in actual nt distance and tcode 'distance':
        # the lowest scoring TCODE window in between these inwpCBGs
        ntdistdict    = finalInwpCBG.nt_spacing_between_codingblocks([inwpCBG])
        tcodedistdict = finalInwpCBG.tcode_spacing_between_codingblocks([inwpCBG])

        # remove highest & lowest distance and then do stats on remaining dists
        if len(ntdistdict) >= 3:
            _tmp = [ (v,k) for k,v in ntdistdict.iteritems() ]
            _tmp.sort()
            del( ntdistdict[_tmp[0][1]] )
            del( ntdistdict[_tmp[-1][1]] )
        if len(tcodedistdict) >= 3:
            _tmp = [ (v,k) for k,v in tcodedistdict.iteritems() ]
            _tmp.sort()
            del( tcodedistdict[_tmp[0][1]] )
            del( tcodedistdict[_tmp[-1][1]] )

        # do 3 checks.
        # 1) are average and maximum intergenic distances are bridged?
        # 2) is stop codon projection a deterioration?
        # 4) does tcodedistance suggest bridging of a non-coding stretch?
        check_1 = sum(ntdistdict.values())/len(ntdistdict) >=\
                  AVERAGE_INTERGENIC_MIN_NT_LENGTH and\
                  max(ntdistdict.values()) >= MAX_INTERGENIC_MIN_NT_LENGTH
        check_2 = final_prjtls_aadif <\
                  inwpCBG.get_projected_tailing_stop_aa_difference() and\
                  final_prjtls_nonad <\
                  inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()
        check_3 = len(tcodedistdict)>0 and\
                  sum(tcodedistdict.values())/len(tcodedistdict) <=\
                  TCODE_MAX_NONCODING

        if [ check_1, check_2, check_3 ].count(True) >= 2:
            # not contribution to the gene structure at all....
            noncoding_inwpcbg_list.append(inwpCBG)
            continue


        # do is_coding() test
        iscoding = inwpCBG.is_coding()

        ########################################################################
        if verbose:
            print pos, "3'UTR analyses:", inwpCBG, iscoding,
            print cntAnnot/max_cntAnnot
        ########################################################################

        if not iscoding:
            # probably non-coding inwp CBG alignment block
            noncoding_inwpcbg_list.append(inwpCBG)
            continue


    # return the noncoding_inwpcbg_list
    return noncoding_inwpcbg_list
예제 #2
0
def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,GTG,exclude_annotated=True,verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    if exclude_annotated:
        # get most likely first & final inwpCBG pointer in inwpcbgs list
        posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)
        range_5p_test = range(0,posFirst)
        range_3p_test = range(posFinal+1,len(inwpcbgs))
        protected_target_orfid_list = []
        for inwpCBG in inwpcbgs[posFirst:posFinal+1]:
            if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0:
                protected_target_orfid_list.append( inwpCBG.get_orfs_of_graph(organism=target)[0].id )
    else:
        range_5p_test = []
        range_3p_test = []
        protected_target_orfid_list = []

    ############################################################################
    if verbose and exclude_annotated:
        print "NOT-excluded:", range_5p_test, range_3p_test
    ############################################################################

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0,len(inwpcbgs)):
        if exclude_annotated and pos in range_5p_test:
            pass
        elif exclude_annotated and pos in range_3p_test:
            pass
        elif exclude_annotated and inwpcbgs[pos].count_orfs_labeled_as_annotated_exon() == 0:
            # in the middle of the annotated geen structure, but not a single
            # Orf annotated as an exon. Asses for gtg difference too!
            pass
        elif exclude_annotated:
            continue
        else:
            pass


        # get this inwpCBG and 
        thisInwpCBG = inwpcbgs[pos]

        # ignore if the target's Orf is belonging to a `protected` Orf
        if protected_target_orfid_list and\
        thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in\
        protected_target_orfid_list:
            continue

        # ignore inwpCBGs which are very likely (poor quality) SignalP alignments
        cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides())
        if cntSP/(thisInwpCBG.count_genomic_informants()+1) > 0.66 and\
        thisInwpCBG.get_signalp_score() > 0.75:
            continue

        # create its GeneTreeGraph
        gtg = pcg2gtg_by_identity(thisInwpCBG,target)

        # step 1. Do the gtg/GTG difference check
        difference = _relative_gtg_difference(gtg,GTG,target)

        if difference < NONGENE_GTG_MAX_DIFFERENCE:
            # step 2. Do the CEXPANDER check
            if thisInwpCBG.node_count() <= 2:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference,NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference,NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            else:
                # cexpander check is succesfull, GTGdifference claims
                # the aligment is bogus. Do a more elaborate check on
                # some other variables of thisInwpCBG

                # calculate the difference between minsr & maxsr lengths
                node      = thisInwpCBG.get_organism_nodes(target)[0]
                minsr     = thisInwpCBG.minimal_spanning_range_sizes()[node]
                maxsr     = thisInwpCBG.maximal_spanning_range_sizes()[node]
                msr_ratio = float(minsr)/float(maxsr)

                # calculate the ratio between average weights of gtg and GTG
                average_wt_gtg = _pairwise_gtg_average_weight(gtg,target)
                average_wt_GTG = _pairwise_gtg_average_weight(GTG,target)
                gtg_ratio = average_wt_gtg / average_wt_GTG

                if msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and\
                gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO:
                    gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                    ################################################################
                    if verbose:
                        print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                            difference,NONGENE_GTG_MAX_DIFFERENCE),
                        print thisInwpCBG.get_organism_nodes(target)[0]
                    ################################################################
                else:
                    pass
        else:
            pass


    # return the gtgdiscrepancy_inwpcbg_list
    return gtgdiscrepancy_inwpcbg_list
예제 #3
0
def assign_utrornongene5p_inwpcbgs(inwpcbgs,verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get most likely first & final inwpCBG pointer in inwpcbgs list
    posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)

    # return variable list
    noncoding_inwpcbg_list = []

    # get data of most likely first inwpCBG
    max_cntAnnot        = max([inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs ])
    firstInwpCBG        = inwpcbgs[posFirst]
    first_cnt_is_first  = firstInwpCBG.count_orfs_labeled_as_first_exon()
    first_identityscore = firstInwpCBG.get_identityscore()
    first_upstrTSScnt   = [ pf.has_upstream_tss() for pf in firstInwpCBG.pacbps.values() ].count(True)
    if (max_cntAnnot-1) == 0:
        # avoid ZeroDivisionError
        first_upstrTSSratio = 0.0
    else:
        first_upstrTSSratio = float(first_upstrTSScnt) / (max_cntAnnot-1)

    # range of inwpCBGs which are checked for deletion
    range_5p_test = range(0,posFirst)

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0,len(inwpcbgs)):
        if pos not in range_5p_test: continue

        # get this inwpCBG and get statistics
        inwpCBG = inwpcbgs[pos]
        cntFirst = inwpCBG.count_orfs_labeled_as_first_exon()
        # calculated differently as for *the* firstCBG
        cntAnnot = float(inwpCBG.organism_set_size())

        # break when to putatively first is reached
        if cntFirst == first_cnt_is_first: break

        # do is_coding() test
        iscoding = inwpCBG.is_coding()
        # calculate cnt/ratio of upstrTSS sites
        this_upstrTSScnt   = [ pf.has_upstream_tss() for pf in inwpCBG.pacbps.values() ].count(True)
        if (max_cntAnnot-1) == 0:
            this_upstrTSSratio = 0.0
        else:
            this_upstrTSSratio = float(this_upstrTSScnt) / (max_cntAnnot-1.0)

        ########################################################################
        if verbose:
            print pos, range_5p_test, "5'UTR analyses:", inwpCBG, iscoding,
            print "coverage:",cntAnnot/max_cntAnnot,
            print "upstrTSS cnt - ratio: %s - %1.2f" % (
                    this_upstrTSScnt, this_upstrTSSratio)
        ########################################################################

        if not iscoding:
            # inwpCBGs most likely not coding alignments -> remove
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # check relative position towards the current firstInwpCBG
        # position is measured in actual nt distance and tcode 'distance':
        # the lowest scoring TCODE window in between these inwpCBGs
        tcodedistdict = inwpCBG.tcode_spacing_between_codingblocks([firstInwpCBG])

        if len(tcodedistdict) >= 3:
            _tmp = [ (v,k) for k,v in tcodedistdict.iteritems() ]
            _tmp.sort()
            del( tcodedistdict[_tmp[0][1]] )
            del( tcodedistdict[_tmp[-1][1]] )


        ########################################################################
        if verbose and len(tcodedistdict) >= 1:
            print pos, sum(tcodedistdict.values())/len(tcodedistdict),
            print TCODE_MAX_NONCODING, firstInwpCBG
        ########################################################################

        # continue when coverage is to high
        if cntAnnot/max_cntAnnot >= 0.40: continue

        if len(tcodedistdict)>0 and\
        sum(tcodedistdict.values())/len(tcodedistdict) <= TCODE_MAX_NONCODING:
            noncoding_inwpcbg_list.append(inwpCBG)
            continue


        if this_upstrTSScnt == 0:
            # no upstream TSS sites at all -> remove!
            noncoding_inwpcbg_list.append(inwpCBG)
            continue
    
        # do a furter check for unlikely first inwpCBG blocks
        if inwpCBG.get_identityscore() / first_identityscore <=\
        NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\
        inwpCBG.organism_set_size() < first_cnt_is_first and\
        first_upstrTSSratio==0.0:
            noncoding_inwpcbg_list.append(inwpCBG)
            continue


        # do a furter check for unlikely first inwpCBG blocks
        if inwpCBG.get_identityscore() / first_identityscore <=\
        NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\
        inwpCBG.organism_set_size() < first_cnt_is_first and\
        (first_upstrTSSratio!=0.0 and (this_upstrTSSratio / first_upstrTSSratio) < 0.6):
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # do a final check for unlikely first inwpCBG blocks
        # all parameters must be (slightly) poorer
        if inwpCBG.get_identityscore() < first_identityscore and\
        inwpCBG.organism_set_size() < first_cnt_is_first and\
        inwpCBG.get_average_upstream_methionine_pssm_score() <\
        firstInwpCBG.get_average_upstream_methionine_pssm_score():
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

    # return the noncoding_inwpcbg_list
    return noncoding_inwpcbg_list
예제 #4
0
def assign_internal_nongene_alignments(inwpcbgs,GTG,exclude_annotated=False,verbose=True):
    """
    TODO TODO: this function must be moved to another location.
    TODO TODO: better place in inwpCBGs/blocks filtering
    """ 
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    # get most likely first & final inwpCBG pointer in inwpcbgs list
    posFirst,posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)

    # check if posFirst,posFinal+1 isa non-empty range
    if not range(posFirst,posFinal+1): return [] 

    # get info on the *best* covered inwpCBG
    best_nt_identity = max([ inwpcbgs[pos].get_nt_identity() for pos in range(posFirst,posFinal+1) ])
    best_bitscore    = max([ inwpcbgs[pos].get_bitscore() for pos in range(posFirst,posFinal+1) ])
    best_annot_cnt   = max([ inwpcbgs[pos].count_orfs_labeled_as_annotated_exon() for pos in range(posFirst,posFinal+1) ])
    best_bits_per_aa = max([ float( inwpcbgs[pos].get_bitscore() ) / float( sum([pf.get_unextended_length() for pf in inwpcbgs[pos].pacbps.values() ]) ) for pos in range(posFirst,posFinal+1) ])

    for pos in range(posFirst,posFinal+1):
        # get this inwpCBG and 
        thisInwpCBG = inwpcbgs[pos]
        if pos > 0: prevInwpCBG = inwpcbgs[pos-1]
        else:       prevInwpCBG = None
        if pos < len(inwpcbgs)-1: nextInwpCBG = inwpcbgs[pos+1]
        else:                     nextInwpCBG = None

        if prevInwpCBG and Set(prevInwpCBG.get_nodes()).intersection(thisInwpCBG.get_nodes()):
            continue
        if nextInwpCBG and Set(nextInwpCBG.get_nodes()).intersection(thisInwpCBG.get_nodes()):
            continue

        tot_length  = sum([pf.get_unextended_length() for pf in thisInwpCBG.pacbps.values() ])
        bits        = thisInwpCBG.get_bitscore()
        bits_per_aa = float(bits)/float(tot_length)


        if bits_per_aa/best_bits_per_aa < 0.55 and\
        float(thisInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
        float(best_annot_cnt) <= 0.50:
            minsr = thisInwpCBG.minimal_spanning_range(organism=target)
            print " __XX__", pos, thisInwpCBG, thisInwpCBG.get_nt_identity(), bits, float(bits)/float(tot_length), thisInwpCBG.count_orfs_labeled_as_annotated_exon()
            if prevInwpCBG and len(prevInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)) == len(minsr): 
                ################################################################
                if verbose:
                    print "PREV::", len(prevInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)), len(minsr)
                ################################################################
                gtgdiscrepancy_inwpcbg_list.append( thisInwpCBG )
                continue
            if nextInwpCBG and len(nextInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)) == len(minsr):
                ################################################################
                if verbose:
                    print "NEXT::", len(nextInwpCBG.maximal_spanning_range(organism=target).intersection(minsr)), len(minsr)
                ################################################################
                gtgdiscrepancy_inwpcbg_list.append( thisInwpCBG )
                continue

    # return list with conflicts
    return gtgdiscrepancy_inwpcbg_list
예제 #5
0
def assign_utrornongene3p_inwpcbgs(inwpcbgs, verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get most likely first & final inwpCBG pointer in inwpcbgs list
    posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)

    # return variable list
    noncoding_inwpcbg_list = []

    # get data of most likely first inwpCBG
    max_cntAnnot = max(
        [inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs])
    finalInwpCBG = inwpcbgs[posFinal]
    final_cnt_is_final = finalInwpCBG.count_orfs_labeled_as_final_exon()
    final_identityscore = finalInwpCBG.get_identityscore()
    final_prjtls_aadif = finalInwpCBG.get_projected_tailing_stop_aa_difference(
    )
    final_prjtls_nonad = finalInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference(
    )

    # range of inwpCBGs which are checked for deletion
    range_3p_test = range(posFinal + 1, len(inwpcbgs))

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0, len(inwpcbgs)):
        if pos not in range_3p_test: continue

        # get this inwpCBG and get statistics
        inwpCBG = inwpcbgs[pos]
        cntFinal = inwpCBG.count_orfs_labeled_as_first_exon()
        # calculated differntly as for *the* firstCBG
        cntAnnot = float(inwpCBG.organism_set_size())

        # break when to putatively first is reached
        if cntFinal == final_cnt_is_final: break

        # remove poorly covered inwpCBGs with low identityscore and not
        # having a likely stop codon
        if cntAnnot/max_cntAnnot < 0.80 and\
        inwpCBG.get_identityscore() / final_identityscore <=\
        NONCODINGNONGENE_3p_INWPCBG_MAX_IDENTITYRATIO and\
        inwpCBG.organism_set_size() < final_cnt_is_final and\
        final_prjtls_aadif < inwpCBG.get_projected_tailing_stop_aa_difference() and\
        final_prjtls_nonad < inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference():
            # not contribution to the gene structure at all....
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # check relative position towards the current finalInwpCBG
        # position is measured in actual nt distance and tcode 'distance':
        # the lowest scoring TCODE window in between these inwpCBGs
        ntdistdict = finalInwpCBG.nt_spacing_between_codingblocks([inwpCBG])
        tcodedistdict = finalInwpCBG.tcode_spacing_between_codingblocks(
            [inwpCBG])

        # remove highest & lowest distance and then do stats on remaining dists
        if len(ntdistdict) >= 3:
            _tmp = [(v, k) for k, v in ntdistdict.iteritems()]
            _tmp.sort()
            del (ntdistdict[_tmp[0][1]])
            del (ntdistdict[_tmp[-1][1]])
        if len(tcodedistdict) >= 3:
            _tmp = [(v, k) for k, v in tcodedistdict.iteritems()]
            _tmp.sort()
            del (tcodedistdict[_tmp[0][1]])
            del (tcodedistdict[_tmp[-1][1]])

        # do 3 checks.
        # 1) are average and maximum intergenic distances are bridged?
        # 2) is stop codon projection a deterioration?
        # 4) does tcodedistance suggest bridging of a non-coding stretch?
        check_1 = sum(ntdistdict.values())/len(ntdistdict) >=\
                  AVERAGE_INTERGENIC_MIN_NT_LENGTH and\
                  max(ntdistdict.values()) >= MAX_INTERGENIC_MIN_NT_LENGTH
        check_2 = final_prjtls_aadif <\
                  inwpCBG.get_projected_tailing_stop_aa_difference() and\
                  final_prjtls_nonad <\
                  inwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()
        check_3 = len(tcodedistdict)>0 and\
                  sum(tcodedistdict.values())/len(tcodedistdict) <=\
                  TCODE_MAX_NONCODING

        if [check_1, check_2, check_3].count(True) >= 2:
            # not contribution to the gene structure at all....
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # do is_coding() test
        iscoding = inwpCBG.is_coding()

        ########################################################################
        if verbose:
            print pos, "3'UTR analyses:", inwpCBG, iscoding,
            print cntAnnot / max_cntAnnot
        ########################################################################

        if not iscoding:
            # probably non-coding inwp CBG alignment block
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

    # return the noncoding_inwpcbg_list
    return noncoding_inwpcbg_list
예제 #6
0
def assign_utrornongene5p_inwpcbgs(inwpcbgs, verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get most likely first & final inwpCBG pointer in inwpcbgs list
    posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)

    # return variable list
    noncoding_inwpcbg_list = []

    # get data of most likely first inwpCBG
    max_cntAnnot = max(
        [inwp.count_orfs_labeled_as_annotated_exon() for inwp in inwpcbgs])
    firstInwpCBG = inwpcbgs[posFirst]
    first_cnt_is_first = firstInwpCBG.count_orfs_labeled_as_first_exon()
    first_identityscore = firstInwpCBG.get_identityscore()
    first_upstrTSScnt = [
        pf.has_upstream_tss() for pf in firstInwpCBG.pacbps.values()
    ].count(True)
    if (max_cntAnnot - 1) == 0:
        # avoid ZeroDivisionError
        first_upstrTSSratio = 0.0
    else:
        first_upstrTSSratio = float(first_upstrTSScnt) / (max_cntAnnot - 1)

    # range of inwpCBGs which are checked for deletion
    range_5p_test = range(0, posFirst)

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0, len(inwpcbgs)):
        if pos not in range_5p_test: continue

        # get this inwpCBG and get statistics
        inwpCBG = inwpcbgs[pos]
        cntFirst = inwpCBG.count_orfs_labeled_as_first_exon()
        # calculated differently as for *the* firstCBG
        cntAnnot = float(inwpCBG.organism_set_size())

        # break when to putatively first is reached
        if cntFirst == first_cnt_is_first: break

        # do is_coding() test
        iscoding = inwpCBG.is_coding()
        # calculate cnt/ratio of upstrTSS sites
        this_upstrTSScnt = [
            pf.has_upstream_tss() for pf in inwpCBG.pacbps.values()
        ].count(True)
        if (max_cntAnnot - 1) == 0:
            this_upstrTSSratio = 0.0
        else:
            this_upstrTSSratio = float(this_upstrTSScnt) / (max_cntAnnot - 1.0)

        ########################################################################
        if verbose:
            print pos, range_5p_test, "5'UTR analyses:", inwpCBG, iscoding,
            print "coverage:", cntAnnot / max_cntAnnot,
            print "upstrTSS cnt - ratio: %s - %1.2f" % (this_upstrTSScnt,
                                                        this_upstrTSSratio)
        ########################################################################

        if not iscoding:
            # inwpCBGs most likely not coding alignments -> remove
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # check relative position towards the current firstInwpCBG
        # position is measured in actual nt distance and tcode 'distance':
        # the lowest scoring TCODE window in between these inwpCBGs
        tcodedistdict = inwpCBG.tcode_spacing_between_codingblocks(
            [firstInwpCBG])

        if len(tcodedistdict) >= 3:
            _tmp = [(v, k) for k, v in tcodedistdict.iteritems()]
            _tmp.sort()
            del (tcodedistdict[_tmp[0][1]])
            del (tcodedistdict[_tmp[-1][1]])

        ########################################################################
        if verbose and len(tcodedistdict) >= 1:
            print pos, sum(tcodedistdict.values()) / len(tcodedistdict),
            print TCODE_MAX_NONCODING, firstInwpCBG
        ########################################################################

        # continue when coverage is to high
        if cntAnnot / max_cntAnnot >= 0.40: continue

        if len(tcodedistdict)>0 and\
        sum(tcodedistdict.values())/len(tcodedistdict) <= TCODE_MAX_NONCODING:
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        if this_upstrTSScnt == 0:
            # no upstream TSS sites at all -> remove!
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # do a furter check for unlikely first inwpCBG blocks
        if inwpCBG.get_identityscore() / first_identityscore <=\
        NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\
        inwpCBG.organism_set_size() < first_cnt_is_first and\
        first_upstrTSSratio==0.0:
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # do a furter check for unlikely first inwpCBG blocks
        if inwpCBG.get_identityscore() / first_identityscore <=\
        NONCODINGNONGENE_5p_INWPCBG_MAX_IDENTITYRATIO and\
        inwpCBG.organism_set_size() < first_cnt_is_first and\
        (first_upstrTSSratio!=0.0 and (this_upstrTSSratio / first_upstrTSSratio) < 0.6):
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

        # do a final check for unlikely first inwpCBG blocks
        # all parameters must be (slightly) poorer
        if inwpCBG.get_identityscore() < first_identityscore and\
        inwpCBG.organism_set_size() < first_cnt_is_first and\
        inwpCBG.get_average_upstream_methionine_pssm_score() <\
        firstInwpCBG.get_average_upstream_methionine_pssm_score():
            noncoding_inwpcbg_list.append(inwpCBG)
            continue

    # return the noncoding_inwpcbg_list
    return noncoding_inwpcbg_list
예제 #7
0
def assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,
                                   GTG,
                                   exclude_annotated=True,
                                   verbose=True):
    """ """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    if exclude_annotated:
        # get most likely first & final inwpCBG pointer in inwpcbgs list
        posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)
        range_5p_test = range(0, posFirst)
        range_3p_test = range(posFinal + 1, len(inwpcbgs))
        protected_target_orfid_list = []
        for inwpCBG in inwpcbgs[posFirst:posFinal + 1]:
            if inwpCBG.count_orfs_labeled_as_annotated_exon() > 0:
                protected_target_orfid_list.append(
                    inwpCBG.get_orfs_of_graph(organism=target)[0].id)
    else:
        range_5p_test = []
        range_3p_test = []
        protected_target_orfid_list = []

    ############################################################################
    if verbose and exclude_annotated:
        print "NOT-excluded:", range_5p_test, range_3p_test
    ############################################################################

    # detect UTR or nongene / noncoding inwpCBGS
    for pos in range(0, len(inwpcbgs)):
        if exclude_annotated and pos in range_5p_test:
            pass
        elif exclude_annotated and pos in range_3p_test:
            pass
        elif exclude_annotated and inwpcbgs[
                pos].count_orfs_labeled_as_annotated_exon() == 0:
            # in the middle of the annotated geen structure, but not a single
            # Orf annotated as an exon. Asses for gtg difference too!
            pass
        elif exclude_annotated:
            continue
        else:
            pass

        # get this inwpCBG and
        thisInwpCBG = inwpcbgs[pos]

        # ignore if the target's Orf is belonging to a `protected` Orf
        if protected_target_orfid_list and\
        thisInwpCBG.get_orfs_of_graph(organism=target)[0].id in\
        protected_target_orfid_list:
            continue

        # ignore inwpCBGs which are very likely (poor quality) SignalP alignments
        cntSP = float(thisInwpCBG.count_orfs_with_signalpeptides())
        if cntSP/(thisInwpCBG.count_genomic_informants()+1) > 0.66 and\
        thisInwpCBG.get_signalp_score() > 0.75:
            continue

        # create its GeneTreeGraph
        gtg = pcg2gtg_by_identity(thisInwpCBG, target)

        # step 1. Do the gtg/GTG difference check
        difference = _relative_gtg_difference(gtg, GTG, target)

        if difference < NONGENE_GTG_MAX_DIFFERENCE:
            # step 2. Do the CEXPANDER check
            if thisInwpCBG.node_count() <= 2:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference, NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            elif thisInwpCBG.get_cexpander_uniformly_aligned_count() == 0:
                gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                ################################################################
                if verbose:
                    print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                        difference, NONGENE_GTG_MAX_DIFFERENCE),
                    print thisInwpCBG.get_organism_nodes(target)[0]
                ################################################################
            else:
                # cexpander check is succesfull, GTGdifference claims
                # the aligment is bogus. Do a more elaborate check on
                # some other variables of thisInwpCBG

                # calculate the difference between minsr & maxsr lengths
                node = thisInwpCBG.get_organism_nodes(target)[0]
                minsr = thisInwpCBG.minimal_spanning_range_sizes()[node]
                maxsr = thisInwpCBG.maximal_spanning_range_sizes()[node]
                msr_ratio = float(minsr) / float(maxsr)

                # calculate the ratio between average weights of gtg and GTG
                average_wt_gtg = _pairwise_gtg_average_weight(gtg, target)
                average_wt_GTG = _pairwise_gtg_average_weight(GTG, target)
                gtg_ratio = average_wt_gtg / average_wt_GTG

                if msr_ratio < NONGENE_GTG_MAX_MSR_RATIO and\
                gtg_ratio < NONGENE_GTG_MAX_GTG_RATIO:
                    gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                    ################################################################
                    if verbose:
                        print pos, "thisInwpCBG", "gtg2GTGdiff:: %1.3f < %1.3f" % (
                            difference, NONGENE_GTG_MAX_DIFFERENCE),
                        print thisInwpCBG.get_organism_nodes(target)[0]
                    ################################################################
                else:
                    pass
        else:
            pass

    # return the gtgdiscrepancy_inwpcbg_list
    return gtgdiscrepancy_inwpcbg_list
예제 #8
0
def assign_internal_nongene_alignments(inwpcbgs,
                                       GTG,
                                       exclude_annotated=False,
                                       verbose=True):
    """
    Return the inwpCBGs between the first and final inwpCBG that look like
    non-gene alignments.

    An inwpCBG is reported when all of the following hold:
      - it shares no nodes with its direct predecessor or successor in
        ``inwpcbgs``;
      - its bitscore per aligned aa is below 0.55 times the best value
        observed in the first..final range;
      - its number of orfs labeled as annotated exon is at most 0.50 times
        the best count observed in that range;
      - its minimal spanning range on the target organism lies completely
        inside the maximal spanning range of the previous or next inwpCBG.

    @type  inwpcbgs: list
    @param inwpcbgs: ordered list of inwpCBG objects

    @param GTG: not used by the current implementation

    @type  exclude_annotated: Boolean
    @param exclude_annotated: not used by the current implementation

    @type  verbose: Boolean
    @param verbose: print diagnostics to STDOUT when True

    @rtype:  list
    @return: the inwpCBG objects (taken from ``inwpcbgs``) that were flagged;
             an empty list when ``inwpcbgs`` is empty or the first..final
             range is empty

    NOTE(review): a ZeroDivisionError is raised when an inwpCBG has a summed
    unextended pacbp length of zero, or when the best bits-per-aa / best
    annotated-exon count in the range is zero and the ratio gets evaluated.
    This is unchanged from the original behaviour -- confirm the desired
    fallback before guarding it.

    TODO TODO: this function must be moved to another location.
    TODO TODO: better place in inwpCBGs/blocks filtering
    """
    # return empty list when no inwpcbgs applied
    if not inwpcbgs: return []

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # get most likely first & final inwpCBG pointer in inwpcbgs list
    posFirst, posFinal = get_first_and_final_inwpcbg_pos(inwpcbgs)

    # positions that are evaluated; nothing to do for an empty range
    positions = range(posFirst, posFinal + 1)
    if not positions: return []

    # get info on the *best* covered inwpCBG in the evaluated range
    best_annot_cnt = max([
        inwpcbgs[pos].count_orfs_labeled_as_annotated_exon()
        for pos in positions
    ])
    best_bits_per_aa = max([
        _inwpcbg_bits_per_aa(inwpcbgs[pos]) for pos in positions
    ])

    # return list with inwpcbgs
    gtgdiscrepancy_inwpcbg_list = []

    for pos in positions:
        # get this inwpCBG and its direct neighbours (None at list borders)
        thisInwpCBG = inwpcbgs[pos]
        if pos > 0: prevInwpCBG = inwpcbgs[pos - 1]
        else: prevInwpCBG = None
        if pos < len(inwpcbgs) - 1: nextInwpCBG = inwpcbgs[pos + 1]
        else: nextInwpCBG = None

        # inwpCBGs that share node(s) with a neighbour are never reported
        if prevInwpCBG and Set(prevInwpCBG.get_nodes()).intersection(
                thisInwpCBG.get_nodes()):
            continue
        if nextInwpCBG and Set(nextInwpCBG.get_nodes()).intersection(
                thisInwpCBG.get_nodes()):
            continue

        # only poorly scoring inwpCBGs are candidates
        bits_per_aa = _inwpcbg_bits_per_aa(thisInwpCBG)
        if bits_per_aa / best_bits_per_aa < 0.55:
            # ... that are poorly supported by annotated exons as well
            annot_cnt = thisInwpCBG.count_orfs_labeled_as_annotated_exon()
            if float(annot_cnt) / float(best_annot_cnt) <= 0.50:
                minsr = thisInwpCBG.minimal_spanning_range(organism=target)
                ################################################################
                if verbose:
                    # single-argument print: same output as the Python 2
                    # print statement, and parseable by Python 3 too
                    print(" __XX__ %s %s %s %s %s %s" % (
                        pos, thisInwpCBG, thisInwpCBG.get_nt_identity(),
                        thisInwpCBG.get_bitscore(), bits_per_aa, annot_cnt))
                ################################################################

                # report when the minimal spanning range is fully contained
                # in the maximal spanning range of the previous inwpCBG or,
                # failing that, of the next inwpCBG
                for label, neighbour in (("PREV::", prevInwpCBG),
                                         ("NEXT::", nextInwpCBG)):
                    if not neighbour: continue
                    overlap = len(
                        neighbour.maximal_spanning_range(
                            organism=target).intersection(minsr))
                    if overlap == len(minsr):
                        ########################################################
                        if verbose:
                            print("%s %s %s" % (label, overlap, len(minsr)))
                        ########################################################
                        gtgdiscrepancy_inwpcbg_list.append(thisInwpCBG)
                        # report each inwpCBG at most once
                        break

    # return list with conflicts
    return gtgdiscrepancy_inwpcbg_list


def _inwpcbg_bits_per_aa(inwpcbg):
    """
    Return the bitscore of ``inwpcbg`` divided by the summed unextended
    length of all its pacbps, as a float.
    """
    tot_length = sum(
        [pf.get_unextended_length() for pf in inwpcbg.pacbps.values()])
    return float(inwpcbg.get_bitscore()) / float(tot_length)