def detect_and_remove_single_nonfinal_inwpcbg(inwpcbgs,PCG,GENE_IDENTIFIER_SET,
    verbose=False):
    """
    Allow deletion of a very shitty, single inwpCBG from the end of the list
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    lastInwpCBG = inwpcbgs[-1]
    prevInwpCBG = inwpcbgs[-2]

    lastNodeList = [ lastInwpCBG.get_organism_nodes(org)[0] for org in\
                lastInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    prevNodeList = [ prevInwpCBG.get_organism_nodes(org)[0] for org in\
                prevInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(lastNodeList).intersection(prevNodeList): return False

    ntdistdict    = prevInwpCBG.nt_spacing_between_codingblocks([lastInwpCBG])
    tcodedistdict = prevInwpCBG.tcode_spacing_between_codingblocks([lastInwpCBG])

    check1 = prevInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             lastInwpCBG.count_orfs_labeled_as_annotated_exon() 
    check2 = prevInwpCBG.get_bitscore() > lastInwpCBG.get_bitscore() 
    check3 = len(prevNodeList) > len(lastNodeList)
    check4 = float(lastInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = prevInwpCBG.get_projected_tailing_stop_aa_difference() <\
             lastInwpCBG.get_projected_tailing_stop_aa_difference()
    check8 = prevInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()<\
             lastInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()

    checklist = [check1,check2,check3,check4,check5,check6,check7,check8]

    ############################################################################
    if verbose: print "NonFinal inwpCBG check:", checklist
    ############################################################################
   
    if checklist.count(False) == 0:
        nonfinalPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
        # place all PacbPORFs in the nonfinalPCG
        for (pacbpkey,nodeQ,nodeS), pacbporf in lastInwpCBG.pacbps.iteritems():
            # add to noncodingnongenePCG
            nonfinalPCG.add_node(nodeQ)
            nonfinalPCG.add_node(nodeS)
            nonfinalPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            nonfinalPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
            # remove from main PCG
            _delete_pacbp(PCG,(pacbpkey,nodeQ,nodeS))
        # return nonfinalPCG
        return nonfinalPCG
    else:
        return False
def detect_and_remove_gtgdiscrepancy(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """ """

    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total numers of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    gtg_size = min([(len(GENE_IDENTIFIER_SET)-1)/2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count,gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG,target,identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG,target,identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of nodes
    bitscore_ordered_nodes = []
    for (tNode,iNode),wt in btGTG.weights.iteritems():
        if tNode==target: bitscore_ordered_nodes.append( ( wt, iNode ) )
    bitscore_ordered_nodes.sort() 
    #if verbose: print "btGTG::", bitscore_ordered_nodes

    while ntGTG.node_count() > gtg_size:
        # next line causes errors because OrganismGraph != OrganismStarGraph
        # this causes the target node in rare cases to be assigned as the weakest node
        # informant = btGTG.weakest_connected_node()
        (wt,informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose: print "btGGT.weakest_connected_node() ==", informant, btGTG.get_ordered_nodes()

    ############################################################################
    if verbose:
        print "ntGTG:", ntGTG.get_ordered_nodes(), 
        for node in ntGTG.get_ordered_nodes():
            if node == target: continue
            print "%1.2f" % ntGTG.weights[(target,node)],
        print ""
    ############################################################################

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(inwpcbgs,ntGTG)


    # detect inwpCBGs with strong discrepancy to this GTG
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,ntGTG)

    # merge both lists
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(gtgdiscrepancy_internal_inwpcbg_list)
        else:
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in [ str(gtgdiscrCBG) for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list ]:
                    gtgdiscrepancy_inwpcbg_list.append( inwpcbg )

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    correct_inwpcbg_list = []
    check_str_list = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        check_str_list.append( str(discrinwpCBG) )
    for inwpcbg in inwpcbgs:
        if str(inwpcbg) not in check_str_list:
            correct_inwpcbg_list.append( inwpcbg )

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY
    gtgdiscrepancy_pacbpkeys = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            # check if this pacbpkey is occuring in a non-removed inwpCBG
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps.keys():
                    is_occurring_in_correct_inwpcbg = True
                    break
            # if is_occurring_in_correct_inwpcbg, continue and do not delete
            if is_occurring_in_correct_inwpcbg:
                continue
            # store to gtgdiscrepancy_pacbpkeys when not stored already
            if pacbpkey not in gtgdiscrepancy_pacbpkeys:
                gtgdiscrepancy_pacbpkeys.append(pacbpkey)


    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        if key not in PCG.pacbps.keys():
            # !?!? TODO why not present in the PCG !?!?!
            # anyway, continue here to avoid KeyError
            # This PacbPORF was to be deleted rigth here,
            # so it is not an extreme disaster. But... scary ;-)
            continue
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # remove from main PCG
        _delete_pacbp(PCG,key)


    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG
def detect_and_remove_utrornonegene_inwpcbgs(inwpcbgs,PCG,verbose=True):
    """
    Move PacbPORFs of 5' / 3' noncoding or nongene inwpCBGs out of the PCG.

    All inwpCBGs are first scanned for PSSM TSS sites. The inwpCBGs assigned
    by assign_utrornongene5p_inwpcbgs() and assign_utrornongene3p_inwpcbgs()
    are collected, and every PacbPORF key that occurs in none of the
    remaining inwpCBGs is moved from the PCG into a new PacbpCollectionGraph.

    Arguments:
        inwpcbgs   list of inwpCBGs
        PCG        main PCG from which PacbPORFs are deleted
        verbose    not used in this function

    Returns the new PacbpCollectionGraph, or False when the input is empty
    or no inwpCBG got assigned.
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # make sure all Orfs have predicted TSS sites
    for inwpCBG in inwpcbgs:
        inwpCBG.scan_orfs_for_pssm_tss(min_pssm_score=TSS_MIN_PSSM_SCORE)

    # detect inwpCBGs which are most likely 5' and 3' non coding or non gene;
    # merge into a fresh list so the helper's return value is not modified
    ncng_list = list(assign_utrornongene5p_inwpcbgs(inwpcbgs))
    ncng_list.extend(assign_utrornongene3p_inwpcbgs(inwpcbgs))

    # return False if no inwpcbgs are assigned
    if not ncng_list: return False

    # get list of inwpCBGs that are NON ncng; the str() representation
    # is used as the identity of an inwpCBG
    check_str_list = [ str(ncnginwpCBG) for ncnginwpCBG in ncng_list ]
    correct_inwpcbg_list = [ inwpcbg for inwpcbg in inwpcbgs
                             if str(inwpcbg) not in check_str_list ]

    # get all pacbp keys belonging to noncoding / nongene inwpcbgs ONLY
    ncng_pacbpkeys = []
    for ncnginwpCBG in ncng_list:
        for pacbpkey in ncnginwpCBG.pacbps.keys():
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps:
                    # occurring in a non-removed inwpCBG -> do not delete
                    break
            else:
                # store to ncng_pacbpkeys when not stored already
                if pacbpkey not in ncng_pacbpkeys:
                    ncng_pacbpkeys.append(pacbpkey)

    # place all ncng_pacbpkeys and PacbPORFs in the noncodingnongenePCG
    # and, at the same time, remove from the main PCG
    noncodingnongenePCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in ncng_pacbpkeys:
        if key not in PCG.pacbps:
            # Same precaution as in detect_and_remove_gtgdiscrepancy():
            # a key that is not (or no longer) present in the PCG was to
            # be deleted anyway, so skip it instead of raising a KeyError.
            continue
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to noncodingnongenePCG
        noncodingnongenePCG.add_node(nodeQ)
        noncodingnongenePCG.add_node(nodeS)
        noncodingnongenePCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        noncodingnongenePCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    # return noncodingnongenePCG
    return noncodingnongenePCG
# Example #4
def detect_and_remove_synteny(inwpcbgs,
                              PCG,
                              GENE_IDENTIFIER_SET,
                              verbose=True):
    """ """
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20

    observed_organism_subcombis = []
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)

    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2: continue
        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge(orgQ, orgS, wt=pacbporf.bitscore)

            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ, orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                    gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        gtg.set_edge_weight(orgS, org, wt=pacbporf.bitscore)
                    else:
                        gtg.add_edge(orgS, org, wt=pacbporf.bitscore)

        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
        min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node) for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)

        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis: continue

        # store to already tested organism subcombinations
        observed_organism_subcombis.append(gtg.get_ordered_nodes())

        # create a subPCG of these organisms
        subPCG = PacbpCollectionGraph(crossdata={},
                                      blastmatrix=PCG._blastmatrix)
        for (pacbpkey, nodeQ, nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ, orfQid), (orgS, orfSid) = nodeQ, nodeS
            if orgQ not in gtg.get_nodes(): continue
            if orgS not in gtg.get_nodes(): continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf

        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################

        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)

        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes(
                )
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################

    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if not node_set.difference(inwpCBG.node_set()):
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},
                                       blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return syntenicPCG
    return syntenicPCG
def detect_and_remove_single_nonfirst_inwpcbg(inwpcbgs,PCG,GENE_IDENTIFIER_SET,
    verbose=False):
    """
    Allow deletion of a very shitty, single inwpCBG from the start of the list
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    firstInwpCBG = inwpcbgs[0]
    nextInwpCBG = inwpcbgs[1]

    firstNodeList = [ firstInwpCBG.get_organism_nodes(org)[0] for org in\
                firstInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    nextNodeList = [ nextInwpCBG.get_organism_nodes(org)[0] for org in\
                nextInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(firstNodeList).intersection(nextNodeList): return False

    ntdistdict    = firstInwpCBG.nt_spacing_between_codingblocks([nextInwpCBG])
    tcodedistdict = firstInwpCBG.tcode_spacing_between_codingblocks([nextInwpCBG])

    # make a long list of checks which should be True in case
    # firstInwpCBG is *NOT* the first exon of this gene structure
    check1 = nextInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             firstInwpCBG.count_orfs_labeled_as_annotated_exon() 
    check2 = nextInwpCBG.get_bitscore() > firstInwpCBG.get_bitscore() 
    check3 = len(nextNodeList) > len(firstNodeList)
    check4 = float(firstInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = nextInwpCBG.count_orfs_labeled_as_first_exon() >=\
             firstInwpCBG.count_orfs_labeled_as_first_exon()
    check8 = firstInwpCBG.count_orfs_labeled_as_annotated_exon() == 0 
    check9 = nextInwpCBG.get_average_upstream_methionine_pssm_score() >\
             firstInwpCBG.get_average_upstream_methionine_pssm_score() 

    checklist = [check1,check2,check3,check4,check5,check6,check7,check8,check9]

    ############################################################################
    if verbose or True: print "NonFirst inwpCBG check:", checklist
    ############################################################################
   
    if checklist.count(False) <= 1:
        nonfirstPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
        # place all PacbPORFs in the nonfirstPCG
        for (pacbpkey,nodeQ,nodeS), pacbporf in firstInwpCBG.pacbps.iteritems():
            # add to noncodingnongenePCG
            nonfirstPCG.add_node(nodeQ)
            nonfirstPCG.add_node(nodeS)
            nonfirstPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            nonfirstPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
            # remove from main PCG
            _delete_pacbp(PCG,(pacbpkey,nodeQ,nodeS))
        # return nonfirstPCG
        return nonfirstPCG
    else:
        return False
def detect_and_remove_single_nonfinal_inwpcbg(inwpcbgs,
                                              PCG,
                                              GENE_IDENTIFIER_SET,
                                              verbose=False):
    """
    Allow deletion of a very shitty, single inwpCBG from the end of the list
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    lastInwpCBG = inwpcbgs[-1]
    prevInwpCBG = inwpcbgs[-2]

    lastNodeList = [ lastInwpCBG.get_organism_nodes(org)[0] for org in\
                lastInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    prevNodeList = [ prevInwpCBG.get_organism_nodes(org)[0] for org in\
                prevInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(lastNodeList).intersection(prevNodeList): return False

    ntdistdict = prevInwpCBG.nt_spacing_between_codingblocks([lastInwpCBG])
    tcodedistdict = prevInwpCBG.tcode_spacing_between_codingblocks(
        [lastInwpCBG])

    check1 = prevInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             lastInwpCBG.count_orfs_labeled_as_annotated_exon()
    check2 = prevInwpCBG.get_bitscore() > lastInwpCBG.get_bitscore()
    check3 = len(prevNodeList) > len(lastNodeList)
    check4 = float(lastInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = prevInwpCBG.get_projected_tailing_stop_aa_difference() <\
             lastInwpCBG.get_projected_tailing_stop_aa_difference()
    check8 = prevInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()<\
             lastInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()

    checklist = [
        check1, check2, check3, check4, check5, check6, check7, check8
    ]

    ############################################################################
    if verbose: print "NonFinal inwpCBG check:", checklist
    ############################################################################

    if checklist.count(False) == 0:
        nonfinalPCG = PacbpCollectionGraph(crossdata={},
                                           blastmatrix=PCG._blastmatrix)
        # place all PacbPORFs in the nonfinalPCG
        for (pacbpkey, nodeQ,
             nodeS), pacbporf in lastInwpCBG.pacbps.iteritems():
            # add to noncodingnongenePCG
            nonfinalPCG.add_node(nodeQ)
            nonfinalPCG.add_node(nodeS)
            nonfinalPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            nonfinalPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
            # remove from main PCG
            _delete_pacbp(PCG, (pacbpkey, nodeQ, nodeS))
        # return nonfinalPCG
        return nonfinalPCG
    else:
        return False
def detect_and_remove_utrornonegene_inwpcbgs(inwpcbgs, PCG, verbose=True):
    """
    Move PacbPORFs of 5' / 3' noncoding or nongene inwpCBGs out of the PCG.

    All inwpCBGs are first scanned for PSSM TSS sites. The inwpCBGs assigned
    by assign_utrornongene5p_inwpcbgs() and assign_utrornongene3p_inwpcbgs()
    are collected, and every PacbPORF key that occurs in none of the
    remaining inwpCBGs is moved from the PCG into a new PacbpCollectionGraph.

    Arguments:
        inwpcbgs   list of inwpCBGs
        PCG        main PCG from which PacbPORFs are deleted
        verbose    not used in this function

    Returns the new PacbpCollectionGraph, or False when the input is empty
    or no inwpCBG got assigned.
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # make sure all Orfs have predicted TSS sites
    for inwpCBG in inwpcbgs:
        inwpCBG.scan_orfs_for_pssm_tss(min_pssm_score=TSS_MIN_PSSM_SCORE)

    # detect inwpCBGs which are most likely 5' and 3' non coding or non gene;
    # merge into a fresh list so the helper's return value is not modified
    ncng_list = list(assign_utrornongene5p_inwpcbgs(inwpcbgs))
    ncng_list.extend(assign_utrornongene3p_inwpcbgs(inwpcbgs))

    # return False if no inwpcbgs are assigned
    if not ncng_list: return False

    # get list of inwpCBGs that are NON ncng; the str() representation
    # is used as the identity of an inwpCBG
    check_str_list = [str(ncnginwpCBG) for ncnginwpCBG in ncng_list]
    correct_inwpcbg_list = [
        inwpcbg for inwpcbg in inwpcbgs
        if str(inwpcbg) not in check_str_list
    ]

    # get all pacbp keys belonging to noncoding / nongene inwpcbgs ONLY
    ncng_pacbpkeys = []
    for ncnginwpCBG in ncng_list:
        for pacbpkey in ncnginwpCBG.pacbps.keys():
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps:
                    # occurring in a non-removed inwpCBG -> do not delete
                    break
            else:
                # store to ncng_pacbpkeys when not stored already
                if pacbpkey not in ncng_pacbpkeys:
                    ncng_pacbpkeys.append(pacbpkey)

    # place all ncng_pacbpkeys and PacbPORFs in the noncodingnongenePCG
    # and, at the same time, remove from the main PCG
    noncodingnongenePCG = PacbpCollectionGraph(crossdata={},
                                               blastmatrix=PCG._blastmatrix)
    for key in ncng_pacbpkeys:
        if key not in PCG.pacbps:
            # Same precaution as in detect_and_remove_gtgdiscrepancy():
            # a key that is not (or no longer) present in the PCG was to
            # be deleted anyway, so skip it instead of raising a KeyError.
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to noncodingnongenePCG
        noncodingnongenePCG.add_node(nodeQ)
        noncodingnongenePCG.add_node(nodeS)
        noncodingnongenePCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        noncodingnongenePCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return noncodingnongenePCG
    return noncodingnongenePCG
def detect_and_remove_gtgdiscrepancy(inwpcbgs,
                                     PCG,
                                     GENE_IDENTIFIER_SET,
                                     verbose=True):
    """ """

    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total numers of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    gtg_size = min([(len(GENE_IDENTIFIER_SET) - 1) / 2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count, gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG,
                                target,
                                identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG,
                                target,
                                identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of nodes
    bitscore_ordered_nodes = []
    for (tNode, iNode), wt in btGTG.weights.iteritems():
        if tNode == target: bitscore_ordered_nodes.append((wt, iNode))
    bitscore_ordered_nodes.sort()
    #if verbose: print "btGTG::", bitscore_ordered_nodes

    while ntGTG.node_count() > gtg_size:
        # next line causes errors because OrganismGraph != OrganismStarGraph
        # this causes the target node in rare cases to be assigned as the weakest node
        # informant = btGTG.weakest_connected_node()
        (wt, informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose:
            print "btGGT.weakest_connected_node() ==", informant, btGTG.get_ordered_nodes(
            )

    ############################################################################
    if verbose:
        print "ntGTG:", ntGTG.get_ordered_nodes(),
        for node in ntGTG.get_ordered_nodes():
            if node == target: continue
            print "%1.2f" % ntGTG.weights[(target, node)],
        print ""
    ############################################################################

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(
        inwpcbgs, ntGTG)

    # detect inwpCBGs with strong discrepancy to this GTG
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(
        inwpcbgs, ntGTG)

    # merge both lists
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(
                gtgdiscrepancy_internal_inwpcbg_list)
        else:
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in [
                        str(gtgdiscrCBG)
                        for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list
                ]:
                    gtgdiscrepancy_inwpcbg_list.append(inwpcbg)

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    correct_inwpcbg_list = []
    check_str_list = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        check_str_list.append(str(discrinwpCBG))
    for inwpcbg in inwpcbgs:
        if str(inwpcbg) not in check_str_list:
            correct_inwpcbg_list.append(inwpcbg)

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY
    gtgdiscrepancy_pacbpkeys = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            # check if this pacbpkey is occuring in a non-removed inwpCBG
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps.keys():
                    is_occurring_in_correct_inwpcbg = True
                    break
            # if is_occurring_in_correct_inwpcbg, continue and do not delete
            if is_occurring_in_correct_inwpcbg:
                continue
            # store to gtgdiscrepancy_pacbpkeys when not stored already
            if pacbpkey not in gtgdiscrepancy_pacbpkeys:
                gtgdiscrepancy_pacbpkeys.append(pacbpkey)

    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},
                                             blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        if key not in PCG.pacbps.keys():
            # !?!? TODO why not present in the PCG !?!?!
            # anyway, continue here to avoid KeyError
            # This PacbPORF was to be deleted rigth here,
            # so it is not an extreme disaster. But... scary ;-)
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf

        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG
def detect_and_remove_single_nonfirst_inwpcbg(inwpcbgs,
                                              PCG,
                                              GENE_IDENTIFIER_SET,
                                              verbose=False):
    """
    Allow deletion of a very poor, single inwpCBG from the start of the list

    The first inwpCBG is compared with the second one by means of nine
    boolean checks. When at most one of these checks is False, every PacbPORF
    of the first inwpCBG is stored in a new PacbpCollectionGraph and deleted
    from the main PCG.

    @type  inwpcbgs: list
    @param inwpcbgs: inwpCBG objects; only elements 0 and 1 are inspected

    @type  PCG: graph object exposing `_blastmatrix`
    @param PCG: main graph; modified IN PLACE (via _delete_pacbp) on removal

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: organism identifiers taken into account;
                                must not be empty once 2 or more inwpCBGs
                                are given (check4 divides by its size)

    @type  verbose: Boolean
    @param verbose: print the outcome of the checks to STDOUT when True

    @rtype:  PacbpCollectionGraph or False
    @return: graph holding the removed PacbPORFs, False when nothing is removed
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    firstInwpCBG = inwpcbgs[0]
    nextInwpCBG = inwpcbgs[1]

    # first node of each organism that is present in GENE_IDENTIFIER_SET
    firstNodeList = [ firstInwpCBG.get_organism_nodes(org)[0] for org in\
                firstInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    nextNodeList = [ nextInwpCBG.get_organism_nodes(org)[0] for org in\
                nextInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(firstNodeList).intersection(nextNodeList): return False

    ntdistdict = firstInwpCBG.nt_spacing_between_codingblocks([nextInwpCBG])
    tcodedistdict = firstInwpCBG.tcode_spacing_between_codingblocks(
        [nextInwpCBG])

    # make a long list of checks which should be True in case
    # firstInwpCBG is *NOT* the first exon of this gene structure
    check1 = nextInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             firstInwpCBG.count_orfs_labeled_as_annotated_exon()
    check2 = nextInwpCBG.get_bitscore() > firstInwpCBG.get_bitscore()
    check3 = len(nextNodeList) > len(firstNodeList)
    check4 = float(firstInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    # check5: average nt spacing between both inwpCBGs; False without data
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    # check6: average tcode score between both inwpCBGs; False without data
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = nextInwpCBG.count_orfs_labeled_as_first_exon() >=\
             firstInwpCBG.count_orfs_labeled_as_first_exon()
    check8 = firstInwpCBG.count_orfs_labeled_as_annotated_exon() == 0
    check9 = nextInwpCBG.get_average_upstream_methionine_pssm_score() >\
             firstInwpCBG.get_average_upstream_methionine_pssm_score()

    checklist = [
        check1, check2, check3, check4, check5, check6, check7, check8, check9
    ]

    ############################################################################
    # Only report when asked for; the single-argument, parenthesised form
    # prints the same text under Python 2 and also parses under Python 3.
    if verbose:
        print("NonFirst inwpCBG check: %s" % (checklist,))
    ############################################################################

    # tolerate at most a single failed check; otherwise leave the PCG untouched
    if checklist.count(False) > 1:
        return False

    nonfirstPCG = PacbpCollectionGraph(crossdata={},
                                       blastmatrix=PCG._blastmatrix)
    # place all PacbPORFs in the nonfirstPCG
    for (pacbpkey, nodeQ,
         nodeS), pacbporf in firstInwpCBG.pacbps.iteritems():
        # add to nonfirstPCG
        nonfirstPCG.add_node(nodeQ)
        nonfirstPCG.add_node(nodeS)
        nonfirstPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        nonfirstPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, (pacbpkey, nodeQ, nodeS))
    # return nonfirstPCG
    return nonfirstPCG
# Example #10
# 0
def detect_and_remove_synteny(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """
    Detect syntenic inwpCBGs and move their PacbPORFs out of the main PCG

    Syntenic inwpCBGs are assigned (assign_syntenic_inwpcbgs) on the complete
    list of inwpCBGs and, additionally, on inwpCBGs that are rebuilt from
    subsets of organisms. Such an organism subset is obtained per inwpCBG
    by building a GeneTreeGraph of its organisms and removing the weakest
    connected nodes from it. All PacbPORFs of inwpCBGs that contain all nodes
    of a syntenic (sub)inwpCBG are stored in a new PacbpCollectionGraph and
    deleted from the main PCG.

    @type  inwpcbgs: list
    @param inwpcbgs: inwpCBG objects

    @type  PCG: graph object exposing `pacbps` and `_blastmatrix`
    @param PCG: main graph; modified IN PLACE (via _delete_pacbp)

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: organism identifiers taken into account

    @type  verbose: Boolean
    @param verbose: print detected syntenic subinwpCBGs to STDOUT when True

    @rtype:  PacbpCollectionGraph or False
    @return: graph holding the removed PacbPORFs, False when no synteny found
    """
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20

    observed_organism_subcombis = []
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_subinwpcbgs.extend( assign_syntenic_inwpcbgs(inwpcbgs) )

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2: continue
        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge( orgQ, orgS, wt = pacbporf.bitscore )

            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ,orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                    gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        gtg.set_edge_weight(orgS,org,wt = pacbporf.bitscore)
                    else:
                        gtg.add_edge( orgS, org, wt = pacbporf.bitscore )

        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg;
        # stop when no nodes are left or all nodes reach the minimal ratio
        while gtg.get_nodes():
            lowest_ratio = min([
                gtg.get_node_weighted_connectivity_observed_vs_expected(node)
                for node in gtg.get_nodes() ])
            if not MIN_OBSERVED_VS_EXPECTED_RATIO > lowest_ratio: break
            gtg.del_node( gtg.weakest_connected_node() )

        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis: continue

        # store to already tested organism subcombinations
        observed_organism_subcombis.append( gtg.get_ordered_nodes() )

        # create a subPCG of these organisms; the gtg is not changed anymore
        # from here, so its nodes are obtained only once
        gtg_organisms = gtg.get_nodes()
        subPCG = PacbpCollectionGraph(crossdata={},
                    blastmatrix=PCG._blastmatrix)
        for (pacbpkey,nodeQ,nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
            if orgQ not in gtg_organisms: continue
            if orgS not in gtg_organisms: continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)

        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            # single-argument, parenthesised prints: same text under Python 2
            if verbose:
                print("SYNTENIC!! %s %s" % (
                        syntinwpcbg, syntinwpcbg.get_ordered_nodes() ))
                for subCBG in subinwpcbgs:
                    print("syntenic in: %s %s" % (
                            subCBG, subCBG.get_ordered_nodes() ))
            ####################################################################

    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs: gather the pacbpkeys
    # of each inwpCBG that contains ALL nodes of a syntenic (sub)inwpCBG
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if node_set.difference(inwpCBG.node_set()): continue
            for pacbpkey in inwpCBG.pacbps.keys():
                if pacbpkey not in syntenic_pacbpkeys:
                    syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        if key not in PCG.pacbps:
            # This PacbPORF is no longer present in the main PCG. It was to be
            # deleted right here anyway, so skip it to avoid a KeyError that
            # would leave the PCG half cleaned up.
            continue
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    # return syntenicPCG
    return syntenicPCG