예제 #1
0
파일: conversion.py 프로젝트: IanReid/ABFGP
def pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,gtg=None,prev=None,next=None,
    max_cbg_gtg_topo_dif=None,
    max_cbg_gtg_abs_dif=None,
    min_cbg_gtg_id_ratio=None):
    """
    Convert a PacbpCollection into a list of accepted CodingBlockGraphs.

    Fully connected subgraphs are taken from the collection, completed with
    pacbps harvested from a deepcopy of the collection, ordered, and - only
    when a `gtg` is given - filtered on how well their genetree matches it.

    @param pacbpCollection: collection to take the subgraphs from

    @param gtg: GeneTreeGraph to compare each CBG's genetree with; when
                falsy, none of the three thresholds below is applied

    @param prev: passed on to ordering.reorder_cbgs_on_node_occurrence

    @param next: passed on to ordering.reorder_cbgs_on_node_occurrence

    @param max_cbg_gtg_topo_dif: when truthy, reject a CBG whose
                gtg.graphalignmentdifference() exceeds this value

    @param max_cbg_gtg_abs_dif: when truthy, reject a CBG whose
                gtg.absolutegraphalignmentdifference() exceeds this value

    @param min_cbg_gtg_id_ratio: when truthy, reject a CBG whose
                genetree identity / gtg identity is below this value

    @rtype:  list
    @return: accepted CBGs, in the order produced by the `ordering` helpers
    """
    # a complete CBG has one node per organism; the subgraph search is asked
    # for (node count - 1) edges and tolerates no missing edges
    required_node_count = pacbpCollection.organism_set_size()
    required_edge_count = required_node_count - 1

    # copy the collection *before* the subgraph search; the pacbps are
    # harvested from this copy later on
    untouched_collection = deepcopy(pacbpCollection)
    candidates = pacbpCollection.find_fully_connected_subgraphs(
            edges=required_edge_count, max_missing_edges=0)

    def _resembles_gtg(cbg):
        """ True when `cbg` passes all requested comparisons with `gtg` """
        if not gtg:
            return True
        if max_cbg_gtg_topo_dif:
            if gtg.graphalignmentdifference(cbg.genetree()) > max_cbg_gtg_topo_dif:
                return False
        if max_cbg_gtg_abs_dif:
            if gtg.absolutegraphalignmentdifference(cbg.genetree()) > max_cbg_gtg_abs_dif:
                return False
        if min_cbg_gtg_id_ratio:
            if cbg.genetree().identity() / gtg.identity() < min_cbg_gtg_id_ratio:
                return False
        return True

    # complete the candidate subgraphs with pacbps and update edge weights
    completed = []
    for cbg in candidates:
        # only complete, saturated CBGs; no incomplete ones or collections
        if not (cbg.node_count() == required_node_count and
                cbg.__class__.__name__ != 'PacbpCollectionGraph' and
                not cbg.connectivitysaturation() < 1.0):
            continue
        cbg.harvest_pacbps_from_pacbpcollection(untouched_collection)
        if cbg.has_overall_minimal_spanning_range() and cbg.has_all_pacbps():
            cbg.update_edge_weights_by_minimal_spanning_range()
            completed.append(cbg)

    # order by total weight first, then re-order on node occurrence
    # (a CBG that incorporates a neighboring node is more likely)
    completed = ordering.reorder_cbgs_on_node_occurrence(
            ordering.order_graphlist_by_total_weight(completed),
            prev=prev, next=next)

    # whatever survives the gtg comparison is accepted
    return [ cbg for cbg in completed if _resembles_gtg(cbg) ]
예제 #2
0
def findmostlikelyCBG2GSGinsert(partialGSG,cbglist,
    order_cbgs_by_total_weight=True,
    reorder_cbgs_on_node_occurrence=True,
    optimizetinyexoninterface=True,
    verbose=False):
    """
    Try to insert the most likely CBG(s) of `cbglist` into `partialGSG`.

    @param partialGSG: genestructure to insert into; it is modified in place
                and returned

    @param cbglist: candidate CBGs; when empty/None nothing is done

    @type  order_cbgs_by_total_weight: Boolean
    @param order_cbgs_by_total_weight: order `cbglist` with
                ordering.order_cbgraphlist() first

    @type  reorder_cbgs_on_node_occurrence: Boolean
    @param reorder_cbgs_on_node_occurrence: re-order `cbglist` with
                ordering.reorder_cbgs_on_node_occurrence(), using the outer
                CBG(s) of `partialGSG` as prev/next

    @param optimizetinyexoninterface: passed on to _place_cbg_in_partialgsg

    @type  verbose: Boolean
    @param verbose: print the candidate CBGs and pass the flag on

    @return: `partialGSG` (the same object, possibly enlarged)
    """
    # if no CBG in list -> done!
    if not cbglist: return partialGSG

    # make interfaces in the partialGSG and remember its current size
    partialGSG.create_cbginterfaces()
    gsgsize = len(partialGSG)

    if order_cbgs_by_total_weight:
        cbglist = ordering.order_cbgraphlist(cbglist)

    if reorder_cbgs_on_node_occurrence:
        # take the outer CBGs of the partialGSG as prev and next
        if len(partialGSG) >= 2:
            prev_cbg = partialGSG.codingblockgraphs[0]
            next_cbg = partialGSG.codingblockgraphs[-1]
        elif len(partialGSG) == 1:
            prev_cbg, next_cbg = None, partialGSG.codingblockgraphs[0]
        else:
            prev_cbg, next_cbg = None, None
        cbglist = ordering.reorder_cbgs_on_node_occurrence(
                cbglist, prev=prev_cbg, next=next_cbg)

    ####################################################################
    if verbose and partialGSG and cbglist:
        # single-argument parenthesised print: same output on Python 2
        print("elegiable CBGs for addition in partialGSG")
        for cbg in cbglist: print(cbg)
    ####################################################################

    # Try several insertions, starting with the most stringent one:
    # (1) only the top listed CBG, first with and then without the
    #     conditional addition (CBG2GTG topology) test
    # (2) all CBGs, with the conditional addition test
    # (3) all CBGs, without the conditional addition test
    # Every attempt is always made; the individual outcomes are not needed
    # because success is measured by the size of partialGSG afterwards.
    attempts = (
        ( True,  False ),
        ( True,  True  ),
        ( False, False ),
        ( False, True  ),
    )
    for top_only, omit_conditional in attempts:
        # the candidate list is built just before each call, as before
        candidates = [ cbglist[0] ] if top_only else cbglist
        _place_cbg_in_partialgsg(candidates,partialGSG,
                omit_conditional_addition=omit_conditional,
                optimizetinyexoninterface=optimizetinyexoninterface,
                verbose=verbose )

    # when partialGSG is not enlarged, its cbgIFs are likely
    # distorted in the process of trying novel CBG inserts.
    # Recreate the cbgIFs here
    if len(partialGSG) == gsgsize:
        if gsgsize >= 2:
            partialGSG.clear_central_cbginterfaces()
            partialGSG.create_cbginterfaces()
            if gsgsize == 2:
                # most obvious case of 2 CBGs in the GSG
                first  = partialGSG.codingblockgraphs[0]
                second = partialGSG.codingblockgraphs[1]
                first.IS_3P_SPLITTED  = False
                second.IS_5P_SPLITTED = False
                if not first.IS_5P_SPLITTED:
                    first.IS_SPLITTED = False
                if not second.IS_3P_SPLITTED:
                    second.IS_SPLITTED = False
        elif gsgsize == 1:
            # a partialGSG of only a single CBG (first or last CBG).
            # Explicitly test for size 1: an empty partialGSG has no
            # codingblockgraphs[0] and must be left alone.
            partialGSG.codingblockgraphs[0]._CBGinterface5p = None
            partialGSG.codingblockgraphs[0]._CBGinterface3p = None

    # Done with this function. Return the (incremented) partialGSG
    return partialGSG
    def split_final_cbg_on_spanningrange_difference(self,
        sprdif_min_aa_length=CBG_FINAL_SPRDIF_MIN_AA_LENGTH,
        sprdif_min_node_count=CBG_FINAL_SPRDIF_MIN_NODE_COUNT,
        sprdif_min_gtid_ratio=0.55,
        only_perform_if_stopcodon_tw_ratio_lte=CBG_FINAL_SPRDIF_ONLY_IF_STOP_TW_RATIO_LTE,
        only_preform_if_cbg_id_gte=CBG_FINAL_SPRDIF_ONLY_IF_CBG_ID_GTE ):
        """
        Split the final CBG on its right ('rigth') spanningrange difference
        and try to add the CBG(s) found in that region to the genestructure.

        The final CBG is split iteratively; each split (except the first one)
        is completed by a cbghmmsearch into a PacbpCollection, converted into
        accepted CBGs, and each accepted CBG is offered to the genestructure
        as the new final CBG.

        @type  sprdif_min_aa_length: integer
        @param sprdif_min_aa_length: minimal length of the sprdif in aa's;
                    also used as pacbp_min_length in the cbghmmsearch

        @type  sprdif_min_node_count: integer
        @param sprdif_min_node_count: minimal number of nodes in a CBG to be
                    eligible for trying a split (passed on to the sprdif
                    check and to the iterative split)

        @type  sprdif_min_gtid_ratio: float
        @param sprdif_min_gtid_ratio: skip a split CBG when its genetree
                    identity divided by the current final CBG's genetree
                    identity is below this ratio; a falsy value disables
                    this check

        @type  only_perform_if_stopcodon_tw_ratio_lte: float
        @param only_perform_if_stopcodon_tw_ratio_lte: run function only when
                    lastCBG._stopcodongraph.total_weight() divided by
                    self.EXACT_SG_EDGE_COUNT is <= threshold; a falsy value
                    disables this check

        @type  only_preform_if_cbg_id_gte: float
        @param only_preform_if_cbg_id_gte: run function only when
                    lastCBG.genetree().identity() >= threshold; a falsy value
                    disables this check

        @rtype:  Boolean
        @return: True when at least one new CBG was added, False otherwise
        """
        # get the CBG that is labelled as IS_LAST=True
        current_last = self.get_final_cbg()

        # check if we are allowed to perform this function:
        # for groups of genes with very low identity, this function
        # is more likely to decrease the result than to improve the result

        # make AlignedStopCodonGraph
        current_last.align_stop_codons()
        tw_current = current_last._stopcodongraph.total_weight()
        # NOTE(review): on Python 2 `/` truncates when both operands are
        # integers - confirm that total_weight() returns a float
        ratio = tw_current / self.EXACT_SG_EDGE_COUNT

        # now check if it is allowed to enter the function: only_perform_if_...
        if only_perform_if_stopcodon_tw_ratio_lte and ratio > only_perform_if_stopcodon_tw_ratio_lte:
            return False
        if only_preform_if_cbg_id_gte and current_last.genetree().identity() < only_preform_if_cbg_id_gte:
            return False

        # check for a right sprdif of the requested size; if not => return False
        if not current_last.has_rigth_spanningrange_difference(
                sprdif_min_aa_length=sprdif_min_aa_length,
                sprdif_min_node_count=sprdif_min_node_count):
            # no right spanningrange difference -> done & return
            return False

        # make a deepcopy and clear cache of the one that will be processed;
        # current_last itself is left as it is
        last = deepcopy(current_last)
        last.clear_cache()

        # iteratively split the copy on its right side
        splits = last.iteratively_split_codingblock_on_spanningrange_difference(
                side='rigth',
                sprdif_min_aa_length=sprdif_min_aa_length,
                sprdif_min_node_count=sprdif_min_node_count,
                )

        # was the split successful?
        if len(splits) == 1:
            # only a single element in return => no splits => done here!
            return False

        # when here, process the sprdif CBGs:
        # 1) cbghmmsearch2pacbpcollection
        # 2) pacbpCollection2acceptedcbgs
        all_accepted_cbgs = []
        # loop over the splits; except for the leftmost one (the input `last` CBG)
        for splittedCBG in splits[1:]:
            # low similarity regions are not processed
            if splittedCBG.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue
            # complete with cbghmmsearch2pacbpcollection

            # get ratio of the genetree identity of this CBG versus the
            # genetree identity of the current final CBG
            # (`ratio` is re-used; the stop codon ratio is no longer needed)
            ratio = splittedCBG.genetree().identity() / current_last.genetree().identity()

            # if ratio is bad -> do not perform!
            if sprdif_min_gtid_ratio and ratio < sprdif_min_gtid_ratio:
                continue

            pacbpCollection = cbghmmsearch2pacbpcollection(splittedCBG,self.input,
                    prev=last,
                    pacbp_min_length=sprdif_min_aa_length,
                    hmmsearch_num_hits=3
                    )

            # get list of accepted CBGs
            accepted =  conversion.pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,prev=last)
            all_accepted_cbgs.extend( accepted )

        # if no accepted ones -> return False
        if not all_accepted_cbgs: return False

        # order graphs by total weight
        all_accepted_cbgs = ordering.order_graphlist_by_total_weight(all_accepted_cbgs)
        # and re-order on node occurrence: if a neighboring node is incorporated -> more likely!
        all_accepted_cbgs = ordering.reorder_cbgs_on_node_occurrence(all_accepted_cbgs,prev=last)

        # and now try to add the accepted cbgs into the genestructure.
        # speed up the process by creating a tiny GSG object of only the last CBG
        # but, set the _GENETREE attribute to the genetree of the main GSG
        from graph_genestructure import GenestructureOfCodingBlockGraphs
        lastGSG = GenestructureOfCodingBlockGraphs(self.input)
        lastGSG.add_codingblock(current_last)
        lastGSG._GENETREE = self._GENETREE
        RETURN_STATUS_CBG_IS_ADDED = False

        for cbgL in all_accepted_cbgs:
            # only CBGs with the same node count as the current final CBG
            # are allowed here!
            if cbgL.node_count() != current_last.node_count(): continue

            # first a dry run (only_try_adding=True) on the tiny GSG
            if lastGSG.add_codingblock(cbgL,only_try_adding=True,
                max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                ):
                # it is addable; prepare for final addition to the genestructure:
                # cbgL becomes the new final CBG, current_last no longer is
                lsrCBG = None
                cbgL.IS_SPLITTED     = False
                cbgL.IS_5P_SPLITTED  = False
                cbgL.IS_FIRST        = False
                cbgL.IS_LAST         = True
                current_last.IS_LAST = False
                # if cbgL has no nodes that are absent from current_last
                # -> label both as splitted and create a lsrCBG in between
                if not cbgL.node_set().difference(current_last.get_nodes()):
                    current_last.IS_SPLITTED    = True
                    current_last.IS_3P_SPLITTED = True
                    cbgL.IS_SPLITTED            = True
                    cbgL.IS_5P_SPLITTED         = True
                    lsrCBG = graphAbgp.codingblock_splitting.create_intermediate_lowsimilarity_region(
                            current_last, cbgL )
                    # an lsrCBG without nodes is discarded
                    if not lsrCBG.node_count():
                        lsrCBG = None
                # now add the new last CBG, to the main GSG and to the tiny GSG
                # NOTE(review): the status of the addition to the main GSG is
                # overwritten by the status of the addition to lastGSG below;
                # only the latter decides on RETURN_STATUS_CBG_IS_ADDED
                status = self.add_codingblock(cbgL,
                        max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                        max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                        min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                        )
                status = lastGSG.add_codingblock(cbgL,
                        max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                        max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                        min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                        )
                # if added, update the return value (RETURN_STATUS_CBG_IS_ADDED)
                if status:
                    RETURN_STATUS_CBG_IS_ADDED = True
                    # unconditional output of the added CBG and its split flags
                    print cbgL
                    print cbgL.IS_5P_SPLITTED, cbgL.IS_SPLITTED, cbgL.IS_3P_SPLITTED
                # and add the intermediate lsrCBG when available
                if lsrCBG:
                    statusMainGSG = self.add_codingblock(lsrCBG)
                    statusLastGSG = lastGSG.add_codingblock(lsrCBG)
                    print "lsrCBG added:", statusMainGSG, statusLastGSG
            else:
                # not placeable in the genestructure
                pass

        # In exceptional cases, 2 CBGs can be added. In case the node_set() is
        # identical, yet another lsrCBG has to be created in between these 2
        # new CBGs. Check this in the main GSG and NOT in the lastGSG: when a
        # lsrCBG is added there, splits are added to the surrounding CBGs.
        # Because of call-by-reference, these splits are added to the main GSG
        # (self) too, and adding the same lsrCBG will fail (splitted CBGs are
        # skipped!).
        if RETURN_STATUS_CBG_IS_ADDED:
            self.finalize_genestructure()
            if self.join_false_inframe_introns():
               print "EXTRA lsrCBG added!!"
            # recreate interfaces if there is a new one created
            self.create_cbginterfaces()


        # return the return status True|False
        return RETURN_STATUS_CBG_IS_ADDED
예제 #4
0
def pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,
                                              gtg=None,
                                              prev=None,
                                              next=None,
                                              max_cbg_gtg_topo_dif=None,
                                              max_cbg_gtg_abs_dif=None,
                                              min_cbg_gtg_id_ratio=None):
    """
    Convert a PacbpCollection into a list of accepted CodingBlockGraphs.

    Fully connected subgraphs are taken from the collection, completed with
    pacbps harvested from a deepcopy of the collection, ordered, and - only
    when a `gtg` is given - filtered on how well their genetree matches it.

    @param pacbpCollection: collection to take the subgraphs from

    @param gtg: GeneTreeGraph to compare each CBG's genetree with; when
                falsy, none of the three thresholds below is applied

    @param prev: passed on to ordering.reorder_cbgs_on_node_occurrence

    @param next: passed on to ordering.reorder_cbgs_on_node_occurrence

    @param max_cbg_gtg_topo_dif: when truthy, reject a CBG whose
                gtg.graphalignmentdifference() exceeds this value

    @param max_cbg_gtg_abs_dif: when truthy, reject a CBG whose
                gtg.absolutegraphalignmentdifference() exceeds this value

    @param min_cbg_gtg_id_ratio: when truthy, reject a CBG whose
                genetree identity / gtg identity is below this value

    @rtype:  list
    @return: accepted CBGs, in the order produced by the `ordering` helpers
    """
    # a complete CBG has one node per organism; the subgraph search is asked
    # for (node count - 1) edges and tolerates no missing edges
    node_total = pacbpCollection.organism_set_size()
    edge_total = node_total - 1

    # the copy is made *before* the subgraph search; pacbps are harvested
    # from this copy further down
    collection_copy = deepcopy(pacbpCollection)
    subgraphs = pacbpCollection.find_fully_connected_subgraphs(
        edges=edge_total, max_missing_edges=0)

    def _is_candidate(graph):
        """ True for a complete, saturated CBG that is not a collection """
        if graph.node_count() != node_total:
            return False
        if graph.__class__.__name__ == 'PacbpCollectionGraph':
            return False
        return not graph.connectivitysaturation() < 1.0

    def _passes_gtg_checks(graph):
        """ True when `graph` passes all requested comparisons with `gtg` """
        if not gtg:
            return True
        if max_cbg_gtg_topo_dif:
            difference = gtg.graphalignmentdifference(graph.genetree())
            if difference > max_cbg_gtg_topo_dif:
                return False
        if max_cbg_gtg_abs_dif:
            difference = gtg.absolutegraphalignmentdifference(graph.genetree())
            if difference > max_cbg_gtg_abs_dif:
                return False
        if min_cbg_gtg_id_ratio:
            id_ratio = graph.genetree().identity() / gtg.identity()
            if id_ratio < min_cbg_gtg_id_ratio:
                return False
        return True

    # complete the candidate subgraphs with pacbps and update edge weights
    usable = []
    for graph in subgraphs:
        if not _is_candidate(graph):
            continue
        graph.harvest_pacbps_from_pacbpcollection(collection_copy)
        if graph.has_overall_minimal_spanning_range() and graph.has_all_pacbps():
            graph.update_edge_weights_by_minimal_spanning_range()
            usable.append(graph)

    # order by total weight first, then re-order on node occurrence
    # (a CBG that incorporates a neighboring node is more likely)
    by_weight = ordering.order_graphlist_by_total_weight(usable)
    ranked = ordering.reorder_cbgs_on_node_occurrence(
        by_weight, prev=prev, next=next)

    # whatever survives the gtg comparison is accepted
    return [graph for graph in ranked if _passes_gtg_checks(graph)]