Exemplo n.º 1
0
def _is_intermediate_overlapping_cbg_a_gsg_scaffold_enrichment(gsg,
    cbgA,cbgB,cbgC,minimal_scaffold_aa_enrichment=5):
    """
    Does the intermediate cbgB enlarge the scaffold formed by cbgA and cbgC?

    The overall minimal spanning range (OMSR) of the series cbgA,cbgB,cbgC
    is compared to the OMSR of the series cbgA,cbgC. Only the organisms of
    the nodes that cbgA and cbgC have in common are taken into account.

    @type  gsg: GenestructureOfCodingBlockGraphs
    @param gsg: GSG that provides input, _GENETREE and organism_by_node()

    @type  cbgA: CodingBlockGraph
    @param cbgA: first CBG of the series

    @type  cbgB: CodingBlockGraph
    @param cbgB: intermediate CBG of the series

    @type  cbgC: CodingBlockGraph
    @param cbgC: last CBG of the series

    @type  minimal_scaffold_aa_enrichment: integer
    @param minimal_scaffold_aa_enrichment: minimal OMSR length gain that
            cbgB must contribute for at least a single organism

    @rtype:  Boolean
    @return: True when, for at least one organism, the OMSR length gain is
             >= minimal_scaffold_aa_enrichment. False otherwise, including
             when an argument evaluates to False or when cbgA and cbgC do
             not share any node
    """
    # an argument that is missing (most likely cbgA or cbgC) -> False
    if not gsg or not cbgA or not cbgB or not cbgC:
        return False
    # without mutual nodes, cbgA and cbgC are not a suitable scaffold
    if not cbgA.mutual_nodes(cbgC):
        return False

    # small helper GSG that borrows input and genetree from the main GSG
    from graph_genestructure import GenestructureOfCodingBlockGraphs
    helperGSG = GenestructureOfCodingBlockGraphs(gsg.input)
    helperGSG.codingblockgraphs = [ cbgA, cbgB, cbgC ]
    helperGSG._GENETREE = gsg._GENETREE
    omsr_with_b = helperGSG.overall_minimal_spanning_range()
    helperGSG.codingblockgraphs = [ cbgA, cbgC ]
    omsr_without_b = helperGSG.overall_minimal_spanning_range()

    # OMSR length gain per organism of the nodes shared by cbgA and cbgC
    gains = []
    for shared_node in cbgA.mutual_nodes(cbgC):
        organism = gsg.organism_by_node(shared_node)
        gains.append(
            len(omsr_with_b[organism]) - len(omsr_without_b[organism]) )

    # a single organism with sufficient gain is enough
    enriched = [ gain >= minimal_scaffold_aa_enrichment for gain in gains ]
    return True in enriched
Exemplo n.º 2
0
    def split_final_cbg_on_spanningrange_difference(self,
        sprdif_min_aa_length=CBG_FINAL_SPRDIF_MIN_AA_LENGTH,
        sprdif_min_node_count=CBG_FINAL_SPRDIF_MIN_NODE_COUNT,
        sprdif_min_gtid_ratio=0.55,
        only_perform_if_stopcodon_tw_ratio_lte=CBG_FINAL_SPRDIF_ONLY_IF_STOP_TW_RATIO_LTE,
        only_preform_if_cbg_id_gte=CBG_FINAL_SPRDIF_ONLY_IF_CBG_ID_GTE ):
        """
        Split the final CBG on its rigth spanningrange difference and (try to)
        add the resulting CBG(s) as new final CBG(s) to the genestructure

        A deepcopy of the final CBG is split iteratively; the split products
        are completed by cbghmmsearch2pacbpcollection() and converted into
        accepted CBGs, which are then offered to the genestructure.

        @type  sprdif_min_aa_length: integer
        @param sprdif_min_aa_length: minimal length of the sprdif in aa's;
                also used as pacbp_min_length in the HMM search step

        @type  sprdif_min_node_count: integer
        @param sprdif_min_node_count: passed on to
                has_rigth_spanningrange_difference() and to
                iteratively_split_codingblock_on_spanningrange_difference();
                presumably the minimal number of nodes that must have the
                sprdif for a split to be tried (verify in those functions)

        @type  sprdif_min_gtid_ratio: float
        @param sprdif_min_gtid_ratio: a split CBG is skipped when its
                genetree().identity(), divided by that of the current final
                CBG, is lower than this ratio; a False value disables the check

        @type  only_perform_if_stopcodon_tw_ratio_lte: float
        @param only_perform_if_stopcodon_tw_ratio_lte: run function only when
                lastCBG._stopcodongraph.total_weight() / self.EXACT_SG_EDGE_COUNT
                <= threshold; a False value disables the check

        @type  only_preform_if_cbg_id_gte: float
        @param only_preform_if_cbg_id_gte: run function only when
                lastCBG.genetree().identity() >= threshold; a False value
                disables the check

        @rtype:  Boolean
        @return: True when a new final CBG is added, False otherwise
        """
        # get the CBG that is labelled as IS_LAST=True
        current_last = self.get_final_cbg()

        # check if we are allowed to perform this function
        # for groups of genes with very low identity, this function
        # is more likely to decrease the result than to improve the result

        # make AlignedStopCodonGraph
        current_last.align_stop_codons()
        tw_current = current_last._stopcodongraph.total_weight()
        # NOTE(review): in Python 2 this is an integer division when both
        # operands are integers -- confirm that total_weight() returns a float
        ratio = tw_current / self.EXACT_SG_EDGE_COUNT

        # now check if it is allowed to enter the function: only_perform_if_...
        # (a threshold that evaluates to False switches its check off)
        if only_perform_if_stopcodon_tw_ratio_lte and ratio > only_perform_if_stopcodon_tw_ratio_lte:
            return False
        if only_preform_if_cbg_id_gte and current_last.genetree().identity() < only_preform_if_cbg_id_gte:
            return False


        # check for rigth sprdif of requested size; if not => return False
        if not current_last.has_rigth_spanningrange_difference(
                sprdif_min_aa_length=sprdif_min_aa_length,
                sprdif_min_node_count=sprdif_min_node_count):
            # no rigth spanningrange difference -> done & return
            return False

        # make a deepcopy and clear cache of the one that will be processed;
        # the original `current_last` stays untouched while splitting
        last = deepcopy(current_last)
        last.clear_cache()

        # iteratively split
        splits = last.iteratively_split_codingblock_on_spanningrange_difference(
                side='rigth',
                sprdif_min_aa_length=sprdif_min_aa_length,
                sprdif_min_node_count=sprdif_min_node_count,
                )

        # was the split successful?
        if len(splits) == 1:
            # no splits => done here!
            return False

        # when here process the sprdif CBGs
        # 1) cbghmmsearch2pacbpcollection
        # 2) pacbpCollection2acceptedcbgs
        all_accepted_cbgs = []
        # loop over the splits; except for the most left one (the input `last` CBG)
        for splittedCBG in splits[1:]:
            # lsrCBGs are not completed by a HMM search
            if splittedCBG.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue
            # complete with cbghmmsearch2pacbpcollection

            # get ratio of the GTG of this CBG
            # NOTE(review): raises ZeroDivisionError when the identity of
            # current_last is 0 -- confirm that this can not occur
            ratio = splittedCBG.genetree().identity() / current_last.genetree().identity()

            # if ratio is bad -> do not perform!
            if sprdif_min_gtid_ratio and ratio < sprdif_min_gtid_ratio:
                continue

            pacbpCollection = cbghmmsearch2pacbpcollection(splittedCBG,self.input,
                    prev=last,
                    pacbp_min_length=sprdif_min_aa_length,
                    hmmsearch_num_hits=3
                    )

            # get list of accepted CBGs
            accepted =  conversion.pacbpCollection2AcceptedCodingBlockGraphs(pacbpCollection,prev=last)
            all_accepted_cbgs.extend( accepted )

        # if no accepted ones -> return False
        if not all_accepted_cbgs: return False

        # order graphs by total weight
        all_accepted_cbgs = ordering.order_graphlist_by_total_weight(all_accepted_cbgs)
        # and re-order on node occurrence: if a neighboring node is incorporated -> more likely!
        all_accepted_cbgs = ordering.reorder_cbgs_on_node_occurrence(all_accepted_cbgs,prev=last)

        # and now try to add the accepted cbgs into the genestructure
        # speedup the process by creating a tinyGSG object of only the last CBG
        # but, set the _GENETREE attribute to the genetree of the main GSG
        from graph_genestructure import GenestructureOfCodingBlockGraphs
        lastGSG = GenestructureOfCodingBlockGraphs(self.input)
        lastGSG.add_codingblock(current_last)
        lastGSG._GENETREE = self._GENETREE
        RETURN_STATUS_CBG_IS_ADDED = False

        for cbgL in all_accepted_cbgs:
            # only CBGs with the same number of nodes as the current final
            # CBG are allowed here!
            if cbgL.node_count() != current_last.node_count(): continue

            # dry run (only_try_adding=True) on the tiny lastGSG first
            if lastGSG.add_codingblock(cbgL,only_try_adding=True,
                max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                ):
                # it is addable; prepare for final addition to the genestructure
                # cbgL becomes the new final CBG, current_last loses that status
                lsrCBG = None
                cbgL.IS_SPLITTED     = False
                cbgL.IS_5P_SPLITTED  = False
                cbgL.IS_FIRST        = False
                cbgL.IS_LAST         = True
                current_last.IS_LAST = False
                # if identical nodes -> create a lsrCBG
                if not cbgL.node_set().difference(current_last.get_nodes()):
                    current_last.IS_SPLITTED    = True
                    current_last.IS_3P_SPLITTED = True
                    cbgL.IS_SPLITTED            = True
                    cbgL.IS_5P_SPLITTED         = True
                    lsrCBG = graphAbgp.codingblock_splitting.create_intermediate_lowsimilarity_region(
                            current_last, cbgL )
                    # a lsrCBG without nodes is of no use
                    if not lsrCBG.node_count():
                        lsrCBG = None
                # now add the new last CBG
                # NOTE(review): the status of the addition to the main GSG
                # (self) is overwritten by the status of the addition to
                # lastGSG directly below, so the return value of this function
                # reflects lastGSG only -- confirm that this is intended
                status = self.add_codingblock(cbgL,
                        max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                        max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                        min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                        )
                status = lastGSG.add_codingblock(cbgL,
                        max_cbg_gtg_topo_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_TOPO_DIF,
                        max_cbg_gtg_abs_dif=self.MAX_CBG_FINAL_SPRDIF_GTG_ABS_DIF,
                        min_cbg_gtg_id_ratio=self.MIN_CBG_FINAL_SPRDIF_GTG_ID_RATIO,
                        )
                # if added, update the return value (RETURN_STATUS_CBG_IS_ADDED)
                # NOTE(review): the print statements below are unconditional
                # debugging output to STDOUT (there is no verbose switch)
                if status:
                    RETURN_STATUS_CBG_IS_ADDED = True
                    print cbgL
                    print cbgL.IS_5P_SPLITTED, cbgL.IS_SPLITTED, cbgL.IS_3P_SPLITTED
                # and add the intermediate lsrCBG when available
                if lsrCBG:
                    statusMainGSG = self.add_codingblock(lsrCBG)
                    statusLastGSG = lastGSG.add_codingblock(lsrCBG)
                    print "lsrCBG added:", statusMainGSG, statusLastGSG
            else:
                # not placeable in the genestructure
                pass

        # in exceptional cases, 2 CBGs can be added. In case the node_set() is identical,
        # yet another lsrCBG has to be created in between these 2 new CBGs.
        # Check this in the main GSG (NOT in the lastGSG): when a lsrCBG is added here,
        # splits are added to the surrounding CBGs. Because of call-by-reference, these
        # splits are added to the main GSG (self) too, and adding the same lsrCBG
        # will fail (splitted CBGs are skipped!)
        if RETURN_STATUS_CBG_IS_ADDED:
            self.finalize_genestructure()
            if self.join_false_inframe_introns():
               print "EXTRA lsrCBG added!!"
            # recreate interfaces if there is a new one created
            self.create_cbginterfaces()


        # return the return status True|False
        return RETURN_STATUS_CBG_IS_ADDED
Exemplo n.º 3
0
    def replace_scaffold_breaking_cbgs(self,verbose=False):
        """
        (Try) to replace CBG that break the GSG scaffold by other CBGs

        The GSG is walked from its one-but-last position back to the first.
        For each CBG (lsrCBGs and CBGs with less than
        self.EXACT_SG_NODE_COUNT nodes are skipped) the first CBG further on
        in the GSG that shares nodes with it is looked up. When there are
        CBGs in between these two, the intermediate CBGs are
        - removed, when the omsr distance between both CBGs is <= 1 for
          all organisms, or
        - replaced by the CBG(s) that
          create_large_intermediate_cbg_for_scaffold_gap() finds in this
          gap, when the summed total_weight of the new CBG(s) is higher.

        @attention: self.codingblockgraphs is changed in place

        @type  verbose: Boolean
        @param verbose: print status/debugging messages to STDOUT

        @rtype:  Boolean
        @return: Is any CBG removed or replaced?
        """
        # Boolean return value
        scaffold_breaking_cbg_replaced = False
        lsr_classname = 'LowSimilarityRegionCodingBlockGraph'

        # Positions are visited from high to low and all list changes are
        # made behind (at higher positions than) the current position, so
        # the positions that still have to be visited remain valid.
        for cbgpos in self.cbgpositerator(reversed=True)[1:]:
            cbg = self.codingblockgraphs[cbgpos]
            if cbg.__class__.__name__ == lsr_classname:
                continue
            if cbg.node_count() < self.EXACT_SG_NODE_COUNT:
                continue

            # loop forwards through the GSG and look for the first CBG
            # that has mutual nodes with this CBG
            backwardspos = None
            for comparepos in range(cbgpos+1,len(self)):
                if cbg.mutual_nodes(self.codingblockgraphs[comparepos]):
                    backwardspos = comparepos
                    break
            if backwardspos is None:
                continue    # final CBG or no mutual nodes at all
            if backwardspos == cbgpos+1:
                continue    # direct neighbor has mutual nodes
            comparecbg = self.codingblockgraphs[backwardspos]

            # when here, at least 1 CBG is located in between `cbg` and
            # `comparecbg`; get total_weights of the intermediate CBGs
            tws = [ self.codingblockgraphs[_pos].total_weight() for\
                    _pos in range(cbgpos+1,backwardspos) ]
            ####################################################################
            if verbose:
                # list like [ False, ... True ]; a False for each
                # intermediate CBG
                identical_nodes = [False]*(backwardspos-cbgpos-1) + [True]
                print("%s %s %s" % (cbgpos, backwardspos, identical_nodes))
                print(cbg)
                print(tws)
                print(comparecbg)
            ####################################################################

            # first, check if the CBGs are already partially overlapping
            # if so, get rid of the intermediate CBG(s)
            omsrdist = cbg.omsr_distance_between_codingblocks(comparecbg)
            if max(omsrdist.values()) <= 1:
                # yes, all organisms glue these CBGs perfectly together
                # just remove the intermediate(s) without further checks
                cbg._CBGinterface3p        = None
                comparecbg._CBGinterface5p = None
                self.codingblockgraphs[cbgpos+1:backwardspos+1] =\
                        [ comparecbg ]
                scaffold_breaking_cbg_replaced = True
                # go to the next cbg in the list
                continue

            # do a more elaborate check by trying to create a CBG
            # in this large_scaffold_gap
            from graph_genestructure import GenestructureOfCodingBlockGraphs
            partialGSG = GenestructureOfCodingBlockGraphs(self.input)
            partialGSG.codingblockgraphs = [ cbg, comparecbg ]
            partialGSG._GENETREE = self._GENETREE
            partialGSG.create_large_intermediate_cbg_for_scaffold_gap(
                    sprdif_min_node_count = 2,
                    cbg_min_node_count = self.EXACT_SG_NODE_COUNT,
                    verbose = verbose
                    )
            if len(partialGSG) == 2:
                ################################################################
                if verbose: print("NO scaffold CBGs found!")
                ################################################################
                continue

            new_cbgs = partialGSG.codingblockgraphs[1:-1]
            new_tws  = [ _cbg.total_weight() for _cbg in new_cbgs ]
            is_improvement = sum(new_tws) > sum(tws)
            if is_improvement:
                # replace!
                # NOTE(review): unlike the removal above, the cbgInterfaces
                # of `cbg` and `comparecbg` are not reset here -- confirm
                # that this is taken care of elsewhere
                self.codingblockgraphs[cbgpos+1:backwardspos] = new_cbgs
                scaffold_breaking_cbg_replaced = True
            ####################################################################
            if verbose:
                if is_improvement:
                    print("REPLACING scaffold-breaking CBGs!!!")
                else:
                    print("MAINTAINING scaffold-breaking CBGs!!!")
                for partialcbg in partialGSG: print(partialcbg)
            ####################################################################

        # return if CBGs are removed or replaced
        return scaffold_breaking_cbg_replaced
Exemplo n.º 4
0
    def separate_ds_and_us_gsg(self):
        """
        Move all CBGs that have status IS_IGNORED into separate GSGs

        The leading run of IS_IGNORED CBGs (position 0 up to the first CBG
        that is not IS_IGNORED) is moved to dsGSG, the trailing run (after
        the last CBG that is not IS_IGNORED) to usGSG, and all remaining
        IS_IGNORED CBGs to etcGSG. When all CBGs are IS_IGNORED, all of them
        end up in etcGSG.

        @attention: self.codingblockgraphs is changed in place

        @attention: LowSimilarityRegionCodingBlockGraph directly next to an
                    IS_IGNORED CBG are set to IS_IGNORED too; those that are
                    not required anymore are deleted!

        @attention: CodingBlockGraphInterface objects (and forced ends)
                    around removed CBGs are reset to None ({})!

        @attention: IS_IGNORED is reset to False for the CBGs placed in
                    dsGSG and usGSG, but NOT for the CBGs placed in etcGSG

        @rtype:  tuple
        @return: ( dsGSG, usGSG, etcGSG ), 3 (possibly empty)
                 GenestructureOfCodingBlockGraphs
        """
        # create empty GSG to place IS_IGNORED CBGs in
        from graph_genestructure import GenestructureOfCodingBlockGraphs
        dsGSG  = GenestructureOfCodingBlockGraphs(self.input)
        usGSG  = GenestructureOfCodingBlockGraphs(self.input)
        etcGSG = GenestructureOfCodingBlockGraphs(self.input)

        # if no CBGs in GSG -> return all empty ones
        if len(self) == 0: return dsGSG, usGSG, etcGSG

        lsr_classname = 'LowSimilarityRegionCodingBlockGraph'

        # check if there is any LowSimilarityRegionCodingBlockGraph directly
        # next to a CBG that has status IS_IGNORED. If so, set the lsrCBG
        # status to IS_IGNORED too!
        for pos in range(0,len(self)):
            if not self.codingblockgraphs[pos].IS_IGNORED: continue
            for neighborpos in ( pos-1, pos+1 ):
                if 0 <= neighborpos < len(self):
                    neighbor = self.codingblockgraphs[neighborpos]
                    if neighbor.__class__.__name__ == lsr_classname:
                        neighbor.IS_IGNORED = True

        # Separate the leading run of IS_IGNORED CBGs in dsGSG: all CBGs
        # in front of the first CBG that is not IS_IGNORED
        first_kept = None
        for pos in range(0,len(self)):
            if not self.codingblockgraphs[pos].IS_IGNORED:
                first_kept = pos
                break
        # first_kept == 0    -> first CBG is not IS_IGNORED -> no dsGSG
        # first_kept == None -> all CBGs are IS_IGNORED; leave for etcGSG
        if first_kept:
            dsGSG.codingblockgraphs.extend(
                    self.codingblockgraphs[:first_kept] )
            del self.codingblockgraphs[:first_kept]
            for cbg in dsGSG.codingblockgraphs:
                cbg.IS_IGNORED = False
            # wipe out the cbgIF object on the 5p side of the first CBG
            self.codingblockgraphs[0]._CBGinterface5p = None
            self.codingblockgraphs[0]._forced_5p_ends = {}

        # Separate the trailing run of IS_IGNORED CBGs in usGSG: all CBGs
        # behind the last CBG that is not IS_IGNORED
        last_kept = None
        for pos in range(len(self)-1,-1,-1):
            if not self.codingblockgraphs[pos].IS_IGNORED:
                last_kept = pos
                break
        if last_kept is not None and last_kept < len(self)-1:
            usGSG.codingblockgraphs.extend(
                    self.codingblockgraphs[last_kept+1:] )
            del self.codingblockgraphs[last_kept+1:]
            for cbg in usGSG.codingblockgraphs:
                cbg.IS_IGNORED = False
            # wipe out the cbgIF object on the 3p side of the last CBG
            self.codingblockgraphs[-1]._CBGinterface3p = None
            self.codingblockgraphs[-1]._forced_3p_ends = {}

        # check dsGSG and usGSG; if one of the exterior CBGs is an lsrCBG,
        # remove it, because it is non-sense now. The check on an empty
        # list prevents an IndexError once the only CBG has been removed.
        for sideGSG in ( dsGSG, usGSG ):
            for endpos in ( 0, -1 ):
                if sideGSG.codingblockgraphs and\
                sideGSG.codingblockgraphs[endpos].__class__.__name__ ==\
                lsr_classname:
                    sideGSG.codingblockgraphs.pop(endpos)

        # check for intermediate IS_IGNORED CBGs in the main GSG and place
        # in etcGSG; walk backwards so positions stay valid while popping
        for pos in range(len(self)-1,-1,-1):
            if not self.codingblockgraphs[pos].IS_IGNORED: continue
            removed_cbg = self.codingblockgraphs.pop(pos)
            if removed_cbg.__class__.__name__ != lsr_classname:
                # only place non-lsrCBGs in the etcGSG
                # intermediate lsrCBGs that are IS_IGNORED are now nonsense
                etcGSG.codingblockgraphs.insert(0, removed_cbg )

            # wipe out potential cbgIF objects surrounding this CBG
            if pos > 0:
                self.codingblockgraphs[pos-1]._CBGinterface3p = None
                self.codingblockgraphs[pos-1]._forced_3p_ends = {}
            # After the pop(), position `pos` holds the former next CBG.
            # It only exists when pos < len(self); testing with `<=` gave
            # an IndexError when the removed CBG was the last in the list.
            if pos < len(self):
                self.codingblockgraphs[pos]._CBGinterface5p = None
                self.codingblockgraphs[pos]._forced_5p_ends = {}

        # and return dsGSG, usGSG, etcGSG
        return dsGSG, usGSG, etcGSG