示例#1
0
def create_pacbpcollectiongraph_from_crossdata(
    crossdata,
    increment_to_graph=None):
    """
    Create (incremental) PacbpCollectionGraph from crossdata dict structure

    For every (orgA,orgB) key of crossdata, the keys of its 'accepted_pacbs'
    dict are unpacked as (bitscore,length,pointerA,pointerB). The nodes
    (orgA,pointerA) and (orgB,pointerB) are added to the graph when missing
    and an edge with weight `bitscore` is created between them. When the edge
    exists already, its weight is only replaced by a higher bitscore.

    @type  crossdata: dict
    @param crossdata: crossdata <dict data structure>

    @type  increment_to_graph: None or PacbpCollectionGraph
    @param increment_to_graph: when applied, increment crossdata to applied PCG

    @rtype  g: PacbpCollectionGraph
    @return g: PacbpCollectionGraph (`increment_to_graph` itself when applied)
    """
    # Compare to None explicitly: a supplied graph that evaluates as False
    # (e.g. an empty one) must be incremented, not replaced by a new graph.
    if increment_to_graph is not None:
        # add new nodes to existing graph
        g = increment_to_graph
    else:
        # create a new blank graph
        from graphAbgp import PacbpCollectionGraph
        g = PacbpCollectionGraph()

    # obtain the present nodes only once and keep track of additions;
    # this avoids a scan of g.get_nodes() for every single key
    known_nodes = set(g.get_nodes())

    for (orgA,orgB), pairdata in crossdata.items():
        # keys are sorted in order to start with the highest bitscore
        for key in sorted(pairdata['accepted_pacbs'], reverse=True):
            (bitscore,_length,pointerA,pointerB) = key
            nodeA = (orgA,pointerA)
            nodeB = (orgB,pointerB)
            # check if (org,ORF) node exist already
            for node in (nodeA,nodeB):
                if node not in known_nodes:
                    g.add_node(node)
                    known_nodes.add(node)
            if g.has_edge(nodeA,nodeB):
                # keep the highest bitscore as weight of this edge
                if bitscore > g.get_edge_weight(nodeA,nodeB):
                    g.set_edge_weight(nodeA,nodeB,bitscore)
            else:
                # and create a new edge
                g.add_edge(nodeA,nodeB,wt=bitscore)
    # ready!
    return g
示例#2
0
def get_reverse_cbg(cbg,frame,verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength()/2)*3
    orfs = get_reverse_strand_orfsets(cbg,frame,min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org,cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(),fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs ]
        ########################################################################

    revpacbps = {}
    for orgQ,orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence,
                        dbname="./"+blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">",""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength()/2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = ( orgQ, orfQ.protein_startPY )
                nodeS = ( orgS, orfS.protein_startPY )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=cbg.node_count()-1 , max_missing_edges=0 )
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
                cbgList.codingblockgraphs[0])
示例#3
0
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of if Orfs) compared to this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """

    # get elegiable lists of Orfs
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are elgiable...
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[orf.id for orf in orfs[org].orfs],
            print[str(orf) for orf in orfs[org].orfs]
        ########################################################################

    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">", "").split(" ")[0].replace(
                    "_", "")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ,
                                                       node2=nodeS)[0]

                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose: print "NEW:", pacbporf
                    ############################################################

                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        ############################################################
        if verbose: print "org_set_size() PCG < CBG"
        ############################################################
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList:
            print fscbg
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
        return None
    elif cbgList and not cbgList[0].node_set().symmetric_difference(
            cbg.node_set()):
        # best CBG is not frameshifted, but CBG itself
        return None
    else:
        # score the difference between the frameshifted and current CBG
        score_cbg = cbg.total_weight() * cbg.omsr_identityscore()
        score_fscbg = cbgList[0].total_weight(
        ) * cbgList[0].omsr_identityscore()
        # check overlap between the frameshifted and current CBG
        a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg)

        ########################################################################
        if verbose:
            print "CBG", cbg
            cbg.printmultiplealignment()
            for fscbg in cbgList:
                print "fsCBG:", fscbg
                fscbg.printmultiplealignment()
        ########################################################################

        if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((0, 0, 1),
                                                          (1, 0, 0)):
            # CBG and frameshifted CBG do not share a single AA overlap...
            # This does not represent a frameshifted CBG as we searched for
            return False
        elif score_fscbg > score_cbg:
            # return the highest scoring, frameshifted CBG
            return cbgList[0]
        else:
            # no, still not convinced that this is a frameshifted CBG
            return False
def detect_and_remove_single_nonfinal_inwpcbg(inwpcbgs,PCG,GENE_IDENTIFIER_SET,
    verbose=False):
    """
    Allow deletion of a very shitty, single inwpCBG from the end of the list
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    lastInwpCBG = inwpcbgs[-1]
    prevInwpCBG = inwpcbgs[-2]

    lastNodeList = [ lastInwpCBG.get_organism_nodes(org)[0] for org in\
                lastInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    prevNodeList = [ prevInwpCBG.get_organism_nodes(org)[0] for org in\
                prevInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(lastNodeList).intersection(prevNodeList): return False

    ntdistdict    = prevInwpCBG.nt_spacing_between_codingblocks([lastInwpCBG])
    tcodedistdict = prevInwpCBG.tcode_spacing_between_codingblocks([lastInwpCBG])

    check1 = prevInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             lastInwpCBG.count_orfs_labeled_as_annotated_exon() 
    check2 = prevInwpCBG.get_bitscore() > lastInwpCBG.get_bitscore() 
    check3 = len(prevNodeList) > len(lastNodeList)
    check4 = float(lastInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = prevInwpCBG.get_projected_tailing_stop_aa_difference() <\
             lastInwpCBG.get_projected_tailing_stop_aa_difference()
    check8 = prevInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()<\
             lastInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()

    checklist = [check1,check2,check3,check4,check5,check6,check7,check8]

    ############################################################################
    if verbose: print "NonFinal inwpCBG check:", checklist
    ############################################################################
   
    if checklist.count(False) == 0:
        nonfinalPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
        # place all PacbPORFs in the nonfinalPCG
        for (pacbpkey,nodeQ,nodeS), pacbporf in lastInwpCBG.pacbps.iteritems():
            # add to noncodingnongenePCG
            nonfinalPCG.add_node(nodeQ)
            nonfinalPCG.add_node(nodeS)
            nonfinalPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            nonfinalPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
            # remove from main PCG
            _delete_pacbp(PCG,(pacbpkey,nodeQ,nodeS))
        # return nonfinalPCG
        return nonfinalPCG
    else:
        return False
def detect_and_remove_utrornonegene_inwpcbgs(inwpcbgs,PCG,verbose=True):
    """
    Move PacbPORFs of inwpCBGs assigned as 5' or 3' UTR/nongene out of the PCG

    The inwpCBGs returned by assign_utrornongene5p_inwpcbgs() and
    assign_utrornongene3p_inwpcbgs() are taken as noncoding/nongene. The
    PacbPORFs that occur in those inwpCBGs only (and in none of the remaining
    inwpCBGs) are copied into a new PacbpCollectionGraph and deleted from PCG.

    @type  inwpcbgs: list
    @param inwpcbgs: list of inwpCBGs; their Orfs are scanned for TSS sites

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PacbpCollectionGraph; PacbPORFs are deleted from it

    @type  verbose: Boolean
    @param verbose: not used in this function

    @rtype:  PacbpCollectionGraph or False
    @return: PacbpCollectionGraph with the removed PacbPORFs, or False when
             the input is empty or no inwpCBG was assigned
    """

    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # make sure all Orfs have predicted TSS sites
    for inwpCBG in inwpcbgs:
        inwpCBG.scan_orfs_for_pssm_tss(min_pssm_score=TSS_MIN_PSSM_SCORE)

    # detect inwpCBGs which are most likely 5' and 3' non coding or non gene
    ncng_list = assign_utrornongene5p_inwpcbgs(inwpcbgs)
    ncng_list.extend(assign_utrornongene3p_inwpcbgs(inwpcbgs))

    # return False if no inwpcbgs are assigned
    if not ncng_list: return False

    # get list of inwpCBGs that are NON ncng; inwpCBGs are recognized by
    # their string representation
    ncng_strs = set([ str(ncnginwpCBG) for ncnginwpCBG in ncng_list ])
    correct_inwpcbg_list = [ inwpcbg for inwpcbg in inwpcbgs
                             if str(inwpcbg) not in ncng_strs ]

    # get all pacbp keys belonging to noncoding / nongene inwpcbgs ONLY;
    # the list keeps the order of detection, the set makes lookups cheap
    ncng_pacbpkeys = []
    seen_pacbpkeys = set()
    for ncnginwpCBG in ncng_list:
        for pacbpkey in ncnginwpCBG.pacbps.keys():
            if pacbpkey in seen_pacbpkeys:
                continue
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps:
                    # occurring in a non-removed inwpCBG: do not delete
                    break
            else:
                seen_pacbpkeys.add(pacbpkey)
                ncng_pacbpkeys.append(pacbpkey)

    # place all ncng_pacbpkeys and PacbPORFs in the noncodingnongenePCG
    # and, at the same time, remove from the main PCG
    noncodingnongenePCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in ncng_pacbpkeys:
        if key not in PCG.pacbps:
            # Not present in the PCG (anymore), so there is nothing to move.
            # Same guard as in detect_and_remove_gtgdiscrepancy, where this
            # situation was observed; it avoids a KeyError here.
            continue
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to noncodingnongenePCG
        noncodingnongenePCG.add_node(nodeQ)
        noncodingnongenePCG.add_node(nodeS)
        noncodingnongenePCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        noncodingnongenePCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    # return noncodingnongenePCG
    return noncodingnongenePCG
def detect_and_remove_gtgdiscrepancy(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """ """

    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total numers of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    gtg_size = min([(len(GENE_IDENTIFIER_SET)-1)/2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count,gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG,target,identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG,target,identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of nodes
    bitscore_ordered_nodes = []
    for (tNode,iNode),wt in btGTG.weights.iteritems():
        if tNode==target: bitscore_ordered_nodes.append( ( wt, iNode ) )
    bitscore_ordered_nodes.sort() 
    #if verbose: print "btGTG::", bitscore_ordered_nodes

    while ntGTG.node_count() > gtg_size:
        # next line causes errors because OrganismGraph != OrganismStarGraph
        # this causes the target node in rare cases to be assigned as the weakest node
        # informant = btGTG.weakest_connected_node()
        (wt,informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose: print "btGGT.weakest_connected_node() ==", informant, btGTG.get_ordered_nodes()

    ############################################################################
    if verbose:
        print "ntGTG:", ntGTG.get_ordered_nodes(), 
        for node in ntGTG.get_ordered_nodes():
            if node == target: continue
            print "%1.2f" % ntGTG.weights[(target,node)],
        print ""
    ############################################################################

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(inwpcbgs,ntGTG)


    # detect inwpCBGs with strong discrepancy to this GTG
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,ntGTG)

    # merge both lists
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(gtgdiscrepancy_internal_inwpcbg_list)
        else:
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in [ str(gtgdiscrCBG) for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list ]:
                    gtgdiscrepancy_inwpcbg_list.append( inwpcbg )

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    correct_inwpcbg_list = []
    check_str_list = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        check_str_list.append( str(discrinwpCBG) )
    for inwpcbg in inwpcbgs:
        if str(inwpcbg) not in check_str_list:
            correct_inwpcbg_list.append( inwpcbg )

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY
    gtgdiscrepancy_pacbpkeys = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            # check if this pacbpkey is occuring in a non-removed inwpCBG
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps.keys():
                    is_occurring_in_correct_inwpcbg = True
                    break
            # if is_occurring_in_correct_inwpcbg, continue and do not delete
            if is_occurring_in_correct_inwpcbg:
                continue
            # store to gtgdiscrepancy_pacbpkeys when not stored already
            if pacbpkey not in gtgdiscrepancy_pacbpkeys:
                gtgdiscrepancy_pacbpkeys.append(pacbpkey)


    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        if key not in PCG.pacbps.keys():
            # !?!? TODO why not present in the PCG !?!?!
            # anyway, continue here to avoid KeyError
            # This PacbPORF was to be deleted rigth here,
            # so it is not an extreme disaster. But... scary ;-)
            continue
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # remove from main PCG
        _delete_pacbp(PCG,key)


    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG
def detect_and_remove_single_nonfirst_inwpcbg(inwpcbgs,PCG,GENE_IDENTIFIER_SET,
    verbose=False):
    """
    Allow deletion of a very shitty, single inwpCBG from the start of the list
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    firstInwpCBG = inwpcbgs[0]
    nextInwpCBG = inwpcbgs[1]

    firstNodeList = [ firstInwpCBG.get_organism_nodes(org)[0] for org in\
                firstInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    nextNodeList = [ nextInwpCBG.get_organism_nodes(org)[0] for org in\
                nextInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(firstNodeList).intersection(nextNodeList): return False

    ntdistdict    = firstInwpCBG.nt_spacing_between_codingblocks([nextInwpCBG])
    tcodedistdict = firstInwpCBG.tcode_spacing_between_codingblocks([nextInwpCBG])

    # make a long list of checks which should be True in case
    # firstInwpCBG is *NOT* the first exon of this gene structure
    check1 = nextInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             firstInwpCBG.count_orfs_labeled_as_annotated_exon() 
    check2 = nextInwpCBG.get_bitscore() > firstInwpCBG.get_bitscore() 
    check3 = len(nextNodeList) > len(firstNodeList)
    check4 = float(firstInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = nextInwpCBG.count_orfs_labeled_as_first_exon() >=\
             firstInwpCBG.count_orfs_labeled_as_first_exon()
    check8 = firstInwpCBG.count_orfs_labeled_as_annotated_exon() == 0 
    check9 = nextInwpCBG.get_average_upstream_methionine_pssm_score() >\
             firstInwpCBG.get_average_upstream_methionine_pssm_score() 

    checklist = [check1,check2,check3,check4,check5,check6,check7,check8,check9]

    ############################################################################
    if verbose or True: print "NonFirst inwpCBG check:", checklist
    ############################################################################
   
    if checklist.count(False) <= 1:
        nonfirstPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
        # place all PacbPORFs in the nonfirstPCG
        for (pacbpkey,nodeQ,nodeS), pacbporf in firstInwpCBG.pacbps.iteritems():
            # add to noncodingnongenePCG
            nonfirstPCG.add_node(nodeQ)
            nonfirstPCG.add_node(nodeS)
            nonfirstPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            nonfirstPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
            # remove from main PCG
            _delete_pacbp(PCG,(pacbpkey,nodeQ,nodeS))
        # return nonfirstPCG
        return nonfirstPCG
    else:
        return False
示例#8
0
def detect_and_remove_synteny(inwpcbgs,
                              PCG,
                              GENE_IDENTIFIER_SET,
                              verbose=True):
    """ """
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20

    observed_organism_subcombis = []
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)

    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2: continue
        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge(orgQ, orgS, wt=pacbporf.bitscore)

            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ, orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                    gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        gtg.set_edge_weight(orgS, org, wt=pacbporf.bitscore)
                    else:
                        gtg.add_edge(orgS, org, wt=pacbporf.bitscore)

        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
        min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node) for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)

        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis: continue

        # store to already tested organism subcombinations
        observed_organism_subcombis.append(gtg.get_ordered_nodes())

        # create a subPCG of these organisms
        subPCG = PacbpCollectionGraph(crossdata={},
                                      blastmatrix=PCG._blastmatrix)
        for (pacbpkey, nodeQ, nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ, orfQid), (orgS, orfSid) = nodeQ, nodeS
            if orgQ not in gtg.get_nodes(): continue
            if orgS not in gtg.get_nodes(): continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf

        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################

        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)

        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes(
                )
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################

    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if not node_set.difference(inwpCBG.node_set()):
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},
                                       blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return syntenicPCG
    return syntenicPCG
def detect_and_remove_single_nonfinal_inwpcbg(inwpcbgs,
                                              PCG,
                                              GENE_IDENTIFIER_SET,
                                              verbose=False):
    """
    Allow deletion of a very shitty, single inwpCBG from the end of the list
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    lastInwpCBG = inwpcbgs[-1]
    prevInwpCBG = inwpcbgs[-2]

    lastNodeList = [ lastInwpCBG.get_organism_nodes(org)[0] for org in\
                lastInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    prevNodeList = [ prevInwpCBG.get_organism_nodes(org)[0] for org in\
                prevInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(lastNodeList).intersection(prevNodeList): return False

    ntdistdict = prevInwpCBG.nt_spacing_between_codingblocks([lastInwpCBG])
    tcodedistdict = prevInwpCBG.tcode_spacing_between_codingblocks(
        [lastInwpCBG])

    check1 = prevInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             lastInwpCBG.count_orfs_labeled_as_annotated_exon()
    check2 = prevInwpCBG.get_bitscore() > lastInwpCBG.get_bitscore()
    check3 = len(prevNodeList) > len(lastNodeList)
    check4 = float(lastInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = prevInwpCBG.get_projected_tailing_stop_aa_difference() <\
             lastInwpCBG.get_projected_tailing_stop_aa_difference()
    check8 = prevInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()<\
             lastInwpCBG.get_projected_tailing_stop_nonaligned_aa_difference()

    checklist = [
        check1, check2, check3, check4, check5, check6, check7, check8
    ]

    ############################################################################
    if verbose: print "NonFinal inwpCBG check:", checklist
    ############################################################################

    if checklist.count(False) == 0:
        nonfinalPCG = PacbpCollectionGraph(crossdata={},
                                           blastmatrix=PCG._blastmatrix)
        # place all PacbPORFs in the nonfinalPCG
        for (pacbpkey, nodeQ,
             nodeS), pacbporf in lastInwpCBG.pacbps.iteritems():
            # add to noncodingnongenePCG
            nonfinalPCG.add_node(nodeQ)
            nonfinalPCG.add_node(nodeS)
            nonfinalPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            nonfinalPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
            # remove from main PCG
            _delete_pacbp(PCG, (pacbpkey, nodeQ, nodeS))
        # return nonfinalPCG
        return nonfinalPCG
    else:
        return False
def detect_and_remove_utrornonegene_inwpcbgs(inwpcbgs, PCG, verbose=True):
    """
    Move PacbPORFs of 5'/3' non-coding or non-gene inwpCBGs out of the PCG

    inwpCBGs assigned by assign_utrornongene5p_inwpcbgs() and
    assign_utrornongene3p_inwpcbgs() are taken out. A PacbPORF is only removed
    when it does not occur in any of the remaining (non-assigned) inwpCBGs.

    @type  inwpcbgs: list
    @param inwpcbgs: list of inwpCBGs; TSS sites are scanned on all of them

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PCG; removed PacbPORFs are deleted from it (in place)

    @type  verbose: Boolean
    @param verbose: currently not used by this function

    @rtype:  PacbpCollectionGraph or False
    @return: PCG holding the removed PacbPORFs, or False when the input is
             empty or no inwpCBG is assigned
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # make sure all Orfs have predicted TSS sites
    for inwpCBG in inwpcbgs:
        inwpCBG.scan_orfs_for_pssm_tss(min_pssm_score=TSS_MIN_PSSM_SCORE)

    # detect inwpCBGs which are most likely 5' and 3' non coding or non gene;
    # the 3' list is appended (in place) to the 5' list
    ncng_list = assign_utrornongene5p_inwpcbgs(inwpcbgs)
    ncng_3p_list = assign_utrornongene3p_inwpcbgs(inwpcbgs)
    ncng_list.extend(ncng_3p_list)

    # return False if no inwpcbgs are assigned
    if not ncng_list: return False

    # inwpCBGs are recognized by their string representation; gather the
    # pacbp keys of all inwpCBGs that are NOT assigned as ncng
    ncng_labels = set([str(ncnginwpCBG) for ncnginwpCBG in ncng_list])
    retained_pacbpkeys = set()
    for inwpcbg in inwpcbgs:
        if str(inwpcbg) not in ncng_labels:
            retained_pacbpkeys.update(inwpcbg.pacbps.keys())

    # get all pacbp keys belonging to noncoding / nongene inwpcbgs ONLY,
    # each key once and in order of first occurrence
    ncng_pacbpkeys = []
    stored_pacbpkeys = set()
    for ncnginwpCBG in ncng_list:
        for pacbpkey in ncnginwpCBG.pacbps.keys():
            # occurring in a non-removed inwpCBG -> do not delete
            if pacbpkey in retained_pacbpkeys:
                continue
            # stored already
            if pacbpkey in stored_pacbpkeys:
                continue
            stored_pacbpkeys.add(pacbpkey)
            ncng_pacbpkeys.append(pacbpkey)

    # place all ncng_pacbpkeys and PacbPORFs in the noncodingnongenePCG
    # and, at the same time, remove from the main PCG
    noncodingnongenePCG = PacbpCollectionGraph(crossdata={},
                                               blastmatrix=PCG._blastmatrix)
    for key in ncng_pacbpkeys:
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to noncodingnongenePCG
        noncodingnongenePCG.add_node(nodeQ)
        noncodingnongenePCG.add_node(nodeS)
        noncodingnongenePCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        noncodingnongenePCG.pacbps[key] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    return noncodingnongenePCG
def detect_and_remove_gtgdiscrepancy(inwpcbgs,
                                     PCG,
                                     GENE_IDENTIFIER_SET,
                                     verbose=True):
    """ """

    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0: return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # unigene informants are not taken into account here.
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total numers of informants
    # -- at most 8 informants
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    gtg_size = min([(len(GENE_IDENTIFIER_SET) - 1) / 2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count, gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG,
                                target,
                                identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG,
                                target,
                                identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph
    # make bitscore ordered list of nodes
    bitscore_ordered_nodes = []
    for (tNode, iNode), wt in btGTG.weights.iteritems():
        if tNode == target: bitscore_ordered_nodes.append((wt, iNode))
    bitscore_ordered_nodes.sort()
    #if verbose: print "btGTG::", bitscore_ordered_nodes

    while ntGTG.node_count() > gtg_size:
        # next line causes errors because OrganismGraph != OrganismStarGraph
        # this causes the target node in rare cases to be assigned as the weakest node
        # informant = btGTG.weakest_connected_node()
        (wt, informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose:
            print "btGGT.weakest_connected_node() ==", informant, btGTG.get_ordered_nodes(
            )

    ############################################################################
    if verbose:
        print "ntGTG:", ntGTG.get_ordered_nodes(),
        for node in ntGTG.get_ordered_nodes():
            if node == target: continue
            print "%1.2f" % ntGTG.weights[(target, node)],
        print ""
    ############################################################################

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(
        inwpcbgs, ntGTG)

    # detect inwpCBGs with strong discrepancy to this GTG
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(
        inwpcbgs, ntGTG)

    # merge both lists
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(
                gtgdiscrepancy_internal_inwpcbg_list)
        else:
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in [
                        str(gtgdiscrCBG)
                        for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list
                ]:
                    gtgdiscrepancy_inwpcbg_list.append(inwpcbg)

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    correct_inwpcbg_list = []
    check_str_list = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        check_str_list.append(str(discrinwpCBG))
    for inwpcbg in inwpcbgs:
        if str(inwpcbg) not in check_str_list:
            correct_inwpcbg_list.append(inwpcbg)

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY
    gtgdiscrepancy_pacbpkeys = []
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            # check if this pacbpkey is occuring in a non-removed inwpCBG
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps.keys():
                    is_occurring_in_correct_inwpcbg = True
                    break
            # if is_occurring_in_correct_inwpcbg, continue and do not delete
            if is_occurring_in_correct_inwpcbg:
                continue
            # store to gtgdiscrepancy_pacbpkeys when not stored already
            if pacbpkey not in gtgdiscrepancy_pacbpkeys:
                gtgdiscrepancy_pacbpkeys.append(pacbpkey)

    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},
                                             blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        if key not in PCG.pacbps.keys():
            # !?!? TODO why not present in the PCG !?!?!
            # anyway, continue here to avoid KeyError
            # This PacbPORF was to be deleted rigth here,
            # so it is not an extreme disaster. But... scary ;-)
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to gtgdiscrepancyPCG
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf

        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return gtgdiscrepancyPCG
    return gtgdiscrepancyPCG
def detect_and_remove_single_nonfirst_inwpcbg(inwpcbgs,
                                              PCG,
                                              GENE_IDENTIFIER_SET,
                                              verbose=False):
    """
    Allow deletion of a very shitty, single inwpCBG from the start of the list
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1: return False

    firstInwpCBG = inwpcbgs[0]
    nextInwpCBG = inwpcbgs[1]

    firstNodeList = [ firstInwpCBG.get_organism_nodes(org)[0] for org in\
                firstInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    nextNodeList = [ nextInwpCBG.get_organism_nodes(org)[0] for org in\
                nextInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(firstNodeList).intersection(nextNodeList): return False

    ntdistdict = firstInwpCBG.nt_spacing_between_codingblocks([nextInwpCBG])
    tcodedistdict = firstInwpCBG.tcode_spacing_between_codingblocks(
        [nextInwpCBG])

    # make a long list of checks which should be True in case
    # firstInwpCBG is *NOT* the first exon of this gene structure
    check1 = nextInwpCBG.count_orfs_labeled_as_annotated_exon() >\
             firstInwpCBG.count_orfs_labeled_as_annotated_exon()
    check2 = nextInwpCBG.get_bitscore() > firstInwpCBG.get_bitscore()
    check3 = len(nextNodeList) > len(firstNodeList)
    check4 = float(firstInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
             float(len(GENE_IDENTIFIER_SET)) <= 0.33
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
             MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
             TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = nextInwpCBG.count_orfs_labeled_as_first_exon() >=\
             firstInwpCBG.count_orfs_labeled_as_first_exon()
    check8 = firstInwpCBG.count_orfs_labeled_as_annotated_exon() == 0
    check9 = nextInwpCBG.get_average_upstream_methionine_pssm_score() >\
             firstInwpCBG.get_average_upstream_methionine_pssm_score()

    checklist = [
        check1, check2, check3, check4, check5, check6, check7, check8, check9
    ]

    ############################################################################
    if verbose or True: print "NonFirst inwpCBG check:", checklist
    ############################################################################

    if checklist.count(False) <= 1:
        nonfirstPCG = PacbpCollectionGraph(crossdata={},
                                           blastmatrix=PCG._blastmatrix)
        # place all PacbPORFs in the nonfirstPCG
        for (pacbpkey, nodeQ,
             nodeS), pacbporf in firstInwpCBG.pacbps.iteritems():
            # add to noncodingnongenePCG
            nonfirstPCG.add_node(nodeQ)
            nonfirstPCG.add_node(nodeS)
            nonfirstPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            nonfirstPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
            # remove from main PCG
            _delete_pacbp(PCG, (pacbpkey, nodeQ, nodeS))
        # return nonfirstPCG
        return nonfirstPCG
    else:
        return False
示例#13
0
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print[p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
示例#14
0
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength() / 2) * 3
    orfs = get_reverse_strand_orfsets(cbg,
                                      frame,
                                      min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[len(o.protein_sequence) for o in orfs[org].orfs]
        ########################################################################

    revpacbps = {}
    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(
                    alignment.title.replace(">", ""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength() / 2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = (orgQ, orfQ.protein_startPY)
                nodeS = (orgS, orfS.protein_startPY)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
示例#15
0
def blastanalysescbgjunction(gsg,prevCBG,nextCBG,
    omit_cbg_orfs = False,
    omit_non_cbg_orfs = False,
    extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
    omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
    verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org,orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf
        for org,orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,prevCBG,nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
            ).split("\n")
        )

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org,orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org,orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
               del(fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del(fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):",len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return [] 

    # check if all organisms are still covered
    orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return [] 

    # create !single! fasta database
    fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa"
    writeMultiFasta(fastadbmfa,fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ,orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ+"_orf_"+str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname,
                    extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS,_orfSid = alignment.title.replace(">","").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS,int(_orfSid))
                orfS  = orfs[nodeS]
               
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = ( orgQ, orfQ.id )
                nodeS = ( orgS, orfS.id )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return [] 

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]
示例#16
0
def detect_and_remove_synteny(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """ """
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20

    observed_organism_subcombis = []
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)

    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2: continue
        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge( orgQ, orgS, wt = pacbporf.bitscore )
    
            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org not in [orgQ,orgS] and org in GENE_IDENTIFIER_SET:
                    if gtg.has_edge( orgS, org ) and\
                    gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        gtg.set_edge_weight(orgS,org,wt = pacbporf.bitscore)
                    else:
                        gtg.add_edge( orgS, org, wt = pacbporf.bitscore )
    
        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
        min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node) for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)
    
        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis: continue
    
        # store to already tested organism subcombinations
        observed_organism_subcombis.append( gtg.get_ordered_nodes() )
    
        # create a subPCG of these organisms
        subPCG = PacbpCollectionGraph(crossdata={},
                    blastmatrix=PCG._blastmatrix)
        for (pacbpkey,nodeQ,nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
            if orgQ not in gtg.get_nodes(): continue
            if orgS not in gtg.get_nodes(): continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
    
        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################
    
        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)

        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes()
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################

    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if not node_set.difference(inwpCBG.node_set()):
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    # return syntenicPCG
    return syntenicPCG