Example #1
File: frameshift.py Project: IanReid/ABFGP
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of its Orfs) compared to this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """

    # get eligible lists of Orfs
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are eligible...
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [orf.id for orf in orfs[org].orfs],
            print [str(orf) for orf in orfs[org].orfs]
        ########################################################################

    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blastall_seq2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">", "").split(" ")[0].replace(
                    "_", "")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ,
                                                       node2=nodeS)[0]

                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose: print "NEW:", pacbporf
                    ############################################################

                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        ############################################################
        if verbose: print "org_set_size() PCG < CBG"
        ############################################################
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList:
            print fscbg
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
        return None
    elif cbgList and not cbgList[0].node_set().symmetric_difference(
            cbg.node_set()):
        # best CBG is not frameshifted, but CBG itself
        return None
    else:
        # score the difference between the frameshifted and current CBG
        score_cbg = cbg.total_weight() * cbg.omsr_identityscore()
        score_fscbg = (cbgList[0].total_weight() *
                       cbgList[0].omsr_identityscore())
        # check overlap between the frameshifted and current CBG
        a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg)

        ########################################################################
        if verbose:
            print "CBG", cbg
            cbg.printmultiplealignment()
            for fscbg in cbgList:
                print "fsCBG:", fscbg
                fscbg.printmultiplealignment()
        ########################################################################

        if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((1, 0, 0),
                                                          (0, 0, 1)):
            # CBG and frameshifted CBG do not share a single AA overlap...
            # This does not represent a frameshifted CBG as we searched for
            return False
        elif score_fscbg > score_cbg:
            # return the highest scoring, frameshifted CBG
            return cbgList[0]
        else:
            # no, still not convinced that this is a frameshifted CBG
            return False
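
A minimal, hypothetical usage sketch for get_frameshifted_cbg: the cbg (CodingBlockGraph) and input dict are assumed to come from the surrounding ABFGP pipeline, and the function is assumed to be defined in or imported into the calling module.

# hypothetical caller; None and False both mean "keep the current CBG"
fscbg = get_frameshifted_cbg(cbg, input, verbose=False)
if fscbg:
    # a higher scoring, frameshifted variant was found -> replace the CBG
    cbg = fscbg
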
Example #2
def get_reverse_cbg(cbg,frame,verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength()/2)*3
    orfs = get_reverse_strand_orfsets(cbg,frame,min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org,cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(),fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs ]
        ########################################################################

    revpacbps = {}
    for orgQ,orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blastall_seq2db
            blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence,
                        dbname="./"+blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">",""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength()/2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = ( orgQ, orfQ.protein_startPY )
                nodeS = ( orgS, orfS.protein_startPY )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=cbg.node_count()-1 , max_missing_edges=0 )
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
                cbgList.codingblockgraphs[0])
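
A hedged sketch of scanning all three reverse-strand frames with get_reverse_cbg and keeping the best hit; that the returned ReversecomplementCodingBlockGraph exposes total_weight() like a forward CBG does is an assumption, not confirmed by the code above.

# hypothetical frame scan; collect reverse-strand CBGs for frames 0, 1, 2
revcbgs = []
for frame in [0, 1, 2]:
    revcbg = get_reverse_cbg(cbg, frame, verbose=False)
    if revcbg:
        revcbgs.append(revcbg)
if revcbgs:
    # assumes total_weight() is available on the reversecomplement graph
    revcbgs.sort(key=lambda g: g.total_weight(), reverse=True)
    best_revcbg = revcbgs[0]
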
Example #3
def find_intermediary_codingblockgraph_with_tinyexon(graphL,graphR,input={},similaritymatrix=None,min_bitscore_ratio=0.3):
    """
    """
    tinyexon_crossdata = {}
    tinyexons_seen = 0

    for org in graphL.organism_set():
        theOrfL = graphL.get_orfs_of_graph(organism=org)[0]
        theOrfR = graphR.get_orfs_of_graph(organism=org)[0]
        # continue if identical orfs
        # TODO: maybe check as well for spanning ranges?
        # TODO: in theory, a tinyorf can exist on this orf as well...
        if theOrfL.id == theOrfR.id: continue
        msrL = graphL.minimal_spanning_range(organism=org)
        msrR = graphR.minimal_spanning_range(organism=org)

        # check for eligible donors on orfL and acceptors on orfR
        if org in graphL._splicedonorgraph.organism_set() and\
        org in graphR._spliceacceptorgraph.organism_set():
            eligable_donors    = graphL._splicedonorgraph.get_organism_objects(org)
            eligable_acceptors = graphR._spliceacceptorgraph.get_organism_objects(org)
            orflist            = input[org]['orfs'].orfs

            # search for tinyexons
            tinyexonlist = bridge_two_pacbporfs_by_tinyexon(theOrfL,theOrfR,
                    preceding_donor_sites= eligable_donors,
                    subsequent_acceptor_sites= eligable_acceptors,
                    orflist=orflist
                    )

            doubletinyexons = bridge_two_pacbporfs_by_two_tinyexons(theOrfL,theOrfR,
                    preceding_donor_sites= eligable_donors,
                    subsequent_acceptor_sites= eligable_acceptors,
                    orflist=orflist
                    )

        else:
            # no donors and acceptors on both orfs!
            return []


        # Order the tinyexons with respect to which orf they are located on.
        # for now, IGNORE tinyexons on both the left and the right orf itself!!
        orf2tinyexons = {}
        for tinyexon in tinyexonlist:
            if tinyexon.orf.id in [ theOrfL.id, theOrfR.id ]:
                continue
            if orf2tinyexons.has_key(tinyexon.orf.id):
                orf2tinyexons[tinyexon.orf.id].append(tinyexon)
            else:
                orf2tinyexons[tinyexon.orf.id] = [ tinyexon ]

        # loop over the unique orfids on which tinyexons are predicted
        for orfid, telist in orf2tinyexons.iteritems():
            # loop over all other organisms (except the organism itself)
            for otherorg in graphL.organism_set():
                if otherorg == org: continue
                orgkey = [org,otherorg]
                orgkey.sort()
                _orgkey_reversed = False
                if orgkey != [org,otherorg]: _orgkey_reversed = True
                orgkey = tuple(orgkey)
                if not tinyexon_crossdata.has_key(orgkey):
                    tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }
                orfL = graphL.get_orfs_of_graph(organism=otherorg)[0]
                orfR = graphR.get_orfs_of_graph(organism=otherorg)[0]

                # main list for all similarities on this orfid
                similaritiesL = []
                similaritiesR = []
                for tinyexon in telist:
                    # get protein query sequence from tinyorf
                    query_dna    = tinyexon.orf.inputgenomicsequence[tinyexon.acceptor.pos:tinyexon.donor.pos]
                    query        = dna2proteinbyframe(query_dna, (3 - tinyexon.acceptor.phase) % 3 )
                    query_aa_pos = tinyexon.acceptor.pos / 3

                    _similaritiesL = similaritymatrix.scansbjct(query,orfL.protein_sequence,min_bitscore_ratio=min_bitscore_ratio)
                    if orfL.id == orfR.id:
                        _similaritiesR = []
                    else:
                        _similaritiesR = similaritymatrix.scansbjct(query,orfR.protein_sequence,min_bitscore_ratio=min_bitscore_ratio)
                    # Append to all similarities on this orfid; append the tinyexon itself too
                    # in order to trace the similarity back to a specific tinyexon.
                    # This is needed because there can be >1 tinyexon on the same orf...
                    _similaritiesL = [ (_data,tinyexon) for _data in _similaritiesL ]
                    _similaritiesR = [ (_data,tinyexon) for _data in _similaritiesR ]
                    similaritiesL.extend(_similaritiesL)
                    similaritiesR.extend(_similaritiesR)

                # re-order the similarities because they can contain data from 2 tinyexons (on the same orf)
                # ordering is performed on ``ratio * bitscore``
                # this - kind of - e-value calculation gives preference to longer matches
                similaritiesL = _order_similarities(similaritiesL)
                similaritiesR = _order_similarities(similaritiesR)

                # Now make pacbporfs of only the BEST tinyexon and its
                # similarity on another organism
                TAKE_BEST_SIMILARITIES = 2
                for ( ( ratio, sbjct_pos, q_seq, match, s_seq, bitscore), tinyexon ) in similaritiesL[0:TAKE_BEST_SIMILARITIES]:
                    sbjct_aa_pos = sbjct_pos+orfL.protein_startPY
                    query_aa_pos = tinyexon.acceptor.pos / 3
                    if _orgkey_reversed:
                        ###print s_seq, "'%s'" % match, ratio, orfL.id
                        pacbpkey = (bitscore, len(query), orfL.id, tinyexon.orf.id )
                        pacbp    = pacb.PacbP(input=(s_seq,q_seq,sbjct_aa_pos,query_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,orfL,tinyexon.orf)
                    else:
                        ###print q_seq, "'%s'" % match, ratio, orfL.id
                        pacbpkey = (bitscore, len(query), tinyexon.orf.id, orfL.id )
                        pacbp    = pacb.PacbP(input=(q_seq,s_seq,query_aa_pos,sbjct_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,tinyexon.orf,orfL)

                    tinyexons_seen+=1
                    pacbporf.extend_pacbporf_after_stops()
                    tinyexon_crossdata[orgkey]['accepted_pacbs'][pacbpkey] = pacbporf

                for ( ( ratio, sbjct_pos, q_seq, match, s_seq, bitscore), tinyexon ) in similaritiesR[0:TAKE_BEST_SIMILARITIES]:
                    sbjct_aa_pos = sbjct_pos+orfR.protein_startPY
                    query_aa_pos = tinyexon.acceptor.pos / 3
                    if _orgkey_reversed:
                        pacbpkey = (bitscore, len(query), orfR.id, tinyexon.orf.id )
                        pacbp    = pacb.PacbP(input=(s_seq,q_seq,sbjct_aa_pos,query_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,orfR,tinyexon.orf)
                    else:
                        pacbpkey = (bitscore, len(query), tinyexon.orf.id, orfR.id )
                        pacbp    = pacb.PacbP(input=(q_seq,s_seq,query_aa_pos,sbjct_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,tinyexon.orf,orfR)

                    tinyexons_seen+=1
                    pacbporf.extend_pacbporf_after_stops()
                    tinyexon_crossdata[orgkey]['accepted_pacbs'][pacbpkey] = pacbporf


    if not tinyexons_seen:
        return []
    else:
        # add the nodes/edges from the input graphs as well
        for ( (a,b,c,d),n1,n2 ) in graphL.pacbps.keys():
            orgkey   = ( n1[0], n2[0] )
            pacbpdna = graphL.pacbps[( (a,b,c,d),n1,n2 )]
            if not tinyexon_crossdata.has_key(orgkey):
                tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }
            tinyexon_crossdata[orgkey]['accepted_pacbs'][(a,b,c,d)] = pacbpdna

        for ( (a,b,c,d),n1,n2 ) in graphR.pacbps.keys():
            orgkey   = ( n1[0], n2[0] )
            pacbpdna = graphR.pacbps[( (a,b,c,d),n1,n2 )]
            if not tinyexon_crossdata.has_key(orgkey):
                tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }
            tinyexon_crossdata[orgkey]['accepted_pacbs'][(a,b,c,d)] = pacbpdna
    

    # make graph, remove too lowly connected nodes and split into complete graphs
    tinyexonsg = create_pacbpcollectiongraph_from_crossdata(tinyexon_crossdata)
    tinyexonsg.remove_low_connectivity_nodes(min_connectivity=2)
    splitted_tinyexongraphs = tinyexonsg.find_fully_connected_subgraphs(
                edges=4,
                max_missing_edges=0 )

    # now remove the graphs that are graphL and graphR ;-)
    graphLnodes = graphL.get_nodes()
    graphLnodes.sort()
    graphRnodes = graphR.get_nodes()
    graphRnodes.sort()
    for pos in range(0,len(splitted_tinyexongraphs)):
        tegNodes = splitted_tinyexongraphs[pos].get_nodes()
        tegNodes.sort()
        if tegNodes == graphLnodes:
            splitted_tinyexongraphs.pop(pos)
            break
    for pos in range(0,len(splitted_tinyexongraphs)):
        tegNodes = splitted_tinyexongraphs[pos].get_nodes()
        tegNodes.sort()
        if tegNodes == graphRnodes:
            splitted_tinyexongraphs.pop(pos)
            break

    # make ListOfCodingBlockGraphs
    cbgList = ListOfCodingBlockGraphs(splitted_tinyexongraphs,
            input=input,
            crossdata=tinyexon_crossdata
            )

    # do all what is needed to create K(s) CBGs of these
    cbgList.harvest_pacbps_from_crossdata()
    cbgList.split_codingblock_on_alternatives_in_pacbps_dict(
            filter_for_msr=True,
            filter_for_omsr=True,
            )
    # remove non-compatible CBGs
    cbgList.remove_incompatible_cbgs(
            minimal_node_count=len(input),
            minimal_edge_count=len(tinyexon_crossdata),
            filter_for_msr=True,
            filter_for_omsr=True
            )

    # get list of accepted TinyExonCbgs 
    accepted_tegs = cbgList.codingblockgraphs

    # and update weights by minimal spanning region
    for teg in accepted_tegs: teg.update_edge_weights_by_minimal_spanning_range()

    # and check if they can be placed IN BETWEEN graphL and graphR
    # TODO some prints
    final_graphs_with_tinyexons = []
    for teg in accepted_tegs:

        test_codingblock_order, rejected_graphs = make_consensus_genestructure_from_compatible_pacb_graphs(
                [graphL,graphR,teg],None)

        print "checking hypo TEG:", teg.get_ordered_nodes(), "of", len(accepted_tegs), "len of join", len(test_codingblock_order)


        #empty_input = {}
        #for org in teg.organism_set(): empty_input[org] = None
        #tmpGSG = GenestructureOfCodingBlockGraphs(empty_input)
        #tmpGSG.add_codingblocks([graphL,graphR,teg])
        #print "tinyexon tmp check:", len(test_codingblock_order), len(tmpGSG), teg.get_ordered_nodes()


        wt_after  = teg.total_weight()
        if len(test_codingblock_order) == 3:
            teg_nodes = teg.get_nodes()
            teg_nodes.sort()
            middle = test_codingblock_order[1]
            middle_nodes = middle.get_nodes()
            middle_nodes.sort()
            if middle_nodes == teg_nodes:
                # yahoo, this one is 100% okay!
                final_graphs_with_tinyexons.append( teg )

    if len(final_graphs_with_tinyexons)==1:
        print final_graphs_with_tinyexons[0].get_nodes()
        return final_graphs_with_tinyexons
    elif len(final_graphs_with_tinyexons)>1:
        print "### WARNING!!!! more than 1 tinyexon graph is found."
        print "### WARNING!!!! however, only single one is returned."
        print "### WARNING!!!! returning >1 can cause errors..."
        return [ final_graphs_with_tinyexons[0] ]
    else:
        return []
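
A hedged calling sketch for find_intermediary_codingblockgraph_with_tinyexon: graphL and graphR are assumed to be two neighbouring CodingBlockGraphs, input the usual per-organism dict with an 'orfs' entry, and similaritymatrix an object providing the scansbjct() method used inside the function.

# hypothetical call; an empty list means no tinyexon CBG could be
# placed in between graphL and graphR
tiny_cbgs = find_intermediary_codingblockgraph_with_tinyexon(
        graphL, graphR, input=input,
        similaritymatrix=similaritymatrix,
        min_bitscore_ratio=0.3)
if tiny_cbgs:
    # at most a single graph is returned (see the warnings above)
    tinyexon_cbg = tiny_cbgs[0]
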
Example #4
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs from the masked fasta dict depending on the flags:
    # omit_non_cbg_orfs drops Orfs that do not belong to prevCBG/nextCBG,
    # omit_cbg_orfs drops the Orfs that DO belong to prevCBG/nextCBG
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blastall_seq2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stored & kept,
    # whereas pacbpcol itself is split into CBGs (a step that
    # does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
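
A hedged sketch of running blastanalysescbgjunction on every junction of a genestructure object; that gsg keeps its ordered CBGs in a codingblockgraphs attribute is an assumption based on the ListOfCodingBlockGraphs usage above.

# hypothetical loop over consecutive CBG pairs in the genestructure;
# each junction yields zero or more candidate intermediate CBGs
for prevCBG, nextCBG in zip(gsg.codingblockgraphs,
                            gsg.codingblockgraphs[1:]):
    candidates = blastanalysescbgjunction(gsg, prevCBG, nextCBG,
                                          verbose=False)
    if candidates:
        print "junction candidates:", len(candidates)
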
Example #5
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength() / 2) * 3
    orfs = get_reverse_strand_orfsets(cbg,
                                      frame,
                                      min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs]
        ########################################################################

    revpacbps = {}
    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blastall_seq2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(
                    alignment.title.replace(">", ""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength() / 2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = (orgQ, orfQ.protein_startPY)
                nodeS = (orgS, orfS.protein_startPY)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
Example #6
File: frameshift.py Project: IanReid/ABFGP
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=cbg.node_count()-1 , max_missing_edges=0 )
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList: print fscbg 
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
Example #7
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1,gsg.EXACT_SG_NODE_COUNT-1-2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=gsg.node_count()-1 , max_missing_edges=max_missing_edges )
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList: cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)