示例#1
0
def get_reverse_cbg(cbg,frame,verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength()/2)*3
    orfs = get_reverse_strand_orfsets(cbg,frame,min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org,cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(),fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs ]
        ########################################################################

    revpacbps = {}
    for orgQ,orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence,
                        dbname="./"+blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">",""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength()/2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = ( orgQ, orfQ.protein_startPY )
                nodeS = ( orgS, orfS.protein_startPY )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=cbg.node_count()-1 , max_missing_edges=0 )
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
                cbgList.codingblockgraphs[0])
示例#2
0
def _blastorfset2blastdb(geneQ,
                         geneS,
                         blastdbfname,
                         input,
                         crossdata,
                         GSgraph,
                         blastoptions=None,
                         elegiable_orfsQ=[],
                         elegiable_orfsS_ids=[],
                         logging=False):
    """
    """
    hitcnt = 0
    for orfQ in elegiable_orfsQ:
        # check if protein sequence present in Orf object
        # in obscure cases of unigenes, no protein sequence present!
        if not orfQ.protein_sequence: continue

        # make unique node identifier and blast header
        nodeQ = (geneQ, orfQ.id)
        header = "%s_orf_%s" % (geneQ, orfQ.id)

        # do the blastp!
        blastrec = blastall_seq2db(
            header,
            orfQ.protein_sequence,
            dbname=blastdbfname,
            extra_blastp_params=blastoptions.extra_blastp_params)

        # check if blast failed (then, blastrec == False)
        if not blastrec: continue

        # check if there are any hits/hsps!
        if len(blastrec.alignments) == 0:
            # no hits; continue
            continue

        for alignment in blastrec.alignments:
            # get back orfpointerB from the SBJCT and create nodeS
            _parts = alignment.title.split("_")
            geneS = "_".join(_parts[0:-2]).replace('>', '')
            _orfpointerS = int(_parts[-1])
            nodeS = (geneS, _orfpointerS)

            # ignore hit if nodeS orfid not occurring in the NON-empty list elegiable_orfsS_ids
            if elegiable_orfsS_ids and _orfpointerS not in elegiable_orfsS_ids:
                continue

            # get the Orf object of this sbjct sequence
            orfS = input[geneS]['orfs'].get_orf_by_id(_orfpointerS)

            # loop over the HSPs
            for hsp in alignment.hsps:

                # If hits are really tiny (happens in case of BLOSUM45 matrix),
                # discard them directly before precious time is lost...
                if len(hsp.query
                       ) <= blastoptions.BLASTP_DIRECTLY_IGNORE_TINY_HITS:
                    continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY
                hsp.query_end = hsp.query_end + orfQ.protein_startPY
                hsp.sbjct_end = hsp.sbjct_end + orfS.protein_startPY

                # VERY exceptional case: HSP starts or ends with a gap
                # I expect this is an error in Blastp ....
                strip_exterior_gaps(hsp)

                if hsp.query.find(" ") > 0:
                    # VERY exceptional case: erroneously NCBI parsed HSP:
                    # Score 8 (7 bits), expectation 1.7e+01, alignment length 41
                    # Query:    1622 STHTYDAC                                                TRCI----PFVDTGHKHENPTEALLDSTA 1654
                    #                TR      P  +  H +  P+++   S++
                    # Sbjct:     489 TEHIYLHT                                                TRSTWPPKPPTNASHANTKPSKSHHRSSS 525
                    if len(hsp.query.split(" ")[-1]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[-1]
                        hsp.sbjct = hsp.sbjct.split(" ")[-1]
                    elif len(hsp.query.split(" ")[0]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[0]
                        hsp.sbjct = hsp.sbjct.split(" ")[0]
                    elif len(hsp.query) == len(hsp.match):
                        # spaces in both query/match/sbjct
                        while hsp.query.find(" ") > 0:
                            pos = hsp.query.find(" ")
                            if hsp.sbjct[pos] == " " and hsp.match[pos] == " ":
                                hsp.query = hsp.query[0:pos] + hsp.query[pos +
                                                                         1:]
                                hsp.match = hsp.match[0:pos] + hsp.match[pos +
                                                                         1:]
                                hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos +
                                                                         1:]
                            else:
                                # HPS is not repairable -> quit trying
                                break
                    elif len(hsp.query) == len(hsp.sbjct):
                        while hsp.query.find(" ") > 0:
                            pos = hsp.query.find(" ")
                            hsp.query = hsp.query[0:pos] + hsp.query[pos + 1:]
                            hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos + 1:]
                        # recreate alignment match string is done upon
                        # creation of PacbP object

                    else:
                        pass

                # VERY exceptional case: HSP starts or ends with a gap
                # I expect this is an error in Blastp ....
                strip_exterior_gaps(hsp)
                try:
                    pacbp = pacb.PacbP(blastp_hsp=hsp,
                                       MATRIX=blastoptions.MATRIX)
                except:
                    # VERY exceptional miscelaneous cases: erroneously NCBI parsed HSP:
                    print hsp
                    print "'%s' X" % hsp.query, len(
                        hsp.query), hsp.query_start, hsp.query_end
                    print "'%s' X" % hsp.match, len(hsp.match)
                    print "'%s' X" % hsp.sbjct, len(
                        hsp.sbjct), hsp.sbjct_start, hsp.sbjct_end
                    pacbp = pacb.PacbP(blastp_hsp=hsp,
                                       MATRIX=blastoptions.MATRIX)

                # make pacbp of this hsp
                pacbp = pacb.PacbP(blastp_hsp=hsp, MATRIX=blastoptions.MATRIX)

                # if logging is requested for, print this pacbp to STDOUT
                if logging:
                    print ">>> Q", nodeQ, orfQ.tcode_symbolic(
                    ), "S", nodeS, orfS.tcode_symbolic(
                    ), pacbp, blastoptions.MATRIX.name, hsp.expect, hsp.bits
                    print ">>>", blastoptions.extra_blastp_params
                    if pacbp.length > 100:
                        print pacbp.query[0:40] + '.' * 7 + str(
                            pacbp.length - 80) + '.' * 7 + pacbp.query[-40:]
                        print pacbp.match[0:40] + '.' * 7 + str(
                            pacbp.length - 80) + '.' * 7 + pacbp.match[-40:]
                        print pacbp.sbjct[0:40] + '.' * 7 + str(
                            pacbp.length - 80) + '.' * 7 + pacbp.sbjct[-40:]
                    else:
                        print pacbp.query
                        print pacbp.match
                        print pacbp.sbjct

                # blastoptions.BLASTP_HSP_MINIMAL_LENGTH represents the minimal
                # length of the aligned part. (To) short pacbp's are abandoned
                if pacbp.length < blastoptions.BLASTP_HSP_MINIMAL_LENGTH:
                    if pacbp.identityscore == float(pacbp.length):
                        # escape for 100% identical tiny pacbps
                        pass
                    elif pacbp.identity + pacbp.similarity == pacbp.length:
                        # escape for 100% similar tiny pacbps
                        pass
                    else:
                        #  pacbp is to small. Discard!
                        if logging: print "to small..."
                        continue

                # check if the pacbp is not conflicting with the currect GSG graph
                # if so, ignore now because it will not yield a proper edge in an (accepted) CBG!
                if GSgraph and len(
                        GSgraph
                ) and GSgraph.is_pacbp_conflicting_with_genestructure(
                        pacbp, orgQ=geneQ, orgS=geneS):
                    ###print "GSGconflict!", nodeQ,nodeS, GSgraph.is_pacbp_conflicting_with_genestructure(pacbp,orgQ=geneQ,orgS=geneS), len(pacbp)
                    continue

                # here we have a potentially accepted pacbp.
                # make a/the unique key of this pacbp
                key = (pacbp.bits, pacbp.length, orfQ.id, _orfpointerS)

                # check for evalue criterion
                if (blastoptions.BLASTP_HSP_MAXIMAL_EXPECT
                        or blastoptions.BLASTP_HSP_MAXIMAL_EXPECT == 0.0
                    ) and hsp.expect > blastoptions.BLASTP_HSP_MAXIMAL_EXPECT:
                    # pacbp is long enough but has a to high evalue
                    crossdata[(geneQ, geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging: print "to low bitscore or expect"
                    continue

                # check for bitscore criterion
                if (blastoptions.BLASTP_HSP_MINIMAL_BITS
                        or blastoptions.BLASTP_HSP_MINIMAL_BITS == 0
                    ) and pacbp.bits < blastoptions.BLASTP_HSP_MINIMAL_BITS:
                    # pacbp is long enough but has a to low bitscore
                    crossdata[(geneQ, geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging: print "to low bitscore or expect"
                    continue

                # !!Hurray!! an accepted pacbp. Store to crossdata
                # store it to the 'accepted_pacbs' dict of crossdata
                crossdata[(geneQ, geneS)]['accepted_pacbs'][key] = pacbp
                hitcnt += 1
                if logging: print "ACCEPTED"
                # done -> check next orf!

    # return the filled crossdata structure
    return crossdata, hitcnt
示例#3
0
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of if Orfs) compared to this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """

    # get elegiable lists of Orfs
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are elgiable...
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[orf.id for orf in orfs[org].orfs],
            print[str(orf) for orf in orfs[org].orfs]
        ########################################################################

    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">", "").split(" ")[0].replace(
                    "_", "")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ,
                                                       node2=nodeS)[0]

                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose: print "NEW:", pacbporf
                    ############################################################

                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        ############################################################
        if verbose: print "org_set_size() PCG < CBG"
        ############################################################
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList:
            print fscbg
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
        return None
    elif cbgList and not cbgList[0].node_set().symmetric_difference(
            cbg.node_set()):
        # best CBG is not frameshifted, but CBG itself
        return None
    else:
        # score the difference between the frameshifted and current CBG
        score_cbg = cbg.total_weight() * cbg.omsr_identityscore()
        score_fscbg = cbgList[0].total_weight(
        ) * cbgList[0].omsr_identityscore()
        # check overlap between the frameshifted and current CBG
        a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg)

        ########################################################################
        if verbose:
            print "CBG", cbg
            cbg.printmultiplealignment()
            for fscbg in cbgList:
                print "fsCBG:", fscbg
                fscbg.printmultiplealignment()
        ########################################################################

        if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((0, 0, 1),
                                                          (1, 0, 0)):
            # CBG and frameshifted CBG do not share a single AA overlap...
            # This does not represent a frameshifted CBG as we searched for
            return False
        elif score_fscbg > score_cbg:
            # return the highest scoring, frameshifted CBG
            return cbgList[0]
        else:
            # no, still not convinced that this is a frameshifted CBG
            return False
示例#4
0
def _blastorfset2blastdb(geneQ,geneS,blastdbfname,input,crossdata,GSgraph,
    blastoptions = None,
    elegiable_orfsQ=[],
    elegiable_orfsS_ids=[],
    logging=False):
    """
    """
    hitcnt = 0
    for orfQ in elegiable_orfsQ:
        # check if protein sequence present in Orf object
        # in obscure cases of unigenes, no protein sequence present!
        if not orfQ.protein_sequence: continue

        # make unique node identifier and blast header
        nodeQ = (geneQ,orfQ.id)
        header = "%s_orf_%s" % (geneQ,orfQ.id)

        # do the blastp!
        blastrec = blastall_seq2db(header, orfQ.protein_sequence,
                dbname=blastdbfname,
                extra_blastp_params=blastoptions.extra_blastp_params )

        # check if blast failed (then, blastrec == False)
        if not blastrec: continue
    
        # check if there are any hits/hsps!
        if len(blastrec.alignments) == 0:
            # no hits; continue
            continue
    
        for alignment in blastrec.alignments:
            # get back orfpointerB from the SBJCT and create nodeS
            _parts = alignment.title.split("_")
            geneS = "_".join(_parts[0:-2]).replace('>','')
            _orfpointerS = int(_parts[-1])
            nodeS = (geneS,_orfpointerS)

            # ignore hit if nodeS orfid not occurring in the NON-empty list elegiable_orfsS_ids
            if elegiable_orfsS_ids and _orfpointerS not in elegiable_orfsS_ids:
                continue

            # get the Orf object of this sbjct sequence
            orfS = input[geneS]['orfs'].get_orf_by_id(_orfpointerS)

            # loop over the HSPs
            for hsp in alignment.hsps:

                # If hits are really tiny (happens in case of BLOSUM45 matrix),
                # discard them directly before precious time is lost...
                if len(hsp.query) <= blastoptions.BLASTP_DIRECTLY_IGNORE_TINY_HITS:
                    continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY
                hsp.query_end = hsp.query_end + orfQ.protein_startPY
                hsp.sbjct_end = hsp.sbjct_end + orfS.protein_startPY

                # VERY exceptional case: HSP starts or ends with a gap
                # I expect this is an error in Blastp .... 
                strip_exterior_gaps(hsp)

                if hsp.query.find(" ") > 0:
                    # VERY exceptional case: erroneously NCBI parsed HSP:
                    # Score 8 (7 bits), expectation 1.7e+01, alignment length 41
                    # Query:    1622 STHTYDAC                                                TRCI----PFVDTGHKHENPTEALLDSTA 1654
                    #                TR      P  +  H +  P+++   S++
                    # Sbjct:     489 TEHIYLHT                                                TRSTWPPKPPTNASHANTKPSKSHHRSSS 525
                    if len(hsp.query.split(" ")[-1]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[-1]
                        hsp.sbjct = hsp.sbjct.split(" ")[-1]
                    elif len(hsp.query.split(" ")[0]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[0]
                        hsp.sbjct = hsp.sbjct.split(" ")[0]
                    elif len(hsp.query) == len(hsp.match):
                        # spaces in both query/match/sbjct
                        while hsp.query.find(" ") > 0:
                            pos =  hsp.query.find(" ")
                            if hsp.sbjct[pos] == " " and hsp.match[pos] == " ":
                                hsp.query = hsp.query[0:pos] + hsp.query[pos+1:]
                                hsp.match = hsp.match[0:pos] + hsp.match[pos+1:]
                                hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos+1:]
                            else:
                                # HPS is not repairable -> quit trying
                                break
                    elif len(hsp.query) == len(hsp.sbjct):
                        while hsp.query.find(" ") > 0:
                            pos =  hsp.query.find(" ")
                            hsp.query = hsp.query[0:pos] + hsp.query[pos+1:]
                            hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos+1:]
                        # recreate alignment match string is done upon
                        # creation of PacbP object

                    else:
                        pass

                # VERY exceptional case: HSP starts or ends with a gap
                # I expect this is an error in Blastp .... 
                strip_exterior_gaps(hsp)
                try:
                    pacbp = pacb.PacbP(blastp_hsp=hsp,MATRIX=blastoptions.MATRIX)
                except:
                    # VERY exceptional miscelaneous cases: erroneously NCBI parsed HSP:
                    print hsp
                    print "'%s' X" % hsp.query, len(hsp.query), hsp.query_start, hsp.query_end
                    print "'%s' X" % hsp.match, len(hsp.match)
                    print "'%s' X" % hsp.sbjct, len(hsp.sbjct), hsp.sbjct_start, hsp.sbjct_end
                    pacbp = pacb.PacbP(blastp_hsp=hsp,MATRIX=blastoptions.MATRIX)


                # make pacbp of this hsp
                pacbp = pacb.PacbP(blastp_hsp=hsp,MATRIX=blastoptions.MATRIX)

                # if logging is requested for, print this pacbp to STDOUT
                if logging:
                    print ">>> Q", nodeQ, orfQ.tcode_symbolic(), "S", nodeS, orfS.tcode_symbolic(), pacbp, blastoptions.MATRIX.name, hsp.expect, hsp.bits
                    print ">>>", blastoptions.extra_blastp_params
                    if pacbp.length > 100:
                        print pacbp.query[0:40]+'.'*7+str(pacbp.length-80)+'.'*7+pacbp.query[-40:]
                        print pacbp.match[0:40]+'.'*7+str(pacbp.length-80)+'.'*7+pacbp.match[-40:]
                        print pacbp.sbjct[0:40]+'.'*7+str(pacbp.length-80)+'.'*7+pacbp.sbjct[-40:]
                    else:
                        print pacbp.query
                        print pacbp.match
                        print pacbp.sbjct

                # blastoptions.BLASTP_HSP_MINIMAL_LENGTH represents the minimal
                # length of the aligned part. (To) short pacbp's are abandoned
                if pacbp.length < blastoptions.BLASTP_HSP_MINIMAL_LENGTH:
                    if pacbp.identityscore == float(pacbp.length):
                        # escape for 100% identical tiny pacbps
                        pass
                    elif pacbp.identity + pacbp.similarity == pacbp.length:
                        # escape for 100% similar tiny pacbps
                        pass
                    else:
                        #  pacbp is to small. Discard!
                        if logging: print "to small..."
                        continue

                # check if the pacbp is not conflicting with the currect GSG graph
                # if so, ignore now because it will not yield a proper edge in an (accepted) CBG!
                if GSgraph and len(GSgraph) and GSgraph.is_pacbp_conflicting_with_genestructure(pacbp,orgQ=geneQ,orgS=geneS):
                    ###print "GSGconflict!", nodeQ,nodeS, GSgraph.is_pacbp_conflicting_with_genestructure(pacbp,orgQ=geneQ,orgS=geneS), len(pacbp)
                    continue

                # here we have a potentially accepted pacbp.
                # make a/the unique key of this pacbp
                key = (pacbp.bits, pacbp.length, orfQ.id,_orfpointerS)

                # check for evalue criterion
                if (blastoptions.BLASTP_HSP_MAXIMAL_EXPECT or blastoptions.BLASTP_HSP_MAXIMAL_EXPECT==0.0) and hsp.expect > blastoptions.BLASTP_HSP_MAXIMAL_EXPECT:
                    # pacbp is long enough but has a to high evalue
                    crossdata[(geneQ,geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging: print "to low bitscore or expect"
                    continue

                # check for bitscore criterion
                if (blastoptions.BLASTP_HSP_MINIMAL_BITS or blastoptions.BLASTP_HSP_MINIMAL_BITS==0) and pacbp.bits < blastoptions.BLASTP_HSP_MINIMAL_BITS:
                    # pacbp is long enough but has a to low bitscore
                    crossdata[(geneQ,geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging: print "to low bitscore or expect"
                    continue

                # !!Hurray!! an accepted pacbp. Store to crossdata
                # store it to the 'accepted_pacbs' dict of crossdata
                crossdata[(geneQ,geneS)]['accepted_pacbs'][key] = pacbp
                hitcnt+=1
                if logging: print "ACCEPTED"
                # done -> check next orf!

    # return the filled crossdata structure
    return crossdata, hitcnt
示例#5
0
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print[p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
示例#6
0
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength() / 2) * 3
    orfs = get_reverse_strand_orfsets(cbg,
                                      frame,
                                      min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[len(o.protein_sequence) for o in orfs[org].orfs]
        ########################################################################

    revpacbps = {}
    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(
                    alignment.title.replace(">", ""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength() / 2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = (orgQ, orfQ.protein_startPY)
                nodeS = (orgS, orfS.protein_startPY)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
示例#7
0
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [ orf.id for orf in orfs[org].orfs ],
            print [ str(orf) for orf in orfs[org].orfs ]
        ########################################################################

    for orgQ,orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence,
                        dbname="./"+blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">","").split(" ")[0].replace("_","")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = ( orgQ, orfQ.id )
                nodeS = ( orgS, orfS.id )
                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ,node2=nodeS)[0]

                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]
示例#8
0
def blastanalysescbgjunction(gsg,prevCBG,nextCBG,
    omit_cbg_orfs = False,
    omit_non_cbg_orfs = False,
    extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
    omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
    verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org,orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf
        for org,orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,prevCBG,nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
            ).split("\n")
        )

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org,orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org,orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
               del(fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del(fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):",len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return [] 

    # check if all organisms are still covered
    orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return [] 

    # create !single! fasta database
    fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa"
    writeMultiFasta(fastadbmfa,fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ,orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ+"_orf_"+str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname,
                    extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS,_orfSid = alignment.title.replace(">","").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS,int(_orfSid))
                orfS  = orfs[nodeS]
               
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = ( orgQ, orfQ.id )
                nodeS = ( orgS, orfS.id )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return [] 

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]