Пример #1
0
def get_reverse_cbg(cbg,frame,verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength()/2)*3
    orfs = get_reverse_strand_orfsets(cbg,frame,min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org,cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(),fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs ]
        ########################################################################

    revpacbps = {}
    for orgQ,orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence,
                        dbname="./"+blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">",""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength()/2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = ( orgQ, orfQ.protein_startPY )
                nodeS = ( orgS, orfS.protein_startPY )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=cbg.node_count()-1 , max_missing_edges=0 )
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
                cbgList.codingblockgraphs[0])
Пример #2
0
def createblastdbs(input,
                   GSG,
                   OPTIONS,
                   dbfraction=None,
                   organism=None,
                   acceptorfids=[],
                   rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas thar are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (with attribute 'abinitio')

    @type  dbfraction: string
    @param dbfraction: None, 'all', 'GSGupstream', 'GSGcentral', 'GSGdownstream', 'annotation'

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!
    """
    seqsindb = {}
    for org in input.keys():
        # if organism is given, do only this one
        if organism and org != organism: continue
        # acceptorfids anc rejectorfids only valid in combi with `organism`
        if not organism: acceptorfids, rejectorfids = [], []

        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag, org)
        fullpath = osPathJoin(OPTIONS.outdir, fname)
        fh = open(fullpath, 'w')
        seqsindb[org] = 0

        # distinct cases possible:
        if len(GSG):
            # there is already a GSG, so this is not the first blast iteration
            # do not apply a shortcut when OPTIONS.abinitio == False
            coords = GSG.omsr2mask(org)
            if dbfraction == 'GSGupstream':
                # take only orfs LEFT of the first CBG in GSG
                max_orf_nt_start = max(
                    GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    max_orf_start=max_orf_nt_start,
                    acceptorfids=acceptorfids,
                    rejectorfids=rejectorfids)
            elif dbfraction == 'GSGdownstream':
                # take only orfs RIGTH of the last CBG in GSG
                min_orf_nt_end = min(
                    GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    min_orf_end=min_orf_nt_end,
                    acceptorfids=acceptorfids,
                    rejectorfids=rejectorfids)
            elif dbfraction == 'GSGcentral':
                # take only orfs in between FIRST and LAST CBG in GSG (can be only one CBG!)
                max_orf_nt_start = max(
                    GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                min_orf_nt_end = min(
                    GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    min_orf_end=min_orf_nt_end,
                    max_orf_start=max_orf_nt_start,
                    acceptorfids=acceptorfids,
                    rejectorfids=rejectorfids)
            else:
                # dbfraction equals 'all' or None -> no limitation, just take all orfs!
                # do only the general limitation on sublists of orfids
                orflist = input[org]['orfs'].get_elegiable_orfs(
                    acceptorfids=acceptorfids, rejectorfids=rejectorfids)

            # create masked fasta of this sequence part only
            newfasta = input[org]['orfs'].tomaskedfasta(coords=coords,
                                                        orflist=orflist,
                                                        header_prefix=org)
            # write to file and count accessions in this file -> seqsindb[org]
            fh.write(newfasta)
            seqsindb[org] = newfasta.count(">")

        else:
            # No filled GSG objects -> no a priori knowledge yet
            # When dbfraction=='annotated' and !OPTIONS.abinitio -> take annotated orfs only
            # TODO: dbfraction is not checked/used here -> just OPTIONS.abinitio
            for orf in input[org]['orfs'].orfs:
                # in case not abinitio, make only a db of orfs in teh current annotation!
                if OPTIONS.abinitio == False and orf.id not in input[org][
                        'orfid-genestructure']:
                    continue
                if orf.id in rejectorfids:
                    # ignore Orfs that are listed as to-be-ignored
                    continue
                if acceptorfids and orf.id not in acceptorfids:
                    # ignore Orfs that are not listed as to-be-accepted
                    continue
                # write fasta of orf to file
                fh.write(
                    orf.tofasta(header="%s_orf_%s" % (org, orf.id)) + "\n")
                # increase seqsindb[org] counter
                seqsindb[org] += 1

        # close the filehandle
        fh.close()
        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath

    # return the counter of how much orf sequences are stored in the blast database
    return seqsindb
Пример #3
0
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of if Orfs) compared to this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """

    # get elegiable lists of Orfs
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are elgiable...
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[orf.id for orf in orfs[org].orfs],
            print[str(orf) for orf in orfs[org].orfs]
        ########################################################################

    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">", "").split(" ")[0].replace(
                    "_", "")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ,
                                                       node2=nodeS)[0]

                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose: print "NEW:", pacbporf
                    ############################################################

                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        ############################################################
        if verbose: print "org_set_size() PCG < CBG"
        ############################################################
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList:
            print fscbg
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
        return None
    elif cbgList and not cbgList[0].node_set().symmetric_difference(
            cbg.node_set()):
        # best CBG is not frameshifted, but CBG itself
        return None
    else:
        # score the difference between the frameshifted and current CBG
        score_cbg = cbg.total_weight() * cbg.omsr_identityscore()
        score_fscbg = cbgList[0].total_weight(
        ) * cbgList[0].omsr_identityscore()
        # check overlap between the frameshifted and current CBG
        a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg)

        ########################################################################
        if verbose:
            print "CBG", cbg
            cbg.printmultiplealignment()
            for fscbg in cbgList:
                print "fsCBG:", fscbg
                fscbg.printmultiplealignment()
        ########################################################################

        if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((0, 0, 1),
                                                          (1, 0, 0)):
            # CBG and frameshifted CBG do not share a single AA overlap...
            # This does not represent a frameshifted CBG as we searched for
            return False
        elif score_fscbg > score_cbg:
            # return the highest scoring, frameshifted CBG
            return cbgList[0]
        else:
            # no, still not convinced that this is a frameshifted CBG
            return False
Пример #4
0
def createblastdbs(input,GSG,OPTIONS,dbfraction=None,organism=None,acceptorfids=[],rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas thar are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (with attribute 'abinitio')

    @type  dbfraction: string
    @param dbfraction: None, 'all', 'GSGupstream', 'GSGcentral', 'GSGdownstream', 'annotation'

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!
    """
    seqsindb = {}
    for org in input.keys():
        # if organism is given, do only this one
        if organism and org!=organism: continue
        # acceptorfids anc rejectorfids only valid in combi with `organism`
        if not organism: acceptorfids, rejectorfids = [], [] 
  
        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag,org)
        fullpath = osPathJoin(OPTIONS.outdir,fname)
        fh = open(fullpath,'w')
        seqsindb[org] = 0

        # distinct cases possible:
        if len(GSG):
            # there is already a GSG, so this is not the first blast iteration
            # do not apply a shortcut when OPTIONS.abinitio == False
            coords = GSG.omsr2mask(org)
            if dbfraction == 'GSGupstream':
                # take only orfs LEFT of the first CBG in GSG
                max_orf_nt_start = max(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(max_orf_start=max_orf_nt_start,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            elif dbfraction == 'GSGdownstream':
                # take only orfs RIGTH of the last CBG in GSG
                min_orf_nt_end = min(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(min_orf_end=min_orf_nt_end,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            elif dbfraction == 'GSGcentral':
                # take only orfs in between FIRST and LAST CBG in GSG (can be only one CBG!)
                max_orf_nt_start = max(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                min_orf_nt_end   = min(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                orflist = input[org]['orfs'].get_elegiable_orfs(min_orf_end=min_orf_nt_end,
                        max_orf_start=max_orf_nt_start,
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)
            else:
                # dbfraction equals 'all' or None -> no limitation, just take all orfs!
                # do only the general limitation on sublists of orfids
                orflist = input[org]['orfs'].get_elegiable_orfs(
                        acceptorfids=acceptorfids,rejectorfids=rejectorfids)

            # create masked fasta of this sequence part only
            newfasta = input[org]['orfs'].tomaskedfasta(coords=coords,orflist=orflist,header_prefix=org)
            # write to file and count accessions in this file -> seqsindb[org]
            fh.write(newfasta)
            seqsindb[org] = newfasta.count(">")

        else:
            # No filled GSG objects -> no a priori knowledge yet
            # When dbfraction=='annotated' and !OPTIONS.abinitio -> take annotated orfs only
            # TODO: dbfraction is not checked/used here -> just OPTIONS.abinitio
            for orf in input[org]['orfs'].orfs:
                # in case not abinitio, make only a db of orfs in teh current annotation!
                if OPTIONS.abinitio == False and orf.id not in input[org]['orfid-genestructure']:
                    continue
                if orf.id in rejectorfids:
                    # ignore Orfs that are listed as to-be-ignored
                    continue
                if acceptorfids and orf.id not in acceptorfids:
                    # ignore Orfs that are not listed as to-be-accepted
                    continue
                # write fasta of orf to file
                fh.write(orf.tofasta(header="%s_orf_%s" % (org,orf.id))+"\n")
                # increase seqsindb[org] counter
                seqsindb[org]+=1

        # close the filehandle
        fh.close()
        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath

    # return the counter of how much orf sequences are stored in the blast database
    return seqsindb
Пример #5
0
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print[p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
Пример #6
0
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength() / 2) * 3
    orfs = get_reverse_strand_orfsets(cbg,
                                      frame,
                                      min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[len(o.protein_sequence) for o in orfs[org].orfs]
        ########################################################################

    revpacbps = {}
    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(
                    alignment.title.replace(">", ""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength() / 2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = (orgQ, orfQ.protein_startPY)
                nodeS = (orgS, orfS.protein_startPY)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
Пример #7
0
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs: orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org,cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(),fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [ orf.id for orf in orfs[org].orfs ],
            print [ str(orf) for orf in orfs[org].orfs ]
        ########################################################################

    for orgQ,orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence,
                        dbname="./"+blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">","").split(" ")[0].replace("_","")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = ( orgQ, orfQ.id )
                nodeS = ( orgS, orfS.id )
Пример #8
0
def blastanalysescbgjunction(gsg,prevCBG,nextCBG,
    omit_cbg_orfs = False,
    omit_non_cbg_orfs = False,
    extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
    omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
    verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org,orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf
        for org,orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,prevCBG,nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
            ).split("\n")
        )

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org,orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org,orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
               del(fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del(fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):",len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return [] 

    # check if all organisms are still covered
    orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return [] 

    # create !single! fasta database
    fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa"
    writeMultiFasta(fastadbmfa,fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ,orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ+"_orf_"+str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname,
                    extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS,_orfSid = alignment.title.replace(">","").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS,int(_orfSid))
                orfS  = orfs[nodeS]
               
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = ( orgQ, orfQ.id )
                nodeS = ( orgS, orfS.id )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return [] 

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]