def create_pacbpcollectiongraph_from_crossdata(
        crossdata, increment_to_graph=None):
    """
    Create (incremental) PacbpCollectionGraph from crossdata dict structure

    Every key of crossdata[(orgA,orgB)]['accepted_pacbs'] is a
    (bitscore,length,pointerA,pointerB) tuple. It yields the nodes
    (orgA,pointerA) and (orgB,pointerB) and an edge between them weighted
    by bitscore. When the edge exists already, its weight is only replaced
    by a higher bitscore.

    @type  crossdata: dict
    @param crossdata: crossdata <dict data structure>

    @type  increment_to_graph: None or PacbpCollectionGraph
    @param increment_to_graph: when applied, increment crossdata to applied PCG

    @rtype  g: PacbpCollectionGraph
    @return g: PacbpCollectionGraph (the applied graph itself when one is given)
    """
    if increment_to_graph is not None:
        # add new nodes to the existing graph. Compare to None instead of
        # testing truthiness: an empty graph may evaluate to False and would
        # then be silently replaced by a new graph instead of incremented
        g = increment_to_graph
    else:
        # create a new blank graph
        from graphAbgp import PacbpCollectionGraph
        g = PacbpCollectionGraph()

    for (orgA, orgB) in crossdata.keys():
        accepted_pacbs = crossdata[(orgA, orgB)]['accepted_pacbs']
        # sort keys in order to start with highest bitscore
        for key in sorted(accepted_pacbs.keys(), reverse=True):
            (bitscore, length, pointerA, pointerB) = key
            nodeA = (orgA, pointerA)
            nodeB = (orgB, pointerB)
            # check if (org,ORF) node exist already
            if nodeA not in g.get_nodes():
                g.add_node(nodeA)
            if nodeB not in g.get_nodes():
                g.add_node(nodeB)
            if not g.has_edge(nodeA, nodeB):
                # create a new edge
                g.add_edge(nodeA, nodeB, wt=bitscore)
            elif bitscore > g.get_edge_weight(nodeA, nodeB):
                # existing edge: keep the highest bitscore as weight
                g.set_edge_weight(nodeA, nodeB, bitscore)
    # ready!
    return g
def get_reverse_cbg(cbg,frame,verbose=False): """ Get the ReversecomplementCodingBlockGraph in requested frame of this CBG @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph to reversecomplement @type frame: integer @param frame: 0,1 or 2 @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes @rtype: ReversecomplementCodingBlockGraph or None @return: ReversecomplementCodingBlockGraph (when existing) or None """ min_orf_length = (cbg.omsrlength()/2)*3 orfs = get_reverse_strand_orfsets(cbg,frame,min_orf_length=min_orf_length) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps for org in orfs.keys(): fname = "%s_reversecbg_%s.mfa" % (org,cbg.barcode()) writeMultiFasta(orfs[org].tofastadict(),fname) multifastas[org] = fname ######################################################################## if verbose: print "ORFS:", org, len(orfs[org].orfs), print [len(o.protein_sequence) for o in orfs[org].orfs ] ######################################################################## revpacbps = {} for orgQ,orgS in cbg.pairwisecrosscombinations_organism(): # create blastdb if it does not exist yet if not blastdbs.has_key(orgS): formatdb(fname=multifastas[orgS]) blastdbs[orgS] = multifastas[orgS] revpacbporfs = {} for orfQ in orfs[orgQ].orfs: # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence, dbname="./"+blastdbs[orgS]) if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # obtain coordinates from sbjct orf identifier orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">","")) # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # skip if hsp is very short if len(hsp.query) < cbg.omsrlength()/2: continue # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = 
hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp),orfQ,orfS) ################################################################ if verbose: print pacbporf, orgQ,orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ###pacbporf.print_protein_and_dna() ################################################################ nodeQ = ( orgQ, orfQ.protein_startPY ) nodeS = ( orgS, orfS.protein_startPY ) uqkey = pacbporf.construct_unique_key(nodeQ,nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([ fname+".*" for fname in blastdbs.values()]) if not pacbpcol.organism_set_size() == cbg.organism_set_size(): # no CBG on the reverse strand return None # ``deepcopy`` PacbPcollection dpcpacbpcol.add_nodes( pacbpcol.get_nodes() ) for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore,length,orfQid,orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore) ############################################################################ if verbose: print pacbpcol, "bitscores:", print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ] ############################################################################ # do some transformations on the pacbpcol pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1) splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=cbg.node_count()-1 , max_missing_edges=0 ) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={}) cbgList.remove_all_but_complete_cbgs() 
cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_list_by_attribute(order_by='total_weight',reversed=True) ############################################################################ if verbose: for revcbg in cbgList: print "revCBG:", revcbg ############################################################################ if not cbgList: # no CBG on the reverse strand return None else: # return the highest scoring CBG as a ReversecomlementCodingBlockGraph return CodingBlockGraph2ReversecomlementCodingBlockGraph( cbgList.codingblockgraphs[0])
def get_frameshifted_cbg(cbg, input, verbose=True): """ Get a CBG with frameshifts (in some of if Orfs) compared to this CBG @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph to check for frameshifts @type input: dict @param input: input <dict data structure> with lists of Orfs @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes @rtype: CodingBlockGraph or None @return: CodingBlockGraph (when existing) or None """ # get elegiable lists of Orfs orfs = _get_elegiable_frameshift_orfsets(cbg, input) # check how many Orfs are elgiable... if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count(): # no frameshift possible here... return None # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps for org in orfs.keys(): # REMAP fastaheaders as ids to retrieve the Orfs after blast.. for orf in orfs[org].orfs: orf.fastaheader = str(orf.id) fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode()) writeMultiFasta(orfs[org].tofastadict(), fname) multifastas[org] = fname ######################################################################## if verbose: print "ORFS:", org, len(orfs[org].orfs), print[orf.id for orf in orfs[org].orfs], print[str(orf) for orf in orfs[org].orfs] ######################################################################## for orgQ, orgS in cbg.pairwisecrosscombinations_organism(): # create blastdb if it does not exist yet if not blastdbs.has_key(orgS): formatdb(fname=multifastas[orgS]) blastdbs[orgS] = multifastas[orgS] for orfQ in orfs[orgQ].orfs: # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id, orfQ.protein_sequence, dbname="./" + blastdbs[orgS]) if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # obtain coordinates from sbjct orf identifier orfid = alignment.title.replace(">", "").split(" ")[0].replace( "_", "") orfS = 
orfs[orgS].get_orf_by_id(int(orfid)) nodeQ = (orgQ, orfQ.id) nodeS = (orgS, orfS.id) if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes(): pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ, node2=nodeS)[0] else: # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp), orfQ, orfS) ############################################################ if verbose: print "NEW:", pacbporf ############################################################ uqkey = pacbporf.construct_unique_key(nodeQ, nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([fname + ".*" for fname in blastdbs.values()]) if not pacbpcol.organism_set_size() == cbg.organism_set_size(): ############################################################ if verbose: print "org_set_size() PCG < CBG" ############################################################ # no CBG on the reverse strand return None # ``deepcopy`` PacbPcollection dpcpacbpcol.add_nodes(pacbpcol.get_nodes()) for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore, length, orfQid, orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore) ############################################################################ if verbose: print pacbpcol, "bitscores:", print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()] ############################################################################ # do some transformations on the pacbpcol 
pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() - 1) splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=cbg.node_count() - 1, max_missing_edges=0) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={}) cbgList.remove_all_but_cbgs() cbgList.remove_cbgs_with_lt_nodes(cbg.node_count()) cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_graphlist_by_total_weight_and_identity() ############################################################################ if verbose: print "FScbgs (%s)" % len(cbgList) for fscbg in cbgList: print fscbg ############################################################################ if not cbgList: # no (better) frameshifted CBG return None elif cbgList and not cbgList[0].node_set().symmetric_difference( cbg.node_set()): # best CBG is not frameshifted, but CBG itself return None else: # score the difference between the frameshifted and current CBG score_cbg = cbg.total_weight() * cbg.omsr_identityscore() score_fscbg = cbgList[0].total_weight( ) * cbgList[0].omsr_identityscore() # check overlap between the frameshifted and current CBG a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg) ######################################################################## if verbose: print "CBG", cbg cbg.printmultiplealignment() for fscbg in cbgList: print "fsCBG:", fscbg fscbg.printmultiplealignment() ######################################################################## if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((0, 0, 1), (1, 0, 0)): # CBG and frameshifted CBG do not share a single AA overlap... 
# This does not represent a frameshifted CBG as we searched for return False elif score_fscbg > score_cbg: # return the highest scoring, frameshifted CBG return cbgList[0] else: # no, still not convinced that this is a frameshifted CBG return False
def detect_and_remove_single_nonfinal_inwpcbg(inwpcbgs,PCG,GENE_IDENTIFIER_SET,
    verbose=False):
    """
    Allow deletion of a single, poorly supported inwpCBG from the end of the list

    NOTE: a function with this same name is defined again further down in
    this file; on import that later definition replaces this one.

    The last inwpCBG is compared to the one before it by a list of 8 checks.
    Only when none of the checks is False, all PacbPORFs of the last inwpCBG
    are moved from the main PCG into a new PacbpCollectionGraph.

    @type  inwpcbgs: list
    @param inwpcbgs: list with inwpCBGs; the last 2 elements are compared

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PCG from which the PacbPORFs are deleted

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: organism identifiers taken into account

    @type  verbose: Boolean
    @param verbose: print the outcome of the checks to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: PCG holding the removed PacbPORFs, or False when nothing removed
    """
    # with fewer than 2 inwpCBGs there is nothing to compare to
    if len(inwpcbgs) < 2:
        return False

    final_cbg = inwpcbgs[-1]
    before_cbg = inwpcbgs[-2]

    # first node of each organism that is in GENE_IDENTIFIER_SET too
    final_nodes = [
        final_cbg.get_organism_nodes(org)[0] for org in
        final_cbg.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    before_nodes = [
        before_cbg.get_organism_nodes(org)[0] for org in
        before_cbg.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # shared nodes -> do not delete. Only go for very obvious things
    if Set(final_nodes).intersection(before_nodes):
        return False

    nt_spacing = before_cbg.nt_spacing_between_codingblocks([final_cbg])
    tcode_spacing = before_cbg.tcode_spacing_between_codingblocks([final_cbg])

    checklist = []
    # 1: more annotated exons in the previous than in the last inwpCBG
    checklist.append(
        before_cbg.count_orfs_labeled_as_annotated_exon() >
        final_cbg.count_orfs_labeled_as_annotated_exon() )
    # 2: higher bitscore
    checklist.append( before_cbg.get_bitscore() > final_cbg.get_bitscore() )
    # 3: more nodes
    checklist.append( len(before_nodes) > len(final_nodes) )
    # 4: at most 1/3 of the identifiers has an annotated exon in the last one
    checklist.append(
        float(final_cbg.count_orfs_labeled_as_annotated_exon()) /
        float(len(GENE_IDENTIFIER_SET)) <= 0.33 )
    # 5: average nt spacing above threshold (False without spacing data)
    if nt_spacing:
        average_nt = sum(nt_spacing.values())/float(len(nt_spacing))
        checklist.append( average_nt > MIN_INTERGENIC_NT_LENGTH )
    else:
        checklist.append( False )
    # 6: average tcode spacing below threshold (False without spacing data)
    if tcode_spacing:
        average_tcode = sum(tcode_spacing.values())/float(len(tcode_spacing))
        checklist.append( average_tcode < TCODE_MAX_NONCODING )
    else:
        checklist.append( False )
    # 7 & 8: projected tailing stop (non-aligned) AA differences
    checklist.append(
        before_cbg.get_projected_tailing_stop_aa_difference() <
        final_cbg.get_projected_tailing_stop_aa_difference() )
    checklist.append(
        before_cbg.get_projected_tailing_stop_nonaligned_aa_difference() <
        final_cbg.get_projected_tailing_stop_nonaligned_aa_difference() )

    ############################################################################
    if verbose:
        print("NonFinal inwpCBG check:" + " " + str(checklist))
    ############################################################################

    if False in checklist:
        return False

    # all checks passed: move all PacbPORFs into the nonfinalPCG
    nonfinalPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    for (pacbpkey,nodeQ,nodeS), pacbporf in final_cbg.pacbps.iteritems():
        fullkey = (pacbpkey,nodeQ,nodeS)
        nonfinalPCG.add_node(nodeQ)
        nonfinalPCG.add_node(nodeS)
        nonfinalPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        nonfinalPCG.pacbps[fullkey] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,fullkey)
    return nonfinalPCG
def detect_and_remove_utrornonegene_inwpcbgs(inwpcbgs,PCG,verbose=True):
    """
    Move PacbPORFs of 5' and 3' non coding / non gene inwpCBGs out of the PCG

    NOTE: a function with this same name is defined again further down in
    this file; on import that later definition replaces this one.

    @type  inwpcbgs: list
    @param inwpcbgs: list with inwpCBGs

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PCG from which the PacbPORFs are deleted

    @type  verbose: Boolean
    @param verbose: not used in this function

    @rtype:  PacbpCollectionGraph or False
    @return: PCG holding the removed PacbPORFs; False when the input is
             empty or when no inwpCBG is assigned as non coding / non gene
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0:
        return False

    # MAKE SURE ALL Orfs HAVE PREDICTED TSS SITES!!
    for cbg in inwpcbgs:
        cbg.scan_orfs_for_pssm_tss(min_pssm_score=TSS_MIN_PSSM_SCORE)

    # get target organism identifier (not used any further in this function)
    target = inwpcbgs[0]._get_target_organism()

    # detect inwpCBGs which are most likely 5' and 3' non coding or non gene;
    # the 3' list is appended to the list returned for the 5' side
    flagged = assign_utrornongene5p_inwpcbgs(inwpcbgs)
    flagged.extend(assign_utrornongene3p_inwpcbgs(inwpcbgs))

    # return False if no inwpcbgs are assigned
    if not flagged:
        return False

    # inwpCBGs that are NOT flagged, recognized by their string representation
    flagged_labels = [ str(cbg) for cbg in flagged ]
    retained = [ cbg for cbg in inwpcbgs if str(cbg) not in flagged_labels ]

    # pacbp keys that occur in flagged inwpCBGs ONLY, each listed once
    removable_keys = []
    for cbg in flagged:
        for pacbpkey in cbg.pacbps.keys():
            # a key that occurs in a retained inwpCBG too is not deleted
            if any( pacbpkey in kept.pacbps.keys() for kept in retained ):
                continue
            if pacbpkey not in removable_keys:
                removable_keys.append(pacbpkey)

    # place all removable keys and PacbPORFs in the noncodingnongenePCG
    # and, at the same time, remove from the main PCG
    noncodingnongenePCG = PacbpCollectionGraph(
        crossdata={},blastmatrix=PCG._blastmatrix)
    for key in removable_keys:
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        noncodingnongenePCG.add_node(nodeQ)
        noncodingnongenePCG.add_node(nodeS)
        noncodingnongenePCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        noncodingnongenePCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    return noncodingnongenePCG
def detect_and_remove_gtgdiscrepancy(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True): """ """ # if empty list or empty PCG provided: return False if not inwpcbgs or not PCG or PCG.node_count() == 0: return False # get target organism identifier target = inwpcbgs[0]._get_target_organism() # Make *the* GTG of the strongest X informant species # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET); # unigene informants are not taken into account here. # X is defined here by: # -- at least 3 informants (for very small number of informants) # -- optimally half of the total numers of informants # -- at most 8 informants min_gtg_node_count = 3 + 1 max_gtg_node_count = 8 + 1 gtg_size = min([(len(GENE_IDENTIFIER_SET)-1)/2, max_gtg_node_count]) gtg_size = max([min_gtg_node_count,gtg_size]) btGTG = pcg2gtg_by_bitscore(PCG,target,identifier_list=GENE_IDENTIFIER_SET) ntGTG = pcg2gtg_by_identity(PCG,target,identifier_list=GENE_IDENTIFIER_SET) # TEMP solution because OrganismGraph != OrganismStarGraph # make bitscore ordered list of nodes bitscore_ordered_nodes = [] for (tNode,iNode),wt in btGTG.weights.iteritems(): if tNode==target: bitscore_ordered_nodes.append( ( wt, iNode ) ) bitscore_ordered_nodes.sort() #if verbose: print "btGTG::", bitscore_ordered_nodes while ntGTG.node_count() > gtg_size: # next line causes errors because OrganismGraph != OrganismStarGraph # this causes the target node in rare cases to be assigned as the weakest node # informant = btGTG.weakest_connected_node() (wt,informant) = bitscore_ordered_nodes.pop(0) btGTG.del_node(informant) ntGTG.del_node(informant) if verbose: print "btGGT.weakest_connected_node() ==", informant, btGTG.get_ordered_nodes() ############################################################################ if verbose: print "ntGTG:", ntGTG.get_ordered_nodes(), for node in ntGTG.get_ordered_nodes(): if node == target: continue print "%1.2f" % ntGTG.weights[(target,node)], print "" 
############################################################################ # detect inwpCBGs which are probably the result of intron alignments gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(inwpcbgs,ntGTG) # detect inwpCBGs with strong discrepancy to this GTG gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(inwpcbgs,ntGTG) # merge both lists if gtgdiscrepancy_internal_inwpcbg_list: if not gtgdiscrepancy_inwpcbg_list: gtgdiscrepancy_inwpcbg_list.extend(gtgdiscrepancy_internal_inwpcbg_list) else: for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list: check_str = str(inwpcbg) if check_str not in [ str(gtgdiscrCBG) for gtgdiscrCBG in gtgdiscrepancy_inwpcbg_list ]: gtgdiscrepancy_inwpcbg_list.append( inwpcbg ) if not gtgdiscrepancy_inwpcbg_list: return False # get list of inwpCBGs that have NO discrepancy correct_inwpcbg_list = [] check_str_list = [] for discrinwpCBG in gtgdiscrepancy_inwpcbg_list: check_str_list.append( str(discrinwpCBG) ) for inwpcbg in inwpcbgs: if str(inwpcbg) not in check_str_list: correct_inwpcbg_list.append( inwpcbg ) # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY gtgdiscrepancy_pacbpkeys = [] for discrinwpCBG in gtgdiscrepancy_inwpcbg_list: for pacbpkey in discrinwpCBG.pacbps.keys(): # check if this pacbpkey is occuring in a non-removed inwpCBG is_occurring_in_correct_inwpcbg = False for inwp in correct_inwpcbg_list: if pacbpkey in inwp.pacbps.keys(): is_occurring_in_correct_inwpcbg = True break # if is_occurring_in_correct_inwpcbg, continue and do not delete if is_occurring_in_correct_inwpcbg: continue # store to gtgdiscrepancy_pacbpkeys when not stored already if pacbpkey not in gtgdiscrepancy_pacbpkeys: gtgdiscrepancy_pacbpkeys.append(pacbpkey) # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG # and, at the same time, remove from the main PCG gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix) for key in gtgdiscrepancy_pacbpkeys: 
if key not in PCG.pacbps.keys(): # !?!? TODO why not present in the PCG !?!?! # anyway, continue here to avoid KeyError # This PacbPORF was to be deleted rigth here, # so it is not an extreme disaster. But... scary ;-) continue (pacbpkey,nodeQ,nodeS) = key pacbporf = PCG.pacbps[key] # add to gtgdiscrepancyPCG gtgdiscrepancyPCG.add_node(nodeQ) gtgdiscrepancyPCG.add_node(nodeS) gtgdiscrepancyPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore) gtgdiscrepancyPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf # remove from main PCG _delete_pacbp(PCG,key) # return gtgdiscrepancyPCG return gtgdiscrepancyPCG
def detect_and_remove_single_nonfirst_inwpcbg(inwpcbgs,PCG,GENE_IDENTIFIER_SET,
    verbose=False):
    """
    Allow deletion of a single, poorly supported inwpCBG from the start of the list

    The first inwpCBG is compared to the next one by a list of 9 checks that
    should be True in case the first inwpCBG is *NOT* the first exon of this
    gene structure. When at most 1 check is False, all PacbPORFs of the
    first inwpCBG are moved from the main PCG into a new PacbpCollectionGraph.

    @type  inwpcbgs: list
    @param inwpcbgs: list with inwpCBGs; the first 2 elements are compared

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PCG from which the PacbPORFs are deleted

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: organism identifiers taken into account

    @type  verbose: Boolean
    @param verbose: print the outcome of the checks to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: PCG holding the removed PacbPORFs, or False when nothing removed
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1:
        return False

    firstInwpCBG = inwpcbgs[0]
    nextInwpCBG = inwpcbgs[1]

    # first node of each organism that is in GENE_IDENTIFIER_SET too
    firstNodeList = [
        firstInwpCBG.get_organism_nodes(org)[0] for org in
        firstInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]
    nextNodeList = [
        nextInwpCBG.get_organism_nodes(org)[0] for org in
        nextInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET) ]

    # identical nodes -> do not delete. Only go for very obvious things
    # (builtin set instead of the deprecated sets.Set)
    if set(firstNodeList).intersection(nextNodeList):
        return False

    ntdistdict = firstInwpCBG.nt_spacing_between_codingblocks([nextInwpCBG])
    tcodedistdict = firstInwpCBG.tcode_spacing_between_codingblocks(
        [nextInwpCBG])

    # make a long list of checks which should be True in case
    # firstInwpCBG is *NOT* the first exon of this gene structure
    check1 = nextInwpCBG.count_orfs_labeled_as_annotated_exon() >\
        firstInwpCBG.count_orfs_labeled_as_annotated_exon()
    check2 = nextInwpCBG.get_bitscore() > firstInwpCBG.get_bitscore()
    check3 = len(nextNodeList) > len(firstNodeList)
    check4 = float(firstInwpCBG.count_orfs_labeled_as_annotated_exon()) /\
        float(len(GENE_IDENTIFIER_SET)) <= 0.33
    # check5 & check6 are False when no spacing data is available
    if ntdistdict:
        check5 = sum(ntdistdict.values())/float(len(ntdistdict)) >\
            MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values())/float(len(tcodedistdict)) <\
            TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = nextInwpCBG.count_orfs_labeled_as_first_exon() >=\
        firstInwpCBG.count_orfs_labeled_as_first_exon()
    check8 = firstInwpCBG.count_orfs_labeled_as_annotated_exon() == 0
    check9 = nextInwpCBG.get_average_upstream_methionine_pssm_score() >\
        firstInwpCBG.get_average_upstream_methionine_pssm_score()
    checklist = [check1,check2,check3,check4,check5,check6,check7,check8,check9]

    ############################################################################
    # honour the verbose flag; the original `verbose or True` always printed
    if verbose:
        print("NonFirst inwpCBG check: %s" % (checklist,))
    ############################################################################

    # a single failing check is tolerated
    if checklist.count(False) > 1:
        return False

    nonfirstPCG = PacbpCollectionGraph(crossdata={},blastmatrix=PCG._blastmatrix)
    # place all PacbPORFs in the nonfirstPCG
    for (pacbpkey,nodeQ,nodeS), pacbporf in firstInwpCBG.pacbps.iteritems():
        nonfirstPCG.add_node(nodeQ)
        nonfirstPCG.add_node(nodeS)
        nonfirstPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        nonfirstPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,(pacbpkey,nodeQ,nodeS))
    # return nonfirstPCG
    return nonfirstPCG
def detect_and_remove_synteny(inwpcbgs, PCG, GENE_IDENTIFIER_SET, verbose=True):
    """
    Move PacbPORFs of inwpCBGs that are assigned as syntenic out of the PCG

    Syntenic inwpCBGs are searched for in the main list of inwpCBGs and,
    for each inwpCBG with less than 2 annotated exons, in the inwpCBGs that
    are made from a subPCG of only its stronger connected organisms. Of
    each main inwpCBG that contains all nodes of a syntenic (sub)inwpCBG,
    all PacbPORFs are moved from the main PCG into a new PCG.

    @type  inwpcbgs: list
    @param inwpcbgs: list with inwpCBGs

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PCG from which the PacbPORFs are deleted

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: only organisms in this set are added as
                                informants to the GeneTreeGraph

    @type  verbose: Boolean
    @param verbose: print detected syntenic inwpCBGs to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: PCG holding the removed PacbPORFs; False when no syntenic
             inwpCBG is detected
    """
    # nodes with an observed vs. expected weighted connectivity below this
    # ratio are removed from the GeneTreeGraph
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20
    # organism combinations (ordered node lists) that are tested already
    observed_organism_subcombis = []
    syntenic_subinwpcbgs = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_inwpcbgs = assign_syntenic_inwpcbgs(inwpcbgs)
    for syntinwpcbg in syntenic_inwpcbgs:
        syntenic_subinwpcbgs.append(syntinwpcbg)

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2:
            continue
        target = inwpCBG._get_target_organism()
        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        # first pass: add the sbjct organisms as nodes
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET:
                continue
            gtg.add_node(orgS)
        # second pass: add the edges, weighted by bitscore
        for (pacbpkey, nodeQ, nodeS), pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET:
                continue
            gtg.add_edge(orgQ, orgS, wt=pacbporf.bitscore)
            # make artificially missed edges between the informants
            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # source; this loop is assumed to belong inside the pacbps loop
            # because it uses orgQ, orgS and pacbporf - confirm
            for org in inwpCBG.organism_set():
                if org not in [orgQ, orgS] and org in GENE_IDENTIFIER_SET:
                    # NOTE(review): the else branch is taken as well when
                    # the edge exists with a weight <= this bitscore; what
                    # add_edge() does for an existing edge cannot be told
                    # from here - confirm
                    if gtg.has_edge( orgS, org ) and\
                    gtg.weights[(orgS, org)] > pacbporf.bitscore:
                        gtg.set_edge_weight(orgS, org, wt=pacbporf.bitscore)
                    else:
                        gtg.add_edge(orgS, org, wt=pacbporf.bitscore)

        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1:
            continue

        # remove (much) weaker connected nodes as expected from the gtg
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO >\
        min( [ gtg.get_node_weighted_connectivity_observed_vs_expected(node)
                for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)

        # check if already tested before; present in observed_organism_subcombis
        if gtg.get_ordered_nodes() in observed_organism_subcombis:
            continue
        # store to already tested organism subcombinations
        observed_organism_subcombis.append(gtg.get_ordered_nodes())

        # create a subPCG of these organisms: only PacbPORFs of which both
        # organisms are still a node in the gtg
        subPCG = PacbpCollectionGraph(crossdata={}, blastmatrix=PCG._blastmatrix)
        for (pacbpkey, nodeQ, nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ, orfQid), (orgS, orfSid) = nodeQ, nodeS
            if orgQ not in gtg.get_nodes():
                continue
            if orgS not in gtg.get_nodes():
                continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf

        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs:
            continue

        ########################################################################
        #if verbose:
        #    print "subPCG organism set:", gtg.get_ordered_nodes()
        #    print_inwpcbgstructure(subinwpcbgs,gtg.get_ordered_nodes())
        ########################################################################

        # create a subInwardsPointingCodingBlockGraph of these organisms
        #subinwpCBG = InwardsPointingCodingBlockGraph()
        #for (pacbpkey,nodeQ,nodeS), pacbporf in inwpCBG.pacbps.iteritems():
        #    (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
        #    if orgQ not in gtg.get_nodes(): continue
        #    if orgS not in gtg.get_nodes(): continue
        #    subinwpCBG.add_node(nodeQ)
        #    subinwpCBG.add_node(nodeS)
        #    subinwpCBG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        #    subinwpCBG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # detect syntenic genes in this subinwpcbgs
        syntenic_inwpcbgs = assign_syntenic_inwpcbgs(subinwpcbgs)
        for syntinwpcbg in syntenic_inwpcbgs:
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            # NOTE(review): assumed to be inside this loop because it prints
            # the current syntinwpcbg - confirm
            if verbose:
                print "SYNTENIC!!", syntinwpcbg, syntinwpcbg.get_ordered_nodes( )
                for subCBG in subinwpcbgs:
                    print "syntenic in:", subCBG, subCBG.get_ordered_nodes()
            ####################################################################

    if not syntenic_subinwpcbgs:
        return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs: gather, each once,
    # the pacbp keys of every main inwpCBG that contains all nodes of a
    # syntenic (sub)inwpCBG
    syntenic_pacbpkeys = []
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if not node_set.difference(inwpCBG.node_set()):
                for pacbpkey in inwpCBG.pacbps.keys():
                    if pacbpkey not in syntenic_pacbpkeys:
                        syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={}, blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey, nodeQ, nodeS) = key
        # NOTE(review): raises KeyError when key is absent from the main
        # PCG; detect_and_remove_gtgdiscrepancy() guards against that case
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    # return syntenicPCG
    return syntenicPCG
def detect_and_remove_single_nonfinal_inwpcbg(inwpcbgs, PCG, GENE_IDENTIFIER_SET,
    verbose=False):
    """
    Allow deletion of a single, poorly supported inwpCBG from the end of the list

    NOTE: a function with this same name is defined earlier in this file
    too; being the later one, this definition is the one in effect.

    The last inwpCBG is compared to the one before it by a list of 8 checks.
    Only when none of the checks is False, all PacbPORFs of the last inwpCBG
    are moved from the main PCG into a new PacbpCollectionGraph.

    @type  inwpcbgs: list
    @param inwpcbgs: list with inwpCBGs; the last 2 elements are compared

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PCG from which the PacbPORFs are deleted

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: organism identifiers taken into account

    @type  verbose: Boolean
    @param verbose: print the outcome of the checks to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: PCG holding the removed PacbPORFs, or False when nothing removed
    """
    def _average(distdict):
        """ Average of the values of a (non-empty) dict """
        return sum(distdict.values())/float(len(distdict))

    # we need at least 2 inwpCBGs in order to remove one of them
    if not len(inwpcbgs) > 1:
        return False

    tail = inwpcbgs[-1]
    penultimate = inwpcbgs[-2]

    # first node of each organism that is in GENE_IDENTIFIER_SET too
    tail_orgs = tail.organism_set().intersection(GENE_IDENTIFIER_SET)
    tail_nodes = [ tail.get_organism_nodes(org)[0] for org in tail_orgs ]
    penultimate_orgs = penultimate.organism_set().intersection(
        GENE_IDENTIFIER_SET)
    penultimate_nodes = [
        penultimate.get_organism_nodes(org)[0] for org in penultimate_orgs ]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(tail_nodes).intersection(penultimate_nodes):
        return False

    nt_dist = penultimate.nt_spacing_between_codingblocks([tail])
    tcode_dist = penultimate.tcode_spacing_between_codingblocks([tail])

    # all checks must hold in order to delete the last inwpCBG
    more_annotated_exons = \
        penultimate.count_orfs_labeled_as_annotated_exon() >\
        tail.count_orfs_labeled_as_annotated_exon()
    higher_bitscore = penultimate.get_bitscore() > tail.get_bitscore()
    more_nodes = len(penultimate_nodes) > len(tail_nodes)
    few_annotated_exons = \
        float(tail.count_orfs_labeled_as_annotated_exon()) /\
        float(len(GENE_IDENTIFIER_SET)) <= 0.33
    # both spacing checks are False when no spacing data is available
    large_nt_spacing = False
    if nt_dist:
        large_nt_spacing = _average(nt_dist) > MIN_INTERGENIC_NT_LENGTH
    noncoding_spacing = False
    if tcode_dist:
        noncoding_spacing = _average(tcode_dist) < TCODE_MAX_NONCODING
    smaller_stop_difference = \
        penultimate.get_projected_tailing_stop_aa_difference() <\
        tail.get_projected_tailing_stop_aa_difference()
    smaller_nonaligned_stop_difference = \
        penultimate.get_projected_tailing_stop_nonaligned_aa_difference() <\
        tail.get_projected_tailing_stop_nonaligned_aa_difference()

    checklist = [
        more_annotated_exons, higher_bitscore, more_nodes,
        few_annotated_exons, large_nt_spacing, noncoding_spacing,
        smaller_stop_difference, smaller_nonaligned_stop_difference,
    ]

    ############################################################################
    if verbose:
        print(" ".join(["NonFinal inwpCBG check:", str(checklist)]))
    ############################################################################

    if checklist.count(False) != 0:
        return False

    nonfinalPCG = PacbpCollectionGraph(crossdata={},
                                       blastmatrix=PCG._blastmatrix)
    # place all PacbPORFs in the nonfinalPCG and remove them from main PCG
    for (pacbpkey, nodeQ, nodeS), pacbporf in tail.pacbps.iteritems():
        nonfinalPCG.add_node(nodeQ)
        nonfinalPCG.add_node(nodeS)
        nonfinalPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        nonfinalPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        _delete_pacbp(PCG, (pacbpkey, nodeQ, nodeS))

    return nonfinalPCG
def detect_and_remove_utrornonegene_inwpcbgs(inwpcbgs, PCG, verbose=True):
    """
    Remove PacbPORFs of inwpCBGs assigned as 5' or 3' UTR / non-gene

    inwpCBGs are assigned by assign_utrornongene5p_inwpcbgs() and
    assign_utrornongene3p_inwpcbgs(). PacbPORFs that occur ONLY in assigned
    inwpCBGs are deleted from the main PCG and returned in a new
    PacbpCollectionGraph.

    @type  inwpcbgs: list
    @param inwpcbgs: list of inwpCBG objects

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PacbpCollectionGraph; PacbPORFs are deleted from it

    @type  verbose: Boolean
    @param verbose: not used in this function; kept for interface stability

    @rtype:  PacbpCollectionGraph or False
    @return: graph with the removed PacbPORFs, or False when no inwpCBGs
             or no PCG nodes are applied or no inwpCBG is assigned
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0:
        return False

    # make sure all Orfs have predicted TSS sites
    for inwpCBG in inwpcbgs:
        inwpCBG.scan_orfs_for_pssm_tss(min_pssm_score=TSS_MIN_PSSM_SCORE)

    # detect inwpCBGs which are most likely 5' and 3' non coding or non gene;
    # copy into a fresh list so the list returned by the helper is not mutated
    ncng_list = list(assign_utrornongene5p_inwpcbgs(inwpcbgs))
    ncng_list.extend(assign_utrornongene3p_inwpcbgs(inwpcbgs))

    # return False if no inwpcbgs are assigned
    if not ncng_list:
        return False

    # inwpCBGs are compared by their string representation
    ncng_labels = set([str(ncnginwpCBG) for ncnginwpCBG in ncng_list])
    correct_inwpcbg_list = [
        inwpcbg for inwpcbg in inwpcbgs if str(inwpcbg) not in ncng_labels]

    # get all pacbp keys belonging to noncoding / nongene inwpcbgs ONLY;
    # the list keeps first-seen order, the set makes the duplicate test cheap
    ncng_pacbpkeys = []
    seen_pacbpkeys = set()
    for ncnginwpCBG in ncng_list:
        for pacbpkey in ncnginwpCBG.pacbps.keys():
            if pacbpkey in seen_pacbpkeys:
                continue
            # keys that also occur in a non-removed inwpCBG must stay
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps:
                    is_occurring_in_correct_inwpcbg = True
                    break
            if is_occurring_in_correct_inwpcbg:
                continue
            seen_pacbpkeys.add(pacbpkey)
            ncng_pacbpkeys.append(pacbpkey)

    # place all ncng_pacbpkeys and PacbPORFs in the noncodingnongenePCG
    # and, at the same time, remove from the main PCG
    noncodingnongenePCG = PacbpCollectionGraph(crossdata={},
            blastmatrix=PCG._blastmatrix)
    for key in ncng_pacbpkeys:
        if key not in PCG.pacbps:
            # same guard as in detect_and_remove_gtgdiscrepancy: a key that
            # is absent from the PCG cannot be moved, so skip it instead of
            # raising a KeyError
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        noncodingnongenePCG.add_node(nodeQ)
        noncodingnongenePCG.add_node(nodeS)
        noncodingnongenePCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        noncodingnongenePCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    return noncodingnongenePCG
def detect_and_remove_gtgdiscrepancy(inwpcbgs, PCG, GENE_IDENTIFIER_SET,
        verbose=True):
    """
    Remove PacbPORFs of inwpCBGs that are in discrepancy with the GeneTreeGraph

    A GTG of the strongest informants is made from the PCG. inwpCBGs assigned
    by assign_internal_nongene_alignments() and
    assign_gtgdiscrepancy_inwpcbgs() against this GTG are collected. Their
    PacbPORFs that do not occur in any non-assigned inwpCBG are deleted from
    the main PCG and returned in a new PacbpCollectionGraph.

    @type  inwpcbgs: list
    @param inwpcbgs: list of inwpCBG objects

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PacbpCollectionGraph; PacbPORFs are deleted from it

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: gene organism identifiers (unigene informants
                                are not taken into account here)

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  PacbpCollectionGraph or False
    @return: graph with the removed PacbPORFs, or False when no inwpCBGs
             or no PCG nodes are applied or no discrepancy is detected
    """
    # if empty list or empty PCG provided: return False
    if not inwpcbgs or not PCG or PCG.node_count() == 0:
        return False

    # get target organism identifier
    target = inwpcbgs[0]._get_target_organism()

    # Make *the* GTG of the strongest X informant species
    # X depends on the maximum number of gene informants (GENE_IDENTIFIER_SET);
    # X is defined here by:
    # -- at least 3 informants (for very small number of informants)
    # -- optimally half of the total number of informants
    # -- at most 8 informants
    # the +1 accounts for the target node itself
    min_gtg_node_count = 3 + 1
    max_gtg_node_count = 8 + 1
    # `//` keeps the integer (floor) division explicit
    gtg_size = min([(len(GENE_IDENTIFIER_SET) - 1) // 2, max_gtg_node_count])
    gtg_size = max([min_gtg_node_count, gtg_size])

    btGTG = pcg2gtg_by_bitscore(PCG, target,
            identifier_list=GENE_IDENTIFIER_SET)
    ntGTG = pcg2gtg_by_identity(PCG, target,
            identifier_list=GENE_IDENTIFIER_SET)

    # TEMP solution because OrganismGraph != OrganismStarGraph:
    # btGTG.weakest_connected_node() can in rare cases assign the target node
    # as the weakest node, so informants are ordered here by the bitscore of
    # their edge with the target (weakest first)
    bitscore_ordered_nodes = []
    for (tNode, iNode), wt in btGTG.weights.items():
        if tNode == target:
            bitscore_ordered_nodes.append((wt, iNode))
    bitscore_ordered_nodes.sort()

    while ntGTG.node_count() > gtg_size:
        if not bitscore_ordered_nodes:
            # no informant left to remove; stop instead of raising IndexError
            break
        (wt, informant) = bitscore_ordered_nodes.pop(0)
        btGTG.del_node(informant)
        ntGTG.del_node(informant)
        if verbose:
            print("btGGT.weakest_connected_node() == %s %s" % (
                informant, btGTG.get_ordered_nodes()))

    ############################################################################
    if verbose:
        # one output line: the node list, the weight of each target-informant
        # edge and a closing empty field (keeps the trailing space)
        fields = ["ntGTG:", str(ntGTG.get_ordered_nodes())]
        for node in ntGTG.get_ordered_nodes():
            if node == target:
                continue
            fields.append("%1.2f" % ntGTG.weights[(target, node)])
        fields.append("")
        print(" ".join(fields))
    ############################################################################

    # detect inwpCBGs which are probably the result of intron alignments
    gtgdiscrepancy_internal_inwpcbg_list = assign_internal_nongene_alignments(
            inwpcbgs, ntGTG)

    # detect inwpCBGs with strong discrepancy to this GTG
    gtgdiscrepancy_inwpcbg_list = assign_gtgdiscrepancy_inwpcbgs(
            inwpcbgs, ntGTG)

    # merge both lists; inwpCBGs are compared by their string representation
    if gtgdiscrepancy_internal_inwpcbg_list:
        if not gtgdiscrepancy_inwpcbg_list:
            gtgdiscrepancy_inwpcbg_list.extend(
                gtgdiscrepancy_internal_inwpcbg_list)
        else:
            known_labels = set(
                [str(cbg) for cbg in gtgdiscrepancy_inwpcbg_list])
            for inwpcbg in gtgdiscrepancy_internal_inwpcbg_list:
                check_str = str(inwpcbg)
                if check_str not in known_labels:
                    known_labels.add(check_str)
                    gtgdiscrepancy_inwpcbg_list.append(inwpcbg)

    if not gtgdiscrepancy_inwpcbg_list:
        return False

    # get list of inwpCBGs that have NO discrepancy
    discrepancy_labels = set(
        [str(cbg) for cbg in gtgdiscrepancy_inwpcbg_list])
    correct_inwpcbg_list = [
        inwpcbg for inwpcbg in inwpcbgs
        if str(inwpcbg) not in discrepancy_labels]

    # get all pacbp keys belonging to gtgdiscrepancy inwpcbgs ONLY;
    # the list keeps first-seen order, the set makes the duplicate test cheap
    gtgdiscrepancy_pacbpkeys = []
    seen_pacbpkeys = set()
    for discrinwpCBG in gtgdiscrepancy_inwpcbg_list:
        for pacbpkey in discrinwpCBG.pacbps.keys():
            if pacbpkey in seen_pacbpkeys:
                continue
            # keys that also occur in a non-removed inwpCBG must stay
            is_occurring_in_correct_inwpcbg = False
            for inwp in correct_inwpcbg_list:
                if pacbpkey in inwp.pacbps:
                    is_occurring_in_correct_inwpcbg = True
                    break
            if is_occurring_in_correct_inwpcbg:
                continue
            seen_pacbpkeys.add(pacbpkey)
            gtgdiscrepancy_pacbpkeys.append(pacbpkey)

    # place all gtgdiscrepancy_pacbpkeys and PacbPORFs in the gtgdiscrepancyPCG
    # and, at the same time, remove from the main PCG
    gtgdiscrepancyPCG = PacbpCollectionGraph(crossdata={},
            blastmatrix=PCG._blastmatrix)
    for key in gtgdiscrepancy_pacbpkeys:
        if key not in PCG.pacbps:
            # TODO: why is this key not present in the PCG?
            # Continue here to avoid a KeyError; this PacbPORF was to be
            # deleted right here anyway.
            continue
        (pacbpkey, nodeQ, nodeS) = key
        pacbporf = PCG.pacbps[key]
        gtgdiscrepancyPCG.add_node(nodeQ)
        gtgdiscrepancyPCG.add_node(nodeS)
        gtgdiscrepancyPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        gtgdiscrepancyPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, key)

    return gtgdiscrepancyPCG
def detect_and_remove_single_nonfirst_inwpcbg(inwpcbgs, PCG,
        GENE_IDENTIFIER_SET, verbose=False):
    """
    Remove a single, poorly supported inwpCBG from the START of the list

    The first inwpCBG is compared to the next one by a series of checks that
    should be True in case the first inwpCBG is *NOT* the first exon of this
    gene structure. When at most one check fails, the PacbPORFs of the first
    inwpCBG are deleted from the main PCG and returned in a new
    PacbpCollectionGraph.

    @type  inwpcbgs: list
    @param inwpcbgs: list of inwpCBG objects

    @type  PCG: PacbpCollectionGraph
    @param PCG: main PacbpCollectionGraph; PacbPORFs are deleted from it

    @type  GENE_IDENTIFIER_SET: set
    @param GENE_IDENTIFIER_SET: organism identifiers that are taken into account

    @type  verbose: Boolean
    @param verbose: print the outcome of the checks to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: graph with the removed PacbPORFs, or False when nothing is removed
    """
    # we need at least 2 inwpCBGs in order to remove one of them
    if len(inwpcbgs) <= 1:
        return False

    firstInwpCBG = inwpcbgs[0]
    nextInwpCBG = inwpcbgs[1]

    # first node of each organism that is also in GENE_IDENTIFIER_SET
    firstNodeList = [
        firstInwpCBG.get_organism_nodes(org)[0] for org in
        firstInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET)]
    nextNodeList = [
        nextInwpCBG.get_organism_nodes(org)[0] for org in
        nextInwpCBG.organism_set().intersection(GENE_IDENTIFIER_SET)]

    # identical nodes -> do not delete. Only go for very obvious things
    if Set(firstNodeList).intersection(nextNodeList):
        return False

    ntdistdict = firstInwpCBG.nt_spacing_between_codingblocks([nextInwpCBG])
    tcodedistdict = firstInwpCBG.tcode_spacing_between_codingblocks(
            [nextInwpCBG])

    check1 = nextInwpCBG.count_orfs_labeled_as_annotated_exon() > \
            firstInwpCBG.count_orfs_labeled_as_annotated_exon()
    check2 = nextInwpCBG.get_bitscore() > firstInwpCBG.get_bitscore()
    check3 = len(nextNodeList) > len(firstNodeList)
    check4 = float(firstInwpCBG.count_orfs_labeled_as_annotated_exon()) / \
            float(len(GENE_IDENTIFIER_SET)) <= 0.33
    # average spacing checks; an empty spacing dict counts as a failed check
    if ntdistdict:
        check5 = sum(ntdistdict.values()) / float(len(ntdistdict)) > \
                MIN_INTERGENIC_NT_LENGTH
    else:
        check5 = False
    if tcodedistdict:
        check6 = sum(tcodedistdict.values()) / float(len(tcodedistdict)) < \
                TCODE_MAX_NONCODING
    else:
        check6 = False
    check7 = nextInwpCBG.count_orfs_labeled_as_first_exon() >= \
            firstInwpCBG.count_orfs_labeled_as_first_exon()
    check8 = firstInwpCBG.count_orfs_labeled_as_annotated_exon() == 0
    check9 = nextInwpCBG.get_average_upstream_methionine_pssm_score() > \
            firstInwpCBG.get_average_upstream_methionine_pssm_score()
    checklist = [check1, check2, check3, check4, check5,
                 check6, check7, check8, check9]

    ############################################################################
    # honour the verbose flag (was `verbose or True`, which always printed)
    if verbose:
        print("NonFirst inwpCBG check: %s" % (checklist,))
    ############################################################################

    # more than one failed check: keep the first inwpCBG
    if checklist.count(False) > 1:
        return False

    nonfirstPCG = PacbpCollectionGraph(crossdata={},
            blastmatrix=PCG._blastmatrix)
    # place all PacbPORFs of the first inwpCBG in the nonfirstPCG;
    # iterate over a copy so the deletions below cannot disturb the loop
    for key, pacbporf in list(firstInwpCBG.pacbps.items()):
        (pacbpkey, nodeQ, nodeS) = key
        nonfirstPCG.add_node(nodeQ)
        nonfirstPCG.add_node(nodeS)
        nonfirstPCG.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
        nonfirstPCG.pacbps[(pacbpkey, nodeQ, nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG, (pacbpkey, nodeQ, nodeS))

    return nonfirstPCG
def blastanalysescbgjunction( gsg, prevCBG, nextCBG, omit_cbg_orfs=False, omit_non_cbg_orfs=False, extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS, omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK, verbose=False): """ """ ############################################################ if verbose: stw = StopWatch('blastanalysescbgjunction') stw.start() ############################################################ orfs = {} if not omit_cbg_orfs: # gather Orfs from prevCBG and nextCBG for org, orflist, in prevCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf for org, orflist, in nextCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf ############################################################ if verbose: print stw.lap(), "orfs (1):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # create masked fasta database in a dict fastadbmfa = parseFasta( create_hmmdb_for_neighbouring_cbgs( gsg.input, prevCBG, nextCBG, omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction, ).split("\n")) ############################################################ if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa) ############################################################ # remove ORFs that do not belong to prevCBG and nextCBG, # or that DO belong to prevCBG and nextCBG, or neither fastaheaders = fastadbmfa.keys() for header in fastaheaders: org, orfid = header.split("_orf_") orfid = int(orfid) node = (org, orfid) # check for the omit_non_cbg_orfs criterion add_orf = False if omit_non_cbg_orfs: if node not in orfs: del (fastadbmfa[header]) else: add_orf = True # check for the omit_cbg_orfs criterion if omit_cbg_orfs and node in orfs: del (fastadbmfa[header]) if add_orf: # get this Orf and add to orfs orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid) 
############################################################ if verbose: print stw.lap(), "fasta db (2):", len(fastadbmfa) print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys()) ############################################################ ############################################################ if verbose: print stw.lap(), "orfs (2):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # no query/sbjct range left at all if not fastadbmfa: return [] # check if all organisms are still covered orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()]) if orgSet.symmetric_difference(gsg.organism_set()): return [] # create !single! fasta database fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa" writeMultiFasta(fastadbmfa, fastadbname) formatdb(fname=fastadbname) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps ############################################################ if verbose: print stw.lap(), "blastp starting" ############################################################ for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism(): for nodeQ, orfQ in orfs.iteritems(): # only blast the (masked) Orfs of orgQ if prevCBG.organism_by_node(nodeQ) != orgQ: continue # get the masked protein sequence of this orfObj header = orgQ + "_orf_" + str(orfQ.id) # check if key exists in fastadbmfa. In a case where # an Orf is masked out completely, it is absent here! 
if not fastadbmfa.has_key(header): continue protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)] # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname, extra_blastp_params=extra_blastp_params) # omit empty blast records if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # get sbjct Org and Orf identifiers _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_") if _orgS != orgS: continue nodeS = (_orgS, int(_orfSid)) orfS = orfs[nodeS] # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp), orfQ, orfS) ################################################################ if verbose: print pacbporf, orgQ, orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ################################################################ # create nodes; ( Organism Identifier, Orf Identifier ) nodeQ = (orgQ, orfQ.id) nodeS = (orgS, orfS.id) uqkey = pacbporf.construct_unique_key(nodeQ, nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! 
dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf ############################################################ if verbose: print stw.lap(), "blastp done" ############################################################ # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([fname + ".*" for fname in blastdbs.values()]) # check if all Organism/Gene identifiers are covered in PacbPs if not pacbpcol.organism_set_size() == gsg.organism_set_size(): return [] # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol # In dpcpacbpcol the actual PacbPORFs are stores & kept, # whereas pacbpcol itself is splitted in CBGs (which # function does not yet (!?) take the actual pacbps into account) dpcpacbpcol.add_nodes(pacbpcol.get_nodes()) for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore, length, orfQid, orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore) ################################################################ if verbose: print pacbpcol print "PCG bitscores:", print[p.bitscore for p in dpcpacbpcol.pacbps.values()] print "PCG nodes:", dpcpacbpcol.get_ordered_nodes() ################################################################ #### do some transformations on the pacbpcol ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1) ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs( #### edges=gsg.node_count()-1 , max_missing_edges=0 ) ##### convert to list of CBGs and do some transformations ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={}) ####cbgList.remove_all_but_complete_cbgs() ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) ####cbgList.remove_cbgs_without_omsr() ####cbgList.update_edge_weights_by_minimal_spanning_range() ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True) min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2]) 
pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity) max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3 splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={}) cbgList.remove_all_but_cbgs() cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.make_pacbps_for_missing_edges() cbgList.remove_all_but_complete_cbgs() cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_list_by_attribute(order_by='total_weight', reversed=True) # and create_cache() for these CBGs for cbg in cbgList: cbg.create_cache() #################################################################### if verbose: print stw.lap(), "CBGs created", len(cbgList) for newcbg in cbgList: print "new:", newcbg #################################################################### # return list with CBGs return cbgList.codingblockgraphs
def get_reverse_cbg(cbg, frame, verbose=False): """ Get the ReversecomplementCodingBlockGraph in requested frame of this CBG @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph to reversecomplement @type frame: integer @param frame: 0,1 or 2 @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes @rtype: ReversecomplementCodingBlockGraph or None @return: ReversecomplementCodingBlockGraph (when existing) or None """ min_orf_length = (cbg.omsrlength() / 2) * 3 orfs = get_reverse_strand_orfsets(cbg, frame, min_orf_length=min_orf_length) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps for org in orfs.keys(): fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode()) writeMultiFasta(orfs[org].tofastadict(), fname) multifastas[org] = fname ######################################################################## if verbose: print "ORFS:", org, len(orfs[org].orfs), print[len(o.protein_sequence) for o in orfs[org].orfs] ######################################################################## revpacbps = {} for orgQ, orgS in cbg.pairwisecrosscombinations_organism(): # create blastdb if it does not exist yet if not blastdbs.has_key(orgS): formatdb(fname=multifastas[orgS]) blastdbs[orgS] = multifastas[orgS] revpacbporfs = {} for orfQ in orfs[orgQ].orfs: # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id, orfQ.protein_sequence, dbname="./" + blastdbs[orgS]) if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # obtain coordinates from sbjct orf identifier orfS = orfs[orgS].get_orf_by_id( alignment.title.replace(">", "")) # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # skip if hsp is very short if len(hsp.query) < cbg.omsrlength() / 2: continue # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY 
hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp), orfQ, orfS) ################################################################ if verbose: print pacbporf, orgQ, orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ###pacbporf.print_protein_and_dna() ################################################################ nodeQ = (orgQ, orfQ.protein_startPY) nodeS = (orgS, orfS.protein_startPY) uqkey = pacbporf.construct_unique_key(nodeQ, nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([fname + ".*" for fname in blastdbs.values()]) if not pacbpcol.organism_set_size() == cbg.organism_set_size(): # no CBG on the reverse strand return None # ``deepcopy`` PacbPcollection dpcpacbpcol.add_nodes(pacbpcol.get_nodes()) for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore, length, orfQid, orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore) ############################################################################ if verbose: print pacbpcol, "bitscores:", print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()] ############################################################################ # do some transformations on the pacbpcol pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() - 1) splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=cbg.node_count() - 1, max_missing_edges=0) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={}) cbgList.remove_all_but_complete_cbgs() 
cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_list_by_attribute(order_by='total_weight', reversed=True) ############################################################################ if verbose: for revcbg in cbgList: print "revCBG:", revcbg ############################################################################ if not cbgList: # no CBG on the reverse strand return None else: # return the highest scoring CBG as a ReversecomlementCodingBlockGraph return CodingBlockGraph2ReversecomlementCodingBlockGraph( cbgList.codingblockgraphs[0])
def blastanalysescbgjunction(gsg,prevCBG,nextCBG, omit_cbg_orfs = False, omit_non_cbg_orfs = False, extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS, omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK, verbose=False): """ """ ############################################################ if verbose: stw = StopWatch('blastanalysescbgjunction') stw.start() ############################################################ orfs = {} if not omit_cbg_orfs: # gather Orfs from prevCBG and nextCBG for org,orflist, in prevCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org,orf.id)] = orf for org,orflist, in nextCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org,orf.id)] = orf ############################################################ if verbose: print stw.lap(), "orfs (1):",len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # create masked fasta database in a dict fastadbmfa = parseFasta( create_hmmdb_for_neighbouring_cbgs( gsg.input,prevCBG,nextCBG, omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction, ).split("\n") ) ############################################################ if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa) ############################################################ # remove ORFs that do not belong to prevCBG and nextCBG, # or that DO belong to prevCBG and nextCBG, or neither fastaheaders = fastadbmfa.keys() for header in fastaheaders: org,orfid = header.split("_orf_") orfid = int(orfid) node = (org,orfid) # check for the omit_non_cbg_orfs criterion add_orf = False if omit_non_cbg_orfs: if node not in orfs: del(fastadbmfa[header]) else: add_orf = True # check for the omit_cbg_orfs criterion if omit_cbg_orfs and node in orfs: del(fastadbmfa[header]) if add_orf: # get this Orf and add to orfs orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid) 
############################################################ if verbose: print stw.lap(), "fasta db (2):",len(fastadbmfa) print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys()) ############################################################ ############################################################ if verbose: print stw.lap(), "orfs (2):",len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # no query/sbjct range left at all if not fastadbmfa: return [] # check if all organisms are still covered orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()]) if orgSet.symmetric_difference(gsg.organism_set()): return [] # create !single! fasta database fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa" writeMultiFasta(fastadbmfa,fastadbname) formatdb(fname=fastadbname) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps ############################################################ if verbose: print stw.lap(), "blastp starting" ############################################################ for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism(): for nodeQ,orfQ in orfs.iteritems(): # only blast the (masked) Orfs of orgQ if prevCBG.organism_by_node(nodeQ) != orgQ: continue # get the masked protein sequence of this orfObj header = orgQ+"_orf_"+str(orfQ.id) # check if key exists in fastadbmfa. In a case where # an Orf is masked out completely, it is absent here! 
if not fastadbmfa.has_key(header): continue protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)] # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname, extra_blastp_params=extra_blastp_params) # omit empty blast records if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # get sbjct Org and Orf identifiers _orgS,_orfSid = alignment.title.replace(">","").split("_orf_") if _orgS != orgS: continue nodeS = (_orgS,int(_orfSid)) orfS = orfs[nodeS] # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp),orfQ,orfS) ################################################################ if verbose: print pacbporf, orgQ,orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ################################################################ # create nodes; ( Organism Identifier, Orf Identifier ) nodeQ = ( orgQ, orfQ.id ) nodeS = ( orgS, orfS.id ) uqkey = pacbporf.construct_unique_key(nodeQ,nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! 
dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf ############################################################ if verbose: print stw.lap(), "blastp done" ############################################################ # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([ fname+".*" for fname in blastdbs.values()]) # check if all Organism/Gene identifiers are covered in PacbPs if not pacbpcol.organism_set_size() == gsg.organism_set_size(): return [] # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol # In dpcpacbpcol the actual PacbPORFs are stores & kept, # whereas pacbpcol itself is splitted in CBGs (which # function does not yet (!?) take the actual pacbps into account) dpcpacbpcol.add_nodes( pacbpcol.get_nodes() ) for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore,length,orfQid,orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore) ################################################################ if verbose: print pacbpcol print "PCG bitscores:", print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]
def detect_and_remove_synteny(inwpcbgs,PCG,GENE_IDENTIFIER_SET,verbose=True):
    """
    Detect syntenic inwpCBGs and move their PacbPORFs out of the main PCG

    Synteny is assigned by assign_syntenic_inwpcbgs(), first on the main
    inwpCBGs and then on inwpCBGs that are rebuilt (PCG2inwpCBGS) from
    sub-selections of the organisms. Each sub-selection is obtained from a
    GeneTreeGraph built from the bitscores of a single inwpCBG, from which
    the weakest connected organisms are removed for as long as the lowest
    observed-vs-expected weighted connectivity is below 0.20.

    @attention: PCG is changed in place; every PacbPORF that is moved to the
                returned graph is removed from PCG with _delete_pacbp()

    @type  inwpcbgs: list
    @param inwpcbgs: inwpCBGs to inspect (presumably
                     InwardsPointingCodingBlockGraph objects; iterated
                     several times, so it must be re-iterable)

    @type  PCG: PacbpCollectionGraph
    @param PCG: main graph; its pacbps dict and _blastmatrix are used
                (presumably the graph the inwpcbgs were made from - verify
                against the caller)

    @type  GENE_IDENTIFIER_SET: container
    @param GENE_IDENTIFIER_SET: Organism/Gene identifiers of the informants
                     that may take part in the GeneTreeGraph analyses

    @type  verbose: Boolean
    @param verbose: print detected syntenic inwpCBGs to STDOUT

    @rtype:  PacbpCollectionGraph or False
    @return: new graph holding the PacbPORFs removed from PCG, or False when
             no syntenic inwpCBG was detected (PCG is then left untouched)
    """
    MIN_OBSERVED_VS_EXPECTED_RATIO = 0.20
    observed_organism_subcombis = []

    # detect syntenic genes in MAIN inwpCBGs,
    # without taking strongest informants by GTG analyses
    syntenic_subinwpcbgs = list( assign_syntenic_inwpcbgs(inwpcbgs) )

    for inwpCBG in inwpcbgs:
        # omit inwpCBGs with annotated exons/orfs
        if inwpCBG.count_orfs_labeled_as_annotated_exon() >= 2:
            continue

        target = inwpCBG._get_target_organism()

        # make a (artificially fully connected) GeneTreeGraph
        gtg = GeneTreeGraph()
        gtg.add_node(target)
        # first add all nodes, so all edges can be created in the next loop
        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_node(orgS)

        for (pacbpkey,nodeQ,nodeS),pacbporf in inwpCBG.pacbps.iteritems():
            orgQ = inwpCBG.organism_by_node(nodeQ)
            orgS = inwpCBG.organism_by_node(nodeS)
            if orgS not in GENE_IDENTIFIER_SET: continue
            gtg.add_edge( orgQ, orgS, wt = pacbporf.bitscore )
            # make artificially missed edges between the informants
            for org in inwpCBG.organism_set():
                if org in [orgQ,orgS]: continue
                if org not in GENE_IDENTIFIER_SET: continue
                # NOTE(review): an existing edge with a weight <= bitscore
                # falls through to add_edge(); whether that overwrites the
                # lower weight depends on GeneTreeGraph.add_edge - confirm
                if gtg.has_edge( orgS, org ) and\
                gtg.weights[(orgS, org)] > pacbporf.bitscore:
                    gtg.set_edge_weight(orgS,org,wt = pacbporf.bitscore)
                else:
                    gtg.add_edge( orgS, org, wt = pacbporf.bitscore )

        # omit (nearly) empty genetreegraphs
        if gtg.node_count() <= 1: continue

        # remove (much) weaker connected nodes as expected from the gtg;
        # the get_nodes() check prevents min() on an emptied graph
        while gtg.get_nodes() and MIN_OBSERVED_VS_EXPECTED_RATIO > min(
                [ gtg.get_node_weighted_connectivity_observed_vs_expected(node)
                  for node in gtg.get_nodes() ]):
            node = gtg.weakest_connected_node()
            gtg.del_node(node)

        # check if already tested before; present in observed_organism_subcombis
        ordered_nodes = gtg.get_ordered_nodes()
        if ordered_nodes in observed_organism_subcombis:
            continue
        # store to already tested organism subcombinations
        observed_organism_subcombis.append( ordered_nodes )

        # create a subPCG of these organisms; the gtg is not changed anymore,
        # so its organisms are gathered only once
        gtg_organisms = set( gtg.get_nodes() )
        subPCG = PacbpCollectionGraph(crossdata={},
                blastmatrix=PCG._blastmatrix)
        for (pacbpkey,nodeQ,nodeS), pacbporf in PCG.pacbps.iteritems():
            (orgQ,orfQid),(orgS,orfSid) = nodeQ,nodeS
            if orgQ not in gtg_organisms: continue
            if orgS not in gtg_organisms: continue
            subPCG.add_node(nodeQ)
            subPCG.add_node(nodeS)
            subPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
            subPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf

        # make inwpCBGs of this subPCG
        subinwpcbgs = PCG2inwpCBGS(subPCG)

        # check if there are subinwpcbgs
        if not subinwpcbgs: continue

        # detect syntenic genes in this subinwpcbgs
        for syntinwpcbg in assign_syntenic_inwpcbgs(subinwpcbgs):
            syntenic_subinwpcbgs.append(syntinwpcbg)
            ####################################################################
            if verbose:
                # single-argument print(...) gives the same output as the
                # Python 2 print statement with comma separated items
                print("SYNTENIC!! %s %s" % (
                        syntinwpcbg, syntinwpcbg.get_ordered_nodes() ))
                for subCBG in subinwpcbgs:
                    print("syntenic in: %s %s" % (
                            subCBG, subCBG.get_ordered_nodes() ))
            ####################################################################

    if not syntenic_subinwpcbgs: return False

    # cleanup all inwpCBGs from the syntenic subInwpCBGs:
    # gather the pacbp keys of each inwpCBG that contains all nodes of a
    # syntenic (sub)inwpCBG. A set does the duplicate check, the list keeps
    # the order in which the keys were encountered
    syntenic_pacbpkeys = []
    seen_pacbpkeys = set()
    for syntinwpcbg in syntenic_subinwpcbgs:
        node_set = syntinwpcbg.node_set()
        for inwpCBG in inwpcbgs:
            if node_set.difference(inwpCBG.node_set()): continue
            for pacbpkey in inwpCBG.pacbps.keys():
                if pacbpkey in seen_pacbpkeys: continue
                seen_pacbpkeys.add(pacbpkey)
                syntenic_pacbpkeys.append(pacbpkey)

    # place all syntenic_pacbpkeys and PacbPORFs in the syntenicPCG
    # and, at the same time, remove from the main PCG
    syntenicPCG = PacbpCollectionGraph(crossdata={},
            blastmatrix=PCG._blastmatrix)
    for key in syntenic_pacbpkeys:
        (pacbpkey,nodeQ,nodeS) = key
        pacbporf = PCG.pacbps[key]
        # add to syntenicPCG
        syntenicPCG.add_node(nodeQ)
        syntenicPCG.add_node(nodeS)
        syntenicPCG.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
        syntenicPCG.pacbps[(pacbpkey,nodeQ,nodeS)] = pacbporf
        # remove from main PCG
        _delete_pacbp(PCG,key)

    # return syntenicPCG
    return syntenicPCG