def start_top_hits(num_hits, stage, smotif_index): """ generate run seq, a seq list of pairs of indexes of profiles for job scheduling """ map_route = [] ss_profiles = io.readPickle("ss_profiles.pickle") if os.path.isfile("contacts_route.pickle"): map_route = io.readPickle("contacts_route.pickle") elif os.path.isfile("pcs_route.pickle"): map_route = io.readPickle("pcs_route.pickle") map_route_alt = io.readPickle("pcs_route_alt.pickle") elif os.path.isfile("rdc_route.pickle"): map_route = io.readPickle("rdc_route.pickle") map_route_alt = io.readPickle("rdc_route_alt.pickle") alt_smotif_defs = map_route_alt[smotif_index] top_hits = [] top_hit_file = str(smotif_index - 1) + "_refined_tophits.gzip" if os.path.isfile(top_hit_file): top_hits = io.readGzipPickle(top_hit_file) print "loading from prevously assembled refined_tophits.pickle file" print "# hits :", len(top_hits) else: top_hit_file = str(smotif_index - 1) + "_tophits.gzip" if os.path.isfile(top_hit_file): top_hits = io.readGzipPickle(top_hit_file) print "loading from prevously assembled tophits.pickle file" print "# hits :", len(top_hits) else: print "No previous tophits file found, Generating a new one" return "exception" if not top_hits: return False, False run_seq = [] for next_smotif in alt_smotif_defs: print next_smotif direction = next_smotif[-1] if direction == 'left': next_ss_list = ss_profiles[next_smotif[0]] else: next_ss_list = ss_profiles[next_smotif[1]] for i in range(len(top_hits)): for j in range(len(next_ss_list)): run_seq.append([i, j, next_smotif]) return run_seq, smotif_index
def getPreviousSmotif(index, next_index):
    """
    Modified for the Alt_Smotifs from the original version.

    Fetch hit number *index* from the previous step's top-hits file,
    preferring the refined file over the unrefined one.

    :param index: position of the wanted hit in the previous top-hits list.
    :param next_index: index of the current step; the file of step
        next_index - 1 is read.
    :return: the requested hit, or False when neither top-hits file exists.
    """
    prev_step = next_index - 1
    # Refined hits take precedence over the plain top hits.
    for suffix in ("_refined_tophits.gzip", "_tophits.gzip"):
        candidate = str(prev_step) + suffix
        if os.path.isfile(candidate):
            # Read in previous index hits
            return io.readGzipPickle(candidate)[index]
    return False
def makeTopPickle(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies, rank
     and extract top hits as defined.

    Reads every "<previous_smotif_index>_*_*.gzip" result file produced by
    the worker threads, scores each hit from its filter records, drops hits
    whose Smotif sequence was already seen, and dumps up to num_hits of the
    best entries to "<previous_smotif_index>_tophits.pickle".

    :param previous_smotif_index: index whose per-thread result files are
        collected (used as the filename prefix).
    :param num_hits: maximum number of top hits to keep.
    :param stage: assembly stage; forwarded to getNchiSum for PCS scoring.
    :return: range(count) over the number of hits actually dumped.
    """
    hits = []
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    # Concatenate the hits from all per-thread result files.
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
    RDC_filter, NOE_filter
    """
    # Bucket hits by score.  NOTE(review): a hit passing several filters is
    # appended under several keys, so it can appear in new_dict more than
    # once; the redundancy pass below de-duplicates by Smotif sequence.
    new_dict = collections.defaultdict(list)
    pcs_filter = False
    contact_filter = False
    rdc_filter = False
    noe_filter = False
    ref_rmsd_filter = False
    global_noe_filter = False
    for hit in hits:
        # thread_data contains data from each search and filter thread.
        for data_filter in hit:
            if data_filter[0] == 'PCS_filter':
                pcs_filter = True
                pcs_data = data_filter
                Nchi = getNchiSum(pcs_data, stage)
                # new_dict.setdefault(Nchi, []).append(entry)
                new_dict[Nchi].append(hit)
            if data_filter[0] == 'Evofilter':
                contact_filter = True
                new_dict[data_filter[1]].append(hit)
            if data_filter[0] == 'Ref_RMSD':
                ref_rmsd_filter = True
                new_dict[data_filter[1]].append(hit)
    # Dead code retained verbatim as a string literal (RDC/NOE scoring).
    """
    if data_filter[0] == 'RDC_filter':
        rdc_filter = True
        rdc_data = data_filter
        Nchi = rdcSumChi(rdc_data, stage)
        for filter in hit:
            if filter[0] == 'NOE_filter':
                noe_filter = True
                noe_fmeasure = filter[1]
                no_of_noes = filter[2]
                if stage == 99:
                    Nchi = Nchi
                else:
                    # Nchi = Nchi / math.pow(10, noe_fmeasure * no_of_noes)
                    Nchi = Nchi / math.pow(10, noe_fmeasure )
                #new_dict[Nchi].append(hit)
            if filter[0] == 'GlobalNoe_filter':
                global_noe_filter = True
                noe_percent = filter[1]
                total_no_of_noes = filter[2]
                Nchi = total_no_of_noes
                new_dict[Nchi].append(hit)
        if not noe_filter:
            new_dict[Nchi].append(hit)
    """
    # ************************************************
    # Exclude the redundant entries and rank top hits
    # ************************************************
    # Python 2 idiom: keys() returns a list that is sorted in place.
    # Scores sort ascending (lower is better) unless contact / global-NOE
    # scores are in play, where higher is better.
    keys = new_dict.keys()
    keys.sort()
    if contact_filter or global_noe_filter:
        # Contact filter data should be as high as possible
        print "Global NOEs are used"
        keys.reverse()
    # Exclude the redundant data.
    # non_redundant = {}
    non_redundant = collections.defaultdict(list)
    seqs = []
    smotif_seq = ''
    Nchi = 0.0
    count_hits = 0
    for i in range(0, len(keys)):
        entries = new_dict[keys[i]]
        for entry in entries:
            # Recover this entry's score from its filter records; later
            # matches overwrite earlier ones (Ref_RMSD wins if present).
            for ent in entry:
                if ent[0] == 'smotif':
                    name = ent[1][0]
                if ent[0] == 'seq_filter':
                    seq_filter = ent
                    smotif_seq = seq_filter[1]
                if ent[0] == 'PCS_filter':
                    pcs_data = ent
                    Nchi = getNchiSum(pcs_data, stage)
                if ent[0] == 'Evofilter':
                    Nchi = ent[1]
                if ent[0] == 'Ref_RMSD':
                    # Ref_RMSD overwrites the previous Nchis
                    Nchi = ent[1]
            # Dead code retained verbatim as a string literal.
            """
            if ent[0] == 'RDC_filter':
                rdc_data = ent
                Nchi = rdcSumChi(rdc_data, stage)
                if noe_filter:
                    for ent in entry:
                        if ent[0] == 'NOE_filter':
                            noe_fmeasure = ent[1]
                            no_of_noes = ent[2]
                            if stage == 99:
                                Nchi = Nchi
                            else:
                                # Nchi = Nchi / math.pow(10, noe_fmeasure * no_of_noes)
                                Nchi = Nchi / math.pow(10, noe_fmeasure)
                                #Nchi = Nchi / math.pow(10, noe_fmeasure * 10)
                else:
                    Nchi = rdcSumChi(rdc_data, stage)
            """
            # Keep only the first entry seen for each Smotif sequence.
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                # non_redundant.setdefault(Nchi, []).append(entry)
                non_redundant[Nchi].append(entry)
                count_hits += 1
        if count_hits >= num_hits:
            break
    # Rank top hits and dump the data
    keys = non_redundant.keys()
    keys.sort()
    # NOTE(review): this reversal condition differs from the bucketing pass
    # above (contact_filter OR global_noe_filter) — presumably intentional
    # when PCS scores dominate; confirm against the pipeline's scoring rules.
    if contact_filter and not pcs_filter:
        keys.reverse()
    dump_pickle = []
    print "Dumping data to disk"
    count_top_hits = 0
    # NOTE(review): both branches of the final if/else break, so despite
    # the while loop the key list is walked exactly once.
    while (True):
        for key in keys:
            if key == 999.999:
                # Do not work on these entries (sentinel score value)
                continue
            entries = non_redundant[key]
            for entry in entries:
                dump_pickle.append(entry)
                print "final sele", entry[0][1][0][0], key
                count_top_hits += 1
                if count_top_hits >= num_hits:
                    break
            if count_top_hits >= num_hits:
                break
        if count_top_hits >= num_hits:
            break
        else:
            print "could only extract ", count_top_hits
            break
    io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle)
    print "actual number in top hits ", len(dump_pickle)
    return range(count_top_hits)