def compute_statistics(self):
    """Returns the final results of the user prediction

    :return: a dataframe with various metrics for each transcription
        factor. Must call :meth:`score` before.
    """
    data = {'Pearson': [], 'Spearman': [], 'Pearson_Log': [],
            'AUROC_8mer': [], 'AUPR_8mer': [],
            'AUROC_probe': [], 'AUPR_probe': []}

    pb = progress_bar(self.Ntf, interval=1)
    for tf_index in range(1, self.Ntf + 1):
        dfdata = pd.read_csv(self._setfile(tf_index, "Data"), sep='\t',
                             header=None)
        # .ix is deprecated; .iloc picks the off-diagonal correlation
        pearson = dfdata.corr('pearson').iloc[0, 1]
        spearman = dfdata.corr('spearman').iloc[0, 1]
        pearsonLog = np.log10(dfdata).corr('pearson').iloc[0, 1]
        data['Pearson'].append(pearson)
        data['Pearson_Log'].append(pearsonLog)
        data['Spearman'].append(spearman)

        # AUROC/AUPR based on the 8-mer discovery vector
        dvdata = self._dvs[tf_index]
        r = ROCDiscovery(dvdata.values)
        rocdata = r.get_statistics()
        data['AUROC_8mer'].append(r.compute_auc(roc=rocdata))
        data['AUPR_8mer'].append(r.compute_aupr(roc=rocdata))

        # AUROC/AUPR based on the probe discovery vector
        dvdata = self._dvps[tf_index]
        r = ROCDiscovery(dvdata.values)
        rocdata = r.get_statistics()
        data['AUROC_probe'].append(r.compute_auc(roc=rocdata))
        data['AUPR_probe'].append(r.compute_aupr(roc=rocdata))
        pb.animate(tf_index)

    df = pd.DataFrame(data)
    df = df[['Pearson', 'Spearman', 'Pearson_Log', 'AUROC_8mer',
             'AUPR_8mer', 'AUROC_probe', 'AUPR_probe']]
    return df
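# Cross-check sketch (an addition, not part of the original module).
# ROCDiscovery presumably consumes a 0/1 discovery vector ordered from best-
# to worst-ranked item; scikit-learn can compute comparable metrics from that
# ranking. Note average_precision_score is a step-wise AUPR estimate, so it
# may differ slightly from a trapezoidal computation.
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score

def check_auc(discovery_vector):
    """discovery_vector: 0/1 array ordered from best to worst rank."""
    y_true = np.asarray(discovery_vector)
    y_score = -np.arange(len(y_true))  # higher score = better rank
    # roc_auc_score requires both classes to be present in y_true
    return (roc_auc_score(y_true, y_score),
            average_precision_score(y_true, y_score))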
def _preprocessing(self):
    """Create temporary files before further processing

    :return: nothing
    """
    # Read the octomers gold standard file
    filename = self.directory + os.sep + '8mers_gs.txt'
    self.octomers_gs = pd.read_csv(filename, sep='\t', header=None)

    # Read the octomers file; contains the reverse complement as well
    filename = self.directory + os.sep + 'all_8mers.txt'
    self.octomers = pd.read_csv(filename, sep='\t', header=None)
    self.octomers.columns = ['octomer', 'octomerRC']

    # Read the probes gold standard
    filename = self.directory + os.sep + 'probe35_gs.txt'
    self.probes_gs = pd.read_csv(filename, header=None, sep='\t')
    self.probes_gs.columns = ['Id', 'Sequence']

    # Read the probes (sequences)
    print('Reading probes')
    filename = self.directory + os.sep + 'probes35.txt'
    # just one column so no need for a separator
    probes = pd.read_csv(filename)

    # Extract information (first and third column of pred.txt)
    df = self.user_data[['TF_Id', 'Signal_Mean']].copy()
    df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))

    # data.txt is the paste of probes35.txt and val.txt
    data = pd.concat([probes, df], axis=1)

    # For each TF, create a probes/TF_X table that contains the sequence
    # from the gold standard and the answer from the user
    print('Creating probes/TF_X + sorting')
    pb = progress_bar(self.Ntf, interval=1)
    for i in range(1, self.Ntf + 1):
        # could use a groupby here? faster maybe
        tag = 'TF_%s' % i
        sequence = data[['Sequence']][self.gs.Id == tag]
        answer = data.Signal_Mean[data.TF_Id == tag]
        df = pd.concat([sequence, answer], axis=1)
        try:
            df.sort_values(by=['Signal_Mean', 'Sequence'],
                           ascending=[False, False], inplace=True)
        except AttributeError:
            # fallback for old pandas versions (<0.17)
            df.sort(columns=['Signal_Mean', 'Sequence'],
                    ascending=[False, False], inplace=True)
        df['Signal_Mean'] = df['Signal_Mean'].map(lambda x: round(x, 6))
        self._probes[i] = df
        pb.animate(i)
def get_jaccard(self, progress=True):
    # Note: jaccard_similarity_score was deprecated in scikit-learn 0.21
    # and later removed; see jaccard_score for binary vectors.
    import sklearn.metrics
    from easydev import progress_bar

    N = len(self.df)
    J = np.zeros((N, N))
    pb = progress_bar(N)
    for ic, i in enumerate(self.df.index):
        for jc, j in enumerate(self.df.index):
            J[ic][jc] = sklearn.metrics.jaccard_similarity_score(
                self.df.loc[i], self.df.loc[j])
        if progress:
            pb.animate(1 + ic)
    return J
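# Vectorized alternative (a sketch, assuming self.df holds binary 0/1 rows).
# On 1-d binary vectors the deprecated sklearn helper actually reduced to
# simple agreement (accuracy); the helper below computes the set-based
# Jaccard index |A & B| / |A | B|, which is usually what is meant.
import numpy as np

def jaccard_matrix(X):
    """Pairwise Jaccard similarity between the binary rows of X (n x m)."""
    X = np.asarray(X, dtype=bool).astype(int)
    inter = X @ X.T                                        # |A & B|
    row_sums = X.sum(axis=1)
    union = row_sums[:, None] + row_sums[None, :] - inter  # |A | B|
    with np.errstate(divide='ignore', invalid='ignore'):
        J = np.where(union > 0, inter / union, 1.0)
    return J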
def get_null_timecourse_model1(self, N=10000):
    data = self._get_random_timecourse_model1(N=N)
    distances = []
    from easydev import progress_bar
    pb = progress_bar(N)
    for i in range(0, N):
        df = data[:, :, i]
        # FIXME: those values 10, 39 should not be hardcoded
        distance = self._compute_score_timecourse_model1(df, 10, 39)
        distances.append(distance)
        pb.animate(i + 1)
    return distances
def analyse(self):
    models = self.simulator.results.models
    self.truth_tables = {}
    from easydev import progress_bar
    pb = progress_bar(len(models.df))
    for i, index in enumerate(models.df.index):
        # keep only the reactions that are "on" in this model
        row = models.df.loc[index]
        reactions = list(row[row == 1].index)
        self.simulator.simulate(reactions=reactions)
        tt = self.simulator.simulated[self.simulator.time].flatten()
        self.truth_tables[index] = tt
        pb.animate(i + 1)
def _load_complexes(self, show_progress=True):
    import time
    from easydev import progress_bar
    pb = progress_bar(len(self.df.complexAC))
    complexes = {}
    self.logging.info("Loading all details from the IntactComplex database")
    for i, identifier in enumerate(self.df.complexAC):
        res = self.webserv.details(identifier)
        complexes[identifier] = res
        if show_progress:
            pb.animate(i + 1, time.time() - pb.start)
    self._complexes = complexes
def get_null_parameters_model1(self, N=10000):
    """Returns the score distribution (parameter model1)"""
    df = self._get_random_parameters_model1(N=N)
    distances = []
    from easydev import progress_bar
    pb = progress_bar(N)
    for i in range(0, N):
        df1 = df.iloc[i].to_frame(name='values')
        distance = self._compute_score_model1_parameters(df1)
        distances.append(distance)
        pb.animate(i + 1)
    return distances
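# A null distribution like this one is typically turned into an empirical
# p-value. Minimal sketch (the helper and the smaller-is-better convention
# are assumptions, not part of the original code):
import numpy as np

def empirical_pvalue(observed, null_distances, smaller_is_better=True):
    """Fraction of null scores at least as good as the observed score.

    The +1 terms give the usual bias-corrected estimate, so the p-value
    is never exactly zero.
    """
    null = np.asarray(null_distances)
    if smaller_is_better:
        hits = (null <= observed).sum()
    else:
        hits = (null >= observed).sum()
    return (hits + 1) / (len(null) + 1)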
def compute_distances(self, N=100, show=True, progress=True):
    self.init()
    from easydev import progress_bar
    distances = []
    pb = progress_bar(N)
    for i in range(0, N):
        self.swap(1, inplace=True)
        dist = self.get_distance(self.graph)
        distances.append(dist)
        if progress:
            pb.animate(i + 1)
    if show is True:
        import pylab
        pylab.plot(distances)
        pylab.grid(True)
    return distances
def exhaustive(self):
    from cno.optimisers.binary_tools import permutations
    from easydev import progress_bar

    # enumerate all 2**N binary strings over the N reactions
    scores = []
    sizes = []
    N = len(self.model.reactions)
    pb = progress_bar(2 ** N)
    for i, this in enumerate(permutations(N)):
        self.simulate(self.parameters2reactions(this))
        scores.append(self.score())
        sizes.append(sum(this))
        pb.animate(i + 1)
    # self._fill_results()
    self.scores = scores
    self.sizes = sizes
    return scores
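# permutations(N) from cno.optimisers.binary_tools presumably yields all
# binary vectors of length N. If it is unavailable, an equivalent generator
# (a sketch, not the original implementation) is:
from itertools import product

def binary_permutations(N):
    """Yield every length-N tuple of 0/1 values (2**N of them)."""
    return product((0, 1), repeat=N)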
def compute_gtts(self):
    print("init R library")
    self._init()
    N = len(self.models)
    from easydev import progress_bar
    pb = progress_bar(N)
    d = {}
    for i in range(0, N):
        res = np.array(self._get_sim(self.models.df.iloc[i].values))
        d[i] = res
        pb.animate(i + 1)
    df = pd.DataFrame(d).transpose()
    # group models that share an identical simulated output (same GTT)
    grouped = df.groupby(list(df.columns))
    pylab.hist([len(this) for this in grouped.groups.values()], 100)
    res = {'df': df, 'simulation': d, 'grouped': grouped}
    self.gtts = res
    return self.gtts
def _split_data(self, precision=6):
    """precision ensures the same results as in the original Perl script"""
    mask = self.gs.Flag == 0
    self.user_data_clean = self.user_data[mask].copy()
    print('Splitting the user data set and removing flagged data (%s out of %s)'
          % (self.gs.shape[0] - mask.sum(), self.gs.shape[0]))
    self.gs_clean = self.gs[mask].copy()

    # local aliases
    gs = self.gs_clean
    user_data = self.user_data_clean

    pb = progress_bar(self.Ntf, interval=1)
    for tf_index in range(1, self.Ntf + 1):
        this_tf = 'TF_%s' % tf_index
        tf_gs = gs.query("Id == @this_tf").Answer
        tf_user = user_data.query("TF_Id == @this_tf").Signal_Mean
        df = pd.concat([tf_gs, tf_user], axis=1)
        df.to_csv(self._setfile(tf_index, 'Data'), index=False, sep='\t',
                  header=False, float_format="%f")
        pb.animate(tf_index)
def run(self, N=10, nswap=20, verbose=True, maxstallgen=50, maxtime=60):
    # creates the model, preprocessed
    self.sim = steady.Steady(self.real.cnograph, self.real.midas)
    self.sim.preprocessing()
    from easydev import progress_bar
    pb = progress_bar(N)
    for i in range(0, N):
        self.sim = steady.Steady(self.real.cnograph, self.real.midas)
        self.sim.preprocessing()
        self.sim.model.swap_edges(nswap)
        # re-preprocess after the edges have been swapped
        self.sim.preprocessing()
        self.sim.optimise(verbose=verbose, reuse_best=False,
                          maxstallgen=maxstallgen, maxtime=maxtime)
        score = self.sim.results.results.best_score[-1]
        self.best_scores.append(score)
        pb.animate(i + 1)
def clean_models(self, tolerance=0.1):
    models = self.results.models.copy()
    models.midas = self.midas
    print("Found %s models within the tolerance" % len(models.df))
    models.drop_duplicates()
    print("Removing duplicates, found %s" % len(models.df))
    models.drop_scores_above(tolerance=tolerance)
    print("Keeping within tolerance, found %s" % len(models.df))

    from easydev import progress_bar
    pb = progress_bar(len(models))
    count = 0
    changed = 0
    for index in models.df.index:
        count += 1
        reactions = list(models.df.columns[models.df.loc[index] == 1])
        self.simulate(reactions)
        score = self.score()
        # if models.scores[index] != score:
        #     print(index, models.scores[index], score)

        # compute essentiality to simplify models
        dummy, newr = self.essentiality(reactions, show=False)
        self.simulate(newr)
        new_score = self.score()
        if new_score <= score:
            # keep that pruned model
            models.df.loc[index] = self.reactions2parameters(newr)
            models.scores.loc[index] = new_score
            changed += 1
        else:
            # keep the original model
            pass
        pb.animate(count)

    print('Simplified %.1f %% of the models'
          % (100.0 * changed / len(models.df)))
    models.drop_duplicates()
    print("Removing duplicates, found %s" % len(models.df))
    models.drop_scores_above(tolerance=tolerance)
    print("Keeping within tolerance, found %s" % len(models.df))
    return models
def plot_average_distance(self, repeat=10, N=100):
    import pandas as pd
    import pylab
    from easydev import progress_bar

    distances = []
    pb = progress_bar(repeat)
    for i in range(0, repeat):
        distance = self.compute_distances(N=N, show=False, progress=False)
        distances.append(distance)
        pb.animate(i + 1)

    df = pd.DataFrame(distances)
    pylab.clf()
    # mean +/- one standard deviation envelope across repeats
    pylab.fill_between(range(0, N), df.mean() + df.std(),
                       y2=df.mean() - df.std())
    pylab.plot(df.mean(), 'r', lw=2)
    pylab.grid(True)
    pylab.ylim([0, 1])
    pylab.ylabel('similarity')
    return distances
def download_all_data(self):
    """Download all large data sets from Synapse"""
    pb = progress_bar(5)

    # load the large gold standard file from the D5C2 Synapse main page
    filename = self._download_data('DREAM5_GoldStandard_probes.zip',
                                   'syn2898469')
    pb.animate(1)
    z = ZIP()
    z.loadZIPFile(filename)
    data = z.read('Answers.txt')
    self.gs = pd.read_csv(BytesIO(data), sep='\t')

    # download 4 other files from the dreamtools Synapse project
    self._download_data('all_8mers.txt', 'syn4483185')
    pb.animate(2)
    self._download_data('8mers_gs.txt', 'syn4483187')
    pb.animate(3)
    self._download_data('probe35_gs.txt', 'syn4483184')
    pb.animate(4)
    self._download_data('probes35.txt', 'syn4483183')
    pb.animate(5)
def run(self, eval_func, N, nswap=3, proposal=None):
    self.Nparameters = N
    results = Results(N=N, step=1)
    t1 = time.time()
    self.alpha = []

    if proposal is None:
        proposal_parameter = [1] * self.Nparameters
    else:
        proposal_parameter = proposal[:]
    init_bs = proposal_parameter[:]

    # compute the score for the initial bitstring
    prev_score = eval_func(proposal_parameter)
    prev_bs = init_bs[:]
    best_score = prev_score
    results['best_score'] = best_score
    best_parameters = init_bs[:]

    # store the initial values
    results['scores'].append(prev_score)
    results['parameters'].append(prev_bs)

    from easydev import progress_bar
    pb = progress_bar(self.N)
    for i in range(0, self.N):
        proposal_parameter = self.swaps(best_parameters, nswap)
        # tup_param = tuple(proposal_parameter)
        # if tup_param in self._buffer.keys():
        #     proposal_score = self._buffer[tup_param]
        # else:
        #     proposal_score = eval_func(proposal_parameter)
        #     self._buffer[tup_param] = proposal_score
        proposal_score = eval_func(proposal_parameter)

        # the best score is the smallest one, so alpha > 1 means the
        # new proposal is better
        alpha = prev_score / proposal_score
        self.alpha.append(alpha)
        if alpha >= 1:
            prev_score = proposal_score
            score = proposal_score
            results['parameters'].append(proposal_parameter)
            prev_bs = proposal_parameter[:]
            accepted = 1
        else:
            r = random.uniform(0, 1)
            if r <= alpha:
                prev_score = proposal_score
                score = proposal_score
                results['parameters'].append(proposal_parameter)
                prev_bs = proposal_parameter[:]
                accepted = 1
            else:
                score = prev_score
                results['parameters'].append(prev_bs)
                accepted = 0
        self.acceptance.append(accepted)
        results['scores'].append(score)

        if score < best_score:
            best_score = score
            best_parameters = proposal_parameter[:]
        results['best_scores'].append(best_score)
        results['best_score'] = best_score
        pb.animate(i + 1)  # progress bar only

    # remove the initial values so both lists have a length of N
    del results['scores'][0]
    del results['parameters'][0]
    results['best_parameters'] = best_parameters[:]
    # store the index of the minimum score from the best_scores list
    results['min_index'] = numpy.argmin(results['best_scores'])

    t2 = time.time()
    print("simulation took %s seconds." % (t2 - t1))
    self.results = results.copy()
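# The acceptance step above is a Metropolis-like rule for minimisation:
# improvements (alpha = prev/new >= 1) are always accepted, and degradations
# are accepted with probability alpha. A self-contained sketch of just that
# rule (a hypothetical helper, not part of the original class):
import random

def accept(prev_score, proposal_score):
    """Return True if the proposal should be accepted (smaller is better)."""
    alpha = prev_score / proposal_score
    return alpha >= 1 or random.uniform(0, 1) <= alpha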
def _processing(self):
    """Create the Out/TF_XX files and the discovery vectors (DV/DVP)."""
    # ------------------------------- 1. Create the Out/TF_XX.dat files
    octomers = self.octomers.octomer
    octomersRC = self.octomers.octomerRC
    mapping1 = dict(zip(octomers.values, octomersRC.values))
    mapping2 = dict(zip(octomersRC.values, octomers.values))
    keys = tuple(sorted(octomers.values))
    lm = set(octomers.values)

    pb = progress_bar(self.Ntf, interval=1)
    pb.animate(0)
    for tf_index in range(1, self.Ntf + 1):
        tf = self._probes[tf_index]
        tf.columns = ['Sequence', 'Score']
        ids = collections.defaultdict(list)
        # TODO: most of the time is spent in the "for curR in generator" loop
        for seq, score in zip(tf.Sequence, tf.Score):
            # scan the sequence by chunks of octomers using a generator
            # for speed (although the gain is small)
            generator = (seq[i:i + 8] for i in range(0, 28))
            # using a set (rather than dict keys) does help speeding up
            # the code: map any 8-mer not in the set onto its canonical form
            for curR in generator:
                if curR not in lm:
                    curR = mapping2[curR]
                ids[curR].append(score)

        # now let us build the new dataframe for the indices found
        df = pd.DataFrame({0: list(ids.keys()),
                           1: [np.median(v) for v in ids.values()]})
        try:
            df.sort_values(by=[1, 0], ascending=[False, False], inplace=True)
        except AttributeError:
            # fallback for old pandas versions (<0.17)
            df.sort(columns=[1, 0], ascending=[False, False], inplace=True)
        df[1] = df[1].map(lambda x: round(x, 6))
        df.to_csv(self._setfile(tf_index, 'Out'), sep=' ', index=False,
                  header=False, float_format="%.6f")
        pb.animate(tf_index)

    # ------------------------------- 2. Create the DVP
    pb = progress_bar(self.Ntf, interval=1)
    for tf_index in range(1, self.Ntf + 1):
        tag = 'TF_%s' % tf_index
        tf_probes = list(self.probes_gs.loc[
            self.probes_gs.groupby('Id').groups[tag]].Sequence)
        tf = self._probes[tf_index]
        dv = tf.Sequence.apply(lambda x: x in tf_probes).astype(int)
        self._dvps[tf_index] = dv
        pb.animate(tf_index)
    print("")

    # ------------------------------- 3. Create the DV
    gs_octomers = self.octomers_gs.copy()
    gs_octomers.columns = ['id', 'octomer']
    pb = progress_bar(self.Ntf, interval=1)
    for tf_index in range(1, self.Ntf + 1):
        tag = 'TF_%s' % tf_index
        tf_octomers = list(gs_octomers.loc[
            gs_octomers.groupby('id').groups[tag]].octomer)
        tf = pd.read_csv(self._setfile(tf_index, "Out"), sep=" ",
                         header=None)
        tf.columns = ['Octomer', 'Score']
        dv = tf.Octomer.apply(lambda x: x in tf_octomers).astype(int)
        # stores the dataframe
        self._dvs[tf_index] = dv
        pb.animate(tf_index)
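# The 8-mer/reverse-complement mapping is read from all_8mers.txt above; it
# could also be computed directly (a sketch, not what the original code does):
_COMPLEMENT = str.maketrans('ACGT', 'TGCA')

def reverse_complement(seq):
    """Reverse complement of a DNA 8-mer, e.g. 'ACGTAACG' -> 'CGTTACGT'."""
    return seq.translate(_COMPLEMENT)[::-1]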