def forMatthieu2(file_features='../resultData/features_on_films/all_distances_whole_dataonly.pkl',
                 file_labels='../resultData/features_on_films/labelsKM_whole_k8.pkl',
                 file_coordinates='../resultData/features_on_films/coordinates_1000first.pkl',
                 out_feature_file='../resultData/features_on_films/traj_cluster{}_trajectory_features.csv',
                 out_coordinate_file='../resultData/features_on_films/traj_cluster{}_trajectory_coordinates.csv'):
    f = open(file_features)
    data = pickle.load(f); f.close()
    data = data[0][:356905]
    r = np.hstack((data[:, :len(featuresNumeriques)],
                   data[:, featuresSaved.index('mean persistence'), np.newaxis],
                   data[:, featuresSaved.index('mean straight'), np.newaxis]))

    f = open(file_labels)
    labels = pickle.load(f); f.close()
    labels = labels[0][:356905]

    f = open(file_coordinates)
    coordinates = np.array(pickle.load(f)); f.close()

    for i, k in enumerate([7, 0, 3, 5, 4, 2, 6, 1]):  # cluster order used for the ISBI paper
        where_ = np.where(np.array(labels) == k)[0]
        np.random.shuffle(where_)

        # write the features of up to 1,000 randomly sampled trajectories of this cluster
        f = open(out_feature_file.format(i), 'w')
        writer = csv.writer(f)
        writer.writerows(r[where_[:1000]])
        f.close()

        # write the corresponding (x, y) coordinates
        f = open(out_coordinate_file.format(i), 'w')
        writer = csv.writer(f)
        for el in coordinates[where_[:1000]]:
            writer.writerow(zip(el[:, 0], el[:, 1]))
        f.close()
def findingMeans(exp_list, feature1, feature2, folder='/share/data20T/mitocheck/tracking_results/',
                 filename="hist_tabFeatures_{}.pkl", plot=True, result=None):
    '''
    For interesting plots: getting means and medians for a couple of features,
    then getting the wells with extreme values
    '''
    exp_list = np.array(exp_list)  # needed for fancy indexing below
    if result is None:
        result = np.zeros(shape=(len(exp_list), 4))
        for k, exp in enumerate(exp_list):
            pl, w = exp
            print k
            try:
                f = open(os.path.join(folder, pl, filename.format(w)))
                arr, _, _ = pickle.load(f)
                f.close()
            except IOError:
                print "no ", pl, w
            else:
                result[k] = [nanmean(arr[:, featuresSaved.index(feature1)]),
                             nanmedian(arr[:, featuresSaved.index(feature1)]),
                             nanmean(arr[:, featuresSaved.index(feature2)]),
                             nanmedian(arr[:, featuresSaved.index(feature2)])]
        f = open('result__{}{}.pkl'.format(feature1, feature2), 'w')
        pickle.dump(result, f); f.close()

    if plot:
        f = p.figure()
        ax = f.add_subplot(121)
        ax.scatter(result[:, 0], result[:, 2]); ax.set_title('Means')
        ax.set_xlabel(feature1); ax.set_ylabel(feature2)
        ax.axhline(scoreatpercentile(result[:, 2], 90)); ax.axhline(scoreatpercentile(result[:, 2], 10))
        ax.axvline(scoreatpercentile(result[:, 0], 90)); ax.axvline(scoreatpercentile(result[:, 0], 10))
        ax = f.add_subplot(122)
        ax.scatter(result[:, 1], result[:, 3]); ax.set_title('Medians')
        ax.set_xlabel(feature1); ax.set_ylabel(feature2)
        ax.axhline(scoreatpercentile(result[:, 3], 90)); ax.axhline(scoreatpercentile(result[:, 3], 10))
        ax.axvline(scoreatpercentile(result[:, 1], 90)); ax.axvline(scoreatpercentile(result[:, 1], 10))
        p.show()

    # wells in the four extreme corners of the (mean(feature1), mean(feature2)) plane
    mean_result = [0, 0, 0, 0]
    mean_result[1] = exp_list[np.where((result[:, 0] > scoreatpercentile(result[:, 0], 90)) & (result[:, 2] > scoreatpercentile(result[:, 2], 90)))]
    mean_result[2] = exp_list[np.where((result[:, 0] > scoreatpercentile(result[:, 0], 90)) & (result[:, 2] < scoreatpercentile(result[:, 2], 10)))]
    mean_result[0] = exp_list[np.where((result[:, 0] < scoreatpercentile(result[:, 0], 10)) & (result[:, 2] > scoreatpercentile(result[:, 2], 90)))]
    mean_result[3] = exp_list[np.where((result[:, 0] < scoreatpercentile(result[:, 0], 10)) & (result[:, 2] < scoreatpercentile(result[:, 2], 10)))]
    return mean_result
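# Hedged usage sketch for findingMeans: the plate/well identifiers below are
# placeholders, and the two feature names may be any entries of featuresSaved.
def _findingMeansExample():
    sample_experiments = [('PLATE_A', 'WELL_1'), ('PLATE_B', 'WELL_2')]
    corners = findingMeans(sample_experiments, 'entropy1', 'mean straight', plot=False)
    # corners[1]: wells above the 90th percentile for both feature means,
    # corners[3]: wells below the 10th percentile for both
    return corners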
def getData(self, histDataAsWell):
    '''
    Builds, over all experiments in self.expList, the histogram of all
    numerical features
    '''
    histDict = defaultdict(list)
    _, r, _, _, _, length, _, _, _ = histConcatenation(self.settings.data_folder, self.expList,
                                                       self.settings.mitocheck_file,
                                                       self.settings.quality_control_file,
                                                       verbose=self.verbose)
    for feature in self.currInterestFeatures:
        for i in range(len(length)):
            # rows of r are grouped by experiment; length[i] is the block size of experiment i
            histDict[feature].append(r[np.sum(length[:i]):np.sum(length[:i + 1]), featuresSaved.index(feature)])

    histogrammes, bins = computingBins(histDict, [self.bin_size for k in range(len(self.currInterestFeatures))],
                                       self.bin_type, iter_=self.iter_)
    return histogrammes, bins
def _findNumerical(self, data, percentile, how_many):
    index = featuresSaved.index(self.feature_name)

    # trajectories in the lower and upper tails of the feature distribution
    scoreD = scoreatpercentile(data[:, index], percentile)
    down = np.where(data[:, index] <= scoreD)
    scoreU = scoreatpercentile(data[:, index], 100 - percentile)
    up = np.where(data[:, index] >= scoreU)

    # "control" trajectories: those between the 48th and 52nd percentiles
    med0, med1 = scoreatpercentile(data[:, index], 48), scoreatpercentile(data[:, index], 52)
    ctrl0 = np.where(data[:, index] >= med0)
    ctrl1 = np.where(data[:, index] <= med1)
    ctrl = np.intersect1d(ctrl0[0], ctrl1[0])

    if self.verbose:
        print "Feature {}".format(self.feature_name)
        print "{} retrieved, percentile {}, value {}".format(up[0], 100 - percentile, scoreU)
        print "{} retrieved, percentile {}, value {}".format(down[0], percentile, scoreD)
        print "{} retrieved, percentiles 48-52, values {} - {}".format(ctrl, med0, med1)
    return down[0], up[0], ctrl
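# A minimal sketch (illustrative values only) of the selection rule above:
# extremes are the trajectories below/above the given percentiles, while
# controls sit between the 48th and 52nd percentiles.
#     values = np.random.randn(1000)
#     down = np.where(values <= scoreatpercentile(values, 5))[0]
#     up = np.where(values >= scoreatpercentile(values, 95))[0]
#     ctrl = np.where((values >= scoreatpercentile(values, 48)) &
#                     (values <= scoreatpercentile(values, 52)))[0]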
def wellToPlot(self, length, who, all_, outputFolder, median=False):
    assert getpass.getuser() != 'lalil0u', 'You are not running this on the right computer'

    index = featuresSaved.index(self.feature_name)
    folder = '/share/data20T/mitocheck/tracking_results'
    basename = 'traj'
    resultCour = []; sizes = [0]
    valCour = []
    #if median: basename+='_median'
    for el in all_:
        trajectories = all_[el]
        print el
        for plate, well in trajectories:
            print plate, well
            if self.verbose:
                print "Taking care of plate {}, well {}".format(plate, well)
            f = open(os.path.join(folder, plate, 'hist_tabFeatures_{}.pkl'.format(well)))
            tab, coord, _ = pickle.load(f); f.close()
            resultCour.extend(np.array(coord)[trajectories[(plate, well)]])
            valCour.extend(tab[trajectories[(plate, well)], index])
        # record where the trajectories of this group end, for the plotting loop below
        sizes.append(len(resultCour))

    # align all trajectories on their starting point
    XX = []; YY = []
    for k in range(len(resultCour)):
        X = np.array(resultCour[k][1]); X -= X[0]; XX.append(X)
        Y = np.array(resultCour[k][2]); Y -= Y[0]; YY.append(Y)
    minx, maxx = min([min(X) for X in XX]), max([max(X) for X in XX])
    miny, maxy = min([min(Y) for Y in YY]), max([max(Y) for Y in YY])

    # saving trajectories
    f = open('trajectories_{}.pkl'.format(self.feature_name), 'w')
    pickle.dump([all_, resultCour, valCour, sizes], f)
    f.close()

    # plot at most 50 aligned trajectories per group, on common axes
    for i, el in enumerate(all_):
        for k in range(sizes[i], min(sizes[i] + 50, sizes[i + 1])):
            plotAlignedTraj(XX[k], YY[k], minx, maxx, miny, maxy, show=False,
                            name=os.path.join(outputFolder, '{}_{}_{}_{}.png'.format(basename, el, self.feature_name, k)),
                            val=valCour[k])
    return 1  #_writeXml(plate, well, resultCour)
def collectingData(iter_, expList, debut, fin):
    folder = "/cbio/donnees/aschoenauer/workspace2/Xb_screen/resultData/experiment_clustering/"
    histDict = defaultdict(list)
    _, r, _, who, ctrlStatus, length, genes, siRNAs, _ = histConcatenation(
        "/share/data20T/mitocheck/tracking_results",
        expList[debut:fin],
        "/cbio/donnees/aschoenauer/workspace2/Xb_screen/data/mitocheck_siRNAs_target_genes_Ens72.txt",
        "/cbio/donnees/aschoenauer/workspace2/Xb_screen/data/qc_export.txt",
    )
    for i in range(len(length)):
        for k, feature in enumerate(interestFeatures):
            histDict[feature].append(r[np.sum(length[:i]):np.sum(length[:i + 1]), featuresSaved.index(feature)])

    f = open("../resultData/experiment_clustering/distExp_ctrl_quantile_10.pkl")
    bins = pickle.load(f)
    f.close()
    histogrammes, bins = computingBins(histDict, [10 for k in range(16)], "quantile", previous_binning=bins)

    f = open(os.path.join(folder, "data_{}.pkl".format(iter_)), "w")
    pickle.dump((histogrammes, who, ctrlStatus, genes, siRNAs), f)
    f.close()
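# A minimal, runnable sketch of the block-slicing pattern used in collectingData
# (and in getData above): per-experiment blocks of a concatenated feature matrix
# are recovered from the 'length' vector. All names here are illustrative.
def _blockSlicingSketch():
    toy_r = np.arange(12).reshape(6, 2)  # 6 trajectories, 2 features
    toy_length = [2, 3, 1]               # number of trajectories per experiment
    offsets = np.concatenate(([0], np.cumsum(toy_length)))
    # blocks[i] holds the rows of experiment i, exactly like
    # r[np.sum(length[:i]):np.sum(length[:i+1])] in the functions above
    return [toy_r[offsets[i]:offsets[i + 1]] for i in range(len(toy_length))]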
def _dataPrep(self, pcaParameter):
    histDict = defaultdict(list)

    # add a random 20% of control experiments to the experiment list
    ctrlExp = appendingControl(self.expList)
    ctrlExp = countingDone(ctrlExp)
    np.random.shuffle(ctrlExp)
    ctrlExp = ctrlExp[:int(0.2 * len(self.expList))]
    if self.verbose:
        print ctrlExp
    self.expList.extend(ctrlExp)

    _, r, _, _, _, length, _, _, _ = histConcatenation(self.settings.data_folder, self.expList,
                                                       self.settings.mitocheck_file,
                                                       self.settings.quality_control_file,
                                                       verbose=self.verbose)
    for i in range(len(length)):
        for k, feature in enumerate(self.currInterestFeatures):
            histDict[feature].append(r[np.sum(length[:i]):np.sum(length[:i + 1]), featuresSaved.index(feature)])

    # reuse the binning previously computed on control experiments
    f = open(os.path.join(self.settings.result_folder,
                          "distExp_ctrl_{}_{}.pkl".format(self.bins_type, self.bin_size)))
    bins = pickle.load(f)
    f.close()

    histogrammes, bins = computingBins(histDict,
                                       [self.bin_size for k in range(len(self.currInterestFeatures))],
                                       self.bins_type, previous_binning=bins)
    print histogrammes.shape
    return histogrammes, bins
def replotHeatmap(folder, data_filename, indices, outputfile, action='hierarchical', level=0.4, trad=False,
                  num_clusters=8, labels_filename='labelsKM_whole_k{}.pkl',
                  pcaed_filename='all_distances_whole2_pcaed.pkl', with_timelength=False):
    f = open(os.path.join(folder, data_filename))
    data = pickle.load(f); f.close()
    r = data[0]

    # set featuresToKeep to None to use all numerical features plus persistence and straightness
    featuresToKeep = ['effective space length', 'mean squared displacement', 'entropy1',
                      'diffusion adequation', 'movement type', 'signed turning angle',
                      'corrected straightness index', 'mean straight']
    if featuresToKeep is not None:
        r = np.hstack([r[:, featuresSaved.index(feat), np.newaxis] for feat in featuresToKeep])
        features = featuresToKeep
    else:
        features = list(featuresNumeriques)
        features.append('mean persistence'); features.append('mean straight')
        r = np.hstack((r[:, :len(featuresNumeriques)],
                       r[:, featuresSaved.index('mean persistence'), np.newaxis],
                       r[:, featuresSaved.index('mean straight'), np.newaxis]))
    fL = list(features)

    if with_timelength:
        time_length = data[-1]  # assuming time lengths are stored as the last element of the pickled tuple
        r = np.hstack((r, np.array(time_length)[:, np.newaxis])); fL.append('time length')

    mean_ = np.mean(r, 0)
    for k, feature in enumerate(features):
        print feature, mean_[k]
    nr = (r - mean_) / np.std(r, 0)
    #nr=np.hstack((nr, indices[:,np.newaxis])); fL.append('hierarchical cluster')
    small_nr = None

    if action == 'hierarchical':
        num_clusters = len(np.bincount(indices)); begin_ = 1
        print 'Going for hierarchical clustering (ward, euclidean) with {} clusters'.format(num_clusters - 1)
    elif action == 'kmeans':
        print 'Going for mini batch k-means with {} clusters'.format(num_clusters); begin_ = 0
        try:
            f = open(os.path.join(folder, labels_filename.format(num_clusters)), 'r')
            labels, percentages, who, length = pickle.load(f); f.close()
        except IOError:
            print 'File Error ', os.path.join(folder, labels_filename.format(num_clusters))
        else:
            indices = labels
#        f=open(os.path.join(folder, pcaed_filename), 'r')
#        _,pcaed_data=pickle.load(f); f.close()
#
#        model = MiniBatchKMeans(n_clusters=num_clusters, batch_size = 2000, init='k-means++',n_init=1000,max_iter=1000, max_no_improvement=100, compute_labels = True)
#        indices = model.fit(pcaed_data[:,:7])
#        indices=indices.labels_

    for k in [7, 0, 3, 5, 4, 2, 6, 1]:  # cluster order used for the ISBI paper (but not for the small heatmap)
        where_ = np.where(np.array(indices) == k)[0]
        np.random.shuffle(where_)
        # sample up to 1,000 trajectories per cluster for the heatmap
        small_nr = np.vstack((small_nr, nr[where_[:1000]])) if small_nr is not None else nr[where_[:1000]]
    print small_nr.shape

    print 'Showing trajectory clusters'
    heatmap(small_nr.T, fL, range(small_nr.shape[0]), None, None, None, None,
            color_gradient='OrRd', filename=outputfile + 'TRAJ', trad=False, save=False)

    if action == 'kmeans':
#        num_experiments=np.where(np.array(genes)=='ctrl')[0][0]
        heatmap(percentages, who, range(begin_, num_clusters), 'ward', 'ward', 'euclidean', 'euclidean',
                color_gradient='red_white_blue', filename=outputfile + 'MOV', trad=False, save=False, level=level)
#        heatmap(percentages[num_experiments:], genes[num_experiments:],range(begin_, num_clusters), None, 'ward', None, 'euclidean',
#                color_gradient='red_white_blue', filename=outputfile+'CTRL', trad=False, save=False, level=level)
    return
def exploitingKMeans_wModel(model, data, mean, std, pca_std, pca, num_PC=7):
    # keep the numerical features plus 'mean straight', normalize, project onto
    # the first num_PC principal components, and count trajectories per cluster
    data = np.hstack((data[:, :len(featuresNumeriques)],
                      data[:, featuresSaved.index('mean straight'), np.newaxis]))
    pca_data = pca.transform((data - mean) / std) / pca_std
    return np.bincount(model.predict(pca_data[:, :num_PC]), minlength=8)
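# Hedged usage sketch for exploitingKMeans_wModel: the fitted MiniBatchKMeans
# 'model', the 'pca' object and the normalization statistics (mean, std,
# pca_std) are assumed to come from the training phase on the full dataset.
#     counts = exploitingKMeans_wModel(model, new_data, mean, std, pca_std, pca)
#     # counts[j] = number of trajectories of new_data assigned to cluster j, so
#     # counts / float(counts.sum()) gives the cluster distribution of the well.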
simulateur = TrajectorySimulator(settings_filename='tracking/settings/settings_simulator_14_10_20.py')
simulateur('simulated_trajectories', 0, options.movement_type_index,
           "hist_tabFeatures{}_W{}.pkl".format(options.iter, options.movement_type_index))

f = open('../resultData/simulated_traj/trajectories_for_clustering/hist_tabFeatures{}_W{}.pkl'.format(options.iter, options.movement_type_index))
tabFeatures, _, _ = pickle.load(f)
f.close()

r2 = histLogTrsforming(tabFeatures)
print r2.shape, 'not normalized'
r2 = r2[:, :-1]

if np.any(np.isnan(r2)):
    print 'Deleting nan values'
    r2 = np.delete(r2, np.where(np.isnan(r2))[0], 0)

#r=(r2-np.mean(r2,0))/np.std(r2,0)
r = np.hstack((r2[:, :len(featuresNumeriques)], r2[:, featuresSaved.index('mean straight'), np.newaxis]))
print r.shape
#print 'computing bins'
#histogrammeMatrix = computingBins(histNC, mat_hist_sizes[0])

print 'saving'
f = open('../resultData/simulated_traj/trajectories_for_clustering/data_sim_traj{}{}.pkl'.format(options.iter, options.movement_type_index), 'w')
pickle.dump(r, f); f.close()

#if __name__ == '__main__':
#    p=PlateSimulator(settings_filename="tracking/settings/settings_simulator_14_10_20.py")
#
#    p()