def data_preprocessing_descriptive(Extracted_Features,Coma_Features,Corrected_Features): lvltrace.lvltrace("LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive") tools.separate_coma(Extracted_Features,Coma_Features) for root, dirs, files in os.walk(Coma_Features): for i in files: if not i.startswith('.'): input_i=Coma_Features+i output_i=Corrected_Features+i lines=tools.file_lines(input_i) ncol=tools.file_col(input_i) if lines >= 2: file = open(output_i, "w") writer=csv.writer(file, lineterminator='\t') data = np.genfromtxt(input_i,delimiter=',') X = data[1:, 2:] neuron_type = np.genfromtxt(input_i,delimiter=',',dtype=None) y = neuron_type[:, 0] # (class) neuron_name = np.genfromtxt(input_i,delimiter=',',dtype=None) z = neuron_name[:, 1] # Neuron names features = np.genfromtxt(input_i,delimiter=',',dtype=None) w = features[0, :] # features names #Replace missing values 'nan' by column mean imp = Imputer(missing_values='NaN', strategy='mean', axis=0) imp.fit(X) Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0) # Output replacement "Nan" values Y=imp.transform(X) #print i #print Y.shape, y.shape,z.shape #print Y.shape[1] #################### for line in xrange(Y.shape[0]+1): for colonne in xrange(Y.shape[1]+2): if line == 0: if colonne == 0: file.write("%s\t"%y[line]) else: if colonne == 1: file.write("%s\t"%z[line]) else: file.write("%s\t"%w[colonne]) else: if colonne == 0: file.write("%s\t"%y[line]) else: if colonne == 1: file.write("%s\t"%z[line]) else: file.write("%f\t"%Y[line-1,colonne-2]) file.write("\n") ######################### else: print "Only one morphology !!!" file.close() lvltrace.lvltrace("LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive")
def separate_coma(input, output):
    """Rewrite every tab-separated CSV found under *input* as a
    comma-separated copy with the same file name under *output*.

    Hidden files (names starting with '.') are skipped.
    """
    for root, dirs, files in os.walk(input):
        for name in files:
            if name.startswith('.'):
                continue
            input_i = input + name
            output_i = output + name
            # +1: tools.file_lines is presumably 0-based on the last row --
            # kept as-is to preserve the original row count.
            lines = tools.file_lines(input_i) + 1
            ncol = tools.file_col(input_i)
            out = open(output_i, "w")
            try:
                # Note: row index no longer shadows the file-name loop variable.
                for row in xrange(lines):
                    for col in xrange(ncol):
                        out.write("%s," % tools.read_csv_tab(input_i, col, row))
                    out.write("\n")
            finally:
                out.close()
def preprocessing_module(Extracted_Features, Coma_Features, Corrected_Features, Norm, ontology):
    """Convert tab-separated feature files to comma-separated ones, replace
    categorical class names by integer codes, impute missing values and
    standardize the data.

    Parameters:
        Extracted_Features: directory of raw tab-separated feature files.
        Coma_Features: directory receiving the comma-separated copies.
        Corrected_Features: directory receiving imputed/standardized files.
        Norm: 'normalize' | 'binarize' | 'standardize' | anything else -> scale.
        ontology: path of the CSV mapping iteration/class/neuron name.

    Side effects: writes one file per input into Coma_Features and (when the
    input has more than 3 lines) Corrected_Features, plus the ontology file.
    """
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    class_number = 1
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    Iteration = 1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if i.startswith('.'):
                continue  # skip hidden files
            input_i = Extracted_Features + i
            output_i = Coma_Features + i
            out = open(output_i, "w")
            lines = tools.file_lines(input_i) + 1
            ncol = tools.file_col(input_i) - 1
            for line in xrange(lines):
                for col in xrange(ncol):
                    if line == 0:
                        # Header row: keep every column except the neuron name.
                        if col != 1:
                            out.write("%s," % tools.read_csv_tab(input_i, col, line))
                    elif col == 0:
                        # Replace the categorical class name by its integer code.
                        out.write("%i," % class_number)
                    elif col == 1:
                        # Neuron name goes to the ontology file, not the data file.
                        onto.write("%i,%s,%i,%s\n" % (Iteration, i, class_number,
                                                      tools.read_csv_tab(input_i, col, line)))
                        Iteration = Iteration + 1
                    else:
                        out.write("%s," % tools.read_csv_tab(input_i, col, line))
                out.write("\n")
            out.close()
            class_number = class_number + 1
            if lines > 3:
                input_file = Coma_Features + i
                # ncol-1 because the neuron-name column was dropped above.
                data = np.loadtxt(input_file, delimiter=',',
                                  usecols=range(ncol - 1), skiprows=1)
                X = data[:, :ncol]
                y = data[:, 0].astype(np.int)  # integer class labels
                # Replace missing values 'nan' by the column mean.
                imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                imp.fit(X)
                Y = imp.transform(X)
                # Data standardization, selected by Norm.
                if Norm == 'normalize':
                    Z = preprocessing.normalize(Y, axis=0, norm='l2')
                elif Norm == 'binarize':
                    Z = preprocessing.Binarizer().fit(Y).transform(Y)  # for Bernoulli NB
                elif Norm == 'standardize':
                    Z = preprocessing.MinMaxScaler().fit_transform(Y)  # rescale to [0,1]
                else:
                    Z = preprocessing.scale(Y)  # zero mean, unit variance
                output_file = Corrected_Features + i
                out = open(output_file, "w")
                try:
                    for line_1 in xrange(lines - 1):
                        for col_1 in xrange(ncol - 1):
                            if col_1 == 0:
                                out.write("%s," % y[line_1])
                            else:
                                out.write("%f," % Z[line_1, col_1])
                        out.write("\n")
                finally:
                    out.close()
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")
def data_preprocessing_descriptive(Extracted_Features, Coma_Features, Corrected_Features): lvltrace.lvltrace( "LVLEntree dans data_preprocessing_descriptive dans preproc_descriptive" ) tools.separate_coma(Extracted_Features, Coma_Features) for root, dirs, files in os.walk(Coma_Features): for i in files: if not i.startswith('.'): input_i = Coma_Features + i output_i = Corrected_Features + i lines = tools.file_lines(input_i) ncol = tools.file_col(input_i) if lines >= 2: file = open(output_i, "w") writer = csv.writer(file, lineterminator='\t') data = np.genfromtxt(input_i, delimiter=',') X = data[1:, 2:] neuron_type = np.genfromtxt(input_i, delimiter=',', dtype=None) y = neuron_type[:, 0] # (class) neuron_name = np.genfromtxt(input_i, delimiter=',', dtype=None) z = neuron_name[:, 1] # Neuron names features = np.genfromtxt(input_i, delimiter=',', dtype=None) w = features[0, :] # features names #Replace missing values 'nan' by column mean imp = Imputer(missing_values='NaN', strategy='mean', axis=0) imp.fit(X) Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0) # Output replacement "Nan" values Y = imp.transform(X) #print i #print Y.shape, y.shape,z.shape #print Y.shape[1] #################### for line in xrange(Y.shape[0] + 1): for colonne in xrange(Y.shape[1] + 2): if line == 0: if colonne == 0: file.write("%s\t" % y[line]) else: if colonne == 1: file.write("%s\t" % z[line]) else: file.write("%s\t" % w[colonne]) else: if colonne == 0: file.write("%s\t" % y[line]) else: if colonne == 1: file.write("%s\t" % z[line]) else: file.write("%f\t" % Y[line - 1, colonne - 2]) file.write("\n") ######################### else: print "Only one morphology !!!" file.close() lvltrace.lvltrace( "LVLSortie dans data_preprocessing_descriptive dans preproc_descriptive" )
def _write_stat_row(out, variable, class_name, floats, tail=()):
    """Write one descriptive-statistics CSV row: variable name, class name,
    the float cells ('%f'), then literal tail cells, each comma-terminated."""
    out.write("%s,%s," % (variable, class_name))
    for v in floats:
        out.write("%f," % v)
    for t in tail:
        out.write("%s," % t)
    out.write("\n")


def descriptive_multi_cores(data):
    """Compute per-feature descriptive statistics for one class file.

    data is a positional record -- inferred from usage, confirm at caller:
        data[1]: output directory, data[2]: class file name,
        data[3]: input CSV path, data[4]: indexable column-header names.

    Writes a CSV of statistics (N, mean, std, variance, max, min, coefficient
    of variation, interquartile range, Shapiro normality p-value, 95%
    confidence interval) and, for multi-row inputs, an mpld3 HTML plot of the
    fitted normal pdf of each normally distributed feature.
    """
    fig, ax = plt.subplots()
    lines = tools.file_lines(data[3])
    ncol = tools.file_col(data[3]) - 1
    outputs_files = data[1] + data[2]
    class_name = os.path.splitext(data[2])[0]
    header = ("Variables,Class,N,Mean,Std_Dev,Variance,Max,Min,Coeff_Var,"
              "Interquartile,Normality_distrib,Confident_intervals_left,"
              "Confident_intervals_right\n")
    out = open(outputs_files, "w")
    if lines == 1:
        # Single morphology: the value itself is mean/max/min, dispersion is 0
        # and the coefficient of variation is reported as the literal 100.
        out.write(header)
        for col in xrange(ncol - 2):
            a = float(tools.read_float_tab(data[3], col + 2, 1))
            _write_stat_row(out, data[4][col + 2], class_name,
                            [1, a, 0, 0, a, a], ["100", "0", "0", "0", "0"])
        out.close()
    else:
        out.write(header)
        a = [0 for x in xrange(lines)]
        for col in xrange(ncol - 2):
            for j in xrange(lines):
                a[j] = tools.read_float_tab(data[3], col + 2, j)
            basic = [float(len(a)), float(np.mean(a)),
                     float(np.std(a, dtype=float)), float(np.var(a, dtype=float)),
                     float(max(a)), float(min(a))]
            std_dev = basic[2]
            if np.around(std_dev, decimals=12) != 0.0:
                coeff_var = float(abs((np.std(a) / np.mean(a)) * 100))
                if float(len(a)) < 3:
                    # Too few samples for quartiles / Shapiro: pad with zeros.
                    _write_stat_row(out, data[4][col + 2], class_name,
                                    basic + [coeff_var], ["0", "0", "0", "0"])
                else:
                    X = sort(a)
                    upperQuartile = stats.scoreatpercentile(X, .75)
                    lowerQuartile = stats.scoreatpercentile(X, .25)
                    IQR = float(upperQuartile - lowerQuartile)
                    normality = stats.shapiro(a)
                    half_ci = 1.96 * (np.std(a) / math.sqrt(len(a)))
                    _write_stat_row(out, data[4][col + 2], class_name,
                                    basic + [coeff_var, IQR, float(normality[1]),
                                             np.mean(a) - half_ci,
                                             np.mean(a) + half_ci])
                    if normality[1] >= 0.05:
                        # Overlay the fitted normal pdf for normally
                        # distributed features (Shapiro p >= 0.05).
                        norm_distrib = np.linspace(-150, 150, 100)
                        ax.set_title('Normality of %s class features' % class_name)
                        ax.plot(norm_distrib,
                                mlab.normpdf(norm_distrib, np.mean(a),
                                             math.sqrt(np.var(a))),
                                label=data[4][col + 2], ms=10, alpha=0.3)
                        ax.legend(loc=2, ncol=1, bbox_to_anchor=(0, 0, 1, 0.7),
                                  fancybox=True, shadow=False, fontsize=5)
                        ax.grid(color='lightgray', alpha=0.5)
                        ax.patch.set_facecolor('white')
            else:
                # Zero variance: coefficient of variation and normality
                # p-value are reported as 0.
                X = sort(a)
                upperQuartile = stats.scoreatpercentile(X, .75)
                lowerQuartile = stats.scoreatpercentile(X, .25)
                IQR = float(upperQuartile - lowerQuartile)
                half_ci = 1.96 * (np.std(a) / math.sqrt(len(a)))
                _write_stat_row(out, data[4][col + 2], class_name,
                                basic + [0, IQR, 0,
                                         np.mean(a) - half_ci,
                                         np.mean(a) + half_ci])
        out.close()
        display_d3(fig)
        html = mpld3.fig_to_d3(fig)
        html_normality = data[1] + data[2] + ".html"
        normality_display = open(html_normality, "w")
        normality_display.write("%s" % html)
        normality_display.close()
    plt.close()
def preprocessing_module(Extracted_Features, Coma_Features, Corrected_Features, Norm, ontology):
    """Turn tab-separated feature files into comma-separated ones with integer
    class codes, then impute missing values and standardize the data.

    Parameters:
        Extracted_Features: directory of raw tab-separated feature files.
        Coma_Features: directory receiving the comma-separated copies.
        Corrected_Features: directory receiving imputed/standardized files.
        Norm: 'normalize' | 'binarize' | 'standardize' | anything else -> scale.
        ontology: path of the CSV mapping iteration/class/neuron name.
    """
    lvltrace.lvltrace("LVLEntree dans preprocessing_module data_preproc")
    onto = open(ontology, "w")
    class_number = 1
    onto.write("Iteration,Class,Class_number,Neuron_name\n")
    Iteration = 1
    for root, dirs, files in os.walk(Extracted_Features):
        for i in files:
            if i.startswith('.'):
                continue  # skip hidden files
            input_i = Extracted_Features + i
            output_i = Coma_Features + i
            out = open(output_i, "w")
            lines = tools.file_lines(input_i) + 1
            ncol = tools.file_col(input_i) - 1
            for line in xrange(lines):
                for col in xrange(ncol):
                    if line == 0:
                        # Header row: copy every column but the neuron name.
                        if col != 1:
                            out.write("%s," % tools.read_csv_tab(input_i, col, line))
                    elif col == 0:
                        # Categorical class name -> integer code.
                        out.write("%i," % class_number)
                    elif col == 1:
                        # Neuron name is routed to the ontology file.
                        onto.write("%i,%s,%i,%s\n" % (Iteration, i, class_number,
                                                      tools.read_csv_tab(input_i, col, line)))
                        Iteration = Iteration + 1
                    else:
                        out.write("%s," % tools.read_csv_tab(input_i, col, line))
                out.write("\n")
            out.close()
            class_number = class_number + 1
            if lines > 3:
                input_file = Coma_Features + i
                # ncol-1 because the neuron-name column was skipped above.
                data = np.loadtxt(input_file, delimiter=',',
                                  usecols=range(ncol - 1), skiprows=1)
                X = data[:, :ncol]
                y = data[:, 0].astype(np.int)  # integer class labels
                # Replace missing values 'nan' by the column mean.
                imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
                imp.fit(X)
                Y = imp.transform(X)
                # Data standardization, selected by Norm.
                if Norm == 'normalize':
                    Z = preprocessing.normalize(Y, axis=0, norm='l2')
                elif Norm == 'binarize':
                    Z = preprocessing.Binarizer().fit(Y).transform(Y)  # for Bernoulli NB
                elif Norm == 'standardize':
                    Z = preprocessing.MinMaxScaler().fit_transform(Y)  # rescale to [0,1]
                else:
                    Z = preprocessing.scale(Y)  # zero mean, unit variance
                output_file = Corrected_Features + i
                out = open(output_file, "w")
                try:
                    for line_1 in xrange(lines - 1):
                        for col_1 in xrange(ncol - 1):
                            if col_1 == 0:
                                out.write("%s," % y[line_1])
                            else:
                                out.write("%f," % Z[line_1, col_1])
                        out.write("\n")
                finally:
                    out.close()
    onto.close()
    lvltrace.lvltrace("LVLSortie dans preprocessing_module data_preproc")