def load_excel(filename, sheet_name, truncate_sheet):
    """Open *filename* for appending and locate where new rows should go.

    Returns a ``(startrow, writer)`` tuple: ``startrow`` is the last used row
    of ``sheet_name`` (``None`` when the file does not exist yet) and
    ``writer`` is a pandas ExcelWriter primed with the workbook's sheets.
    """
    startrow = None
    writer = ExcelWriter(filename, engine='openpyxl')
    try:
        # Attach the workbook on disk so writes append instead of clobbering.
        book = load_workbook(filename)
        writer.book = book
        if sheet_name in book.sheetnames:
            # Resume writing just below the last populated row.
            startrow = book[sheet_name].max_row
            if truncate_sheet:
                # Drop the sheet and recreate an empty one at the same index.
                position = book.sheetnames.index(sheet_name)
                book.remove(book.worksheets[position])
                book.create_sheet(sheet_name, position)
        writer.sheets = {ws.title: ws for ws in book.worksheets}
    except FileNotFoundError:
        # No workbook on disk yet -- the writer will create it on save.
        pass
    return (startrow, writer)
def to_excel(self, write_mode='a', highest_score=True):
    """Export GC-MS results to an .xlsx file, appending when it exists."""
    target = self.output_file.with_suffix('.xlsx')
    cols = self._init_columns()
    rows = self.get_list_dict_data(self.gcms, highest_score=highest_score)
    frame = DataFrame(rows, columns=cols)
    if write_mode == 'a' and target.exists():
        writer = ExcelWriter(target, engine='openpyxl')
        # Re-attach the workbook on disk so we append rather than overwrite.
        writer.book = load_workbook(target)
        writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
        # The existing row count tells us where the new rows must start.
        existing = read_excel(target)
        frame.to_excel(writer, index=False, header=False,
                       startrow=len(existing) + 1)
        writer.close()
    else:
        frame.to_excel(self.output_file.with_suffix('.xlsx'), index=False,
                       engine='openpyxl')
    self.write_settings(self.output_file, self.gcms)
def writeExcelData(x, excelfile, sheetname, startrow, startcol):
    """Write *x* into *excelfile* at a 1-based (startrow, startcol) position."""
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    frame = DataFrame(x)
    workbook = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = workbook
    writer.sheets = {ws.title: ws for ws in workbook.worksheets}
    # Caller supplies 1-based coordinates; pandas expects 0-based offsets.
    frame.to_excel(writer, sheet_name=sheetname,
                   startrow=startrow - 1, startcol=startcol - 1,
                   header=False, index=False)
    writer.save()
    writer.close()
def appendExcel(data_frame, file_name, sc=0, use_index=False):
    """Write *data_frame* into 'Sheet1' of an existing workbook.

    Data lands at row 0, starting at column *sc*, with no header row.
    """
    writefilestr = file_name
    # Attach the workbook on disk so the other sheets survive the write.
    book = load_workbook(writefilestr)
    writer = ExcelWriter(writefilestr, engine='openpyxl')
    writer.book = book
    writer.sheets = {ws.title: ws for ws in book.worksheets}
    data_frame.to_excel(writer, 'Sheet1', index=use_index,
                        startrow=0, startcol=sc, header=None)
    writer.save()
    writer.close()
def save_cut(cut_df, i, name):
    """Save one part of the split registry, built on the short template."""
    book = load_workbook(template_short_path)
    out_name = (output_path + u'Реестр' + name
                + u'{} часть_'.format(i + 1)
                + '{}'.format(cut_df.shape[0])
                + u' записей.xlsx')
    writer = ExcelWriter(out_name, engine='openpyxl')
    writer.book = book
    writer.sheets = {ws.title: ws for ws in book.worksheets}
    # Data goes below the template's 26-row header block.
    cut_df.to_excel(writer, u'Прил 2', startrow=26, header=False, index=False)
    writer.save()
def write_log_to_excel(log, xls_filename, sheet_name):
    """Append the lines of *log* as a new sheet in an existing workbook."""
    from openpyxl import load_workbook
    from pandas import ExcelWriter, DataFrame
    # The leading apostrophe makes Excel treat every line as literal text.
    frame = DataFrame(["'" + line for line in log.strip().splitlines()])
    book = load_workbook(xls_filename)
    writer = ExcelWriter(xls_filename, engine='openpyxl')
    writer.book = book
    writer.sheets = {ws.title: ws for ws in book.worksheets}
    frame.to_excel(writer, sheet_name, index=False)
    writer.save()
def pandas_dbexcel(viewname, con):
    """Dump a database view into its own sheet of the <dbtbltype>.xlsx file."""
    workbook = load_workbook(dbtbltype + '.xlsx')
    writer = ExcelWriter(dbtbltype + '.xlsx', engine='openpyxl')
    writer.book = workbook
    writer.sheets = {ws.title: ws for ws in workbook.worksheets}
    # NOTE(review): viewname is interpolated straight into SQL -- ensure it
    # comes from a trusted source, not user input.
    frame = pd.read_sql_query("SELECT * from " + viewname, con)
    frame.to_excel(writer, sheet_name=viewname, encoding='utf8')
    writer.save()
def pd_html_excel():
    """Write every extracted table in the global *df* list to tables.xlsx."""
    book = load_workbook('tables.xlsx')
    writer = ExcelWriter('tables.xlsx', engine='openpyxl')
    writer.book = book
    writer.sheets = {ws.title: ws for ws in book.worksheets}
    # One sheet per extracted table, named table_0, table_1, ...
    for position, table in enumerate(df):
        table.to_excel(writer, sheet_name="table_" + str(position))
    writer.save()
    file_2.write(
        "Success !!! Please check tables.xlsx for extracted tables from the webpage."
        + '\n' + '\n')
def to_excel_ExcelWriter(
    df,
    file,
    rowNames = False,
    masterFile = '',
    promptIfLocked = False,
    xlsxEngine = 'openpyxl',  # alternative: 'xlsxwriter'
):
    """Write a dict of DataFrames ({sheet name: frame}) to *file*.

    Parameters:
        df          -- mapping of sheet name to DataFrame; empty frames skipped
        file        -- path of the workbook to (re)create
        rowNames    -- whether to write the DataFrame index
        masterFile  -- optional existing workbook whose sheets are preserved
        promptIfLocked -- unused; kept for backward compatibility
        xlsxEngine  -- pandas Excel engine name

    Returns 1 on success.
    """
    writer = ExcelWriter(file, engine=xlsxEngine)
    # Create the file up front (with an empty first sheet) so a locked or
    # read-only target is detected before any real work is done.
    DataFrame().to_excel(writer, list(df.keys())[0])
    try:
        writer.save()
    # FIX: was a bare `except:` -- that also swallowed KeyboardInterrupt and
    # SystemExit, making the retry loop impossible to break out of.
    except Exception:
        counter = 0
        file_updated = False
        # The file is locked/read-only: poll, giving the user time to close
        # it; after 10 failures drop into the debugger for manual recovery.
        while( not file_updated ):
            try:
                writer.save()
                file_updated = True
            except Exception:
                print(' >> Waiting on ' + os.path.split(file)[1] + ' for 15 seconds' )
                counter = counter + 1
                if( counter >= 10 ):
                    pdb.set_trace()
                sleep(15)
    if(masterFile != ''):
        # Preserve the master workbook's existing sheets in the output.
        book = openpyxl.load_workbook(masterFile)
        writer.book = book
        writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    for sheetName, this_df in df.items():
        if( not len( this_df ) ):
            print( ' ++ No data for sheet ' + sheetName + '. Moving on.')
            continue
        this_df.to_excel(writer, sheetName, index = rowNames)
    writer.save()
    return(1)
def save_all(all_df):
    """Save the full concatenated registry into one template-based workbook."""
    book = load_workbook(template_all_path)
    writer = ExcelWriter(
        output_path + u'Реестр_полный_ {} записей.xlsx'.format(all_df.shape[0]),
        engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    # FIX: the original wrapped three `del` statements in a bare
    # `except: pass`, so a missing first column silently aborted the
    # remaining deletions. Drop each helper column only if present instead.
    for helper_col in ('uid_first', 'uid', 'true_index'):
        if helper_col in all_df.columns:
            del all_df[helper_col]
    # Data goes below the template's 26-row header block.
    all_df.to_excel(writer, u'Прил 2', startrow=26, header=False, index=False)
    writer.save()
def append_df_to_excel(
        df: DataFrame,
        filename: str,
        sheet_name='Sheet1',
        **to_excel_kwargs
) -> None:
    """Append *df* to <Desktop>/<filename>, creating the file if needed.

    :param df: the DataFrame to write
    :param filename: workbook name, resolved relative to the user's Desktop
    :param sheet_name: target sheet (default 'Sheet1')
    :param to_excel_kwargs: forwarded to ``DataFrame.to_excel()``
    """
    file_path = path.join(environ['USERPROFILE'], 'Desktop', filename)
    if not path.exists(file_path):
        # No workbook yet: a plain to_excel creates it, headers included.
        df.to_excel(
            file_path,
            sheet_name=sheet_name,
            **to_excel_kwargs)
        return None
    writer = ExcelWriter(file_path, engine='openpyxl', mode='a')
    writer.book = load_workbook(file_path)
    # FIX: the sheet may be absent from an existing workbook; indexing
    # writer.book[sheet_name] unconditionally raised KeyError in that case.
    if sheet_name in writer.book.sheetnames:
        start_row = writer.book[sheet_name].max_row
    else:
        start_row = 0
    writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
    df.to_excel(writer, sheet_name, startrow=start_row, header=False,
                **to_excel_kwargs)
    writer.save()
    writer.close()
    return None
def ControlTreatedUnique(item, expgroup):
    # Intersect probe names between the control file and each treated file,
    # then append the common subset of the treated data as a new
    # 'COMMN_<token>' sheet in the treated file's workbook.
    # NOTE: Python 2 syntax (print statements) throughout this function.
    controlfile = Controls[item][0]
    pathc = 'DEG_Identification/' + item + '/' + controlfile
    for expfile in expgroup:
        print 'Finding Common Probes for File : ' + expfile
        pathe = 'DEG_Identification/' + item + '/' + expfile
        # Probe identifiers present in the treated sample.
        exprobes = pd.read_excel(open(pathe, 'rb'), sheetname='ABS_CALL', index=False)['Probename']
        # Probe identifiers present in the control sample.
        controlprobes = pd.read_excel(open(pathc, 'rb'), sheetname='ABS_CALL', index=False)['Probename']
        commonprobes = list(set(exprobes) & set(controlprobes))
        expdata = pd.read_excel(open(pathe, 'rb'), sheetname='ABS_CALL', index=False)
        # Keep only rows whose probe appears in both samples.
        commondf = expdata[expdata['Probename'].isin(commonprobes)]
        # Re-open the treated workbook so the new sheet is appended in place.
        writernw = ExcelWriter(pathe, engine='openpyxl')
        book = load_workbook(pathe)
        writernw.book = book
        writernw.sheets = dict((ws.title, ws) for ws in book.worksheets)
        # Token between the first '_' and the extension of the file name --
        # presumably a time point (e.g. 'x_24h.xlsx' -> '24h'); TODO confirm.
        tme = expfile.split('_')[1].split('.')[0]
        commondf.to_excel(writernw, 'COMMN_' + tme, index=False)
        writernw.save()
def main():
    # Linear least-squares binary classifier (male vs. not-male) trained via
    # the pseudo-inverse, evaluated on its own training set; the weights and
    # confusion-matrix metrics are written back into the Excel template.
    # NOTE: Python 2 syntax (print statements) throughout.
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Final Project/Final_Project_Data_and_Template_Original.xlsx"
    sheetname = "Training Data"
    # df = readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=6601, endcol=17)
    # print df
    # print df[0]
    # print df[-1]
    # Feature matrix: sheet rows 2..3169, first 20 columns.
    X = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=3169, endcol=20), dtype=np.float64)
    print X
    # Augment with a leading column of ones for the bias term.
    Xa = np.insert(X, 0, 1., axis=1)
    print Xa
    # Raw string labels ("male"/other) from column 21.
    Tw = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=3169, startcol=21, endcol=21), dtype=np.string_)
    print Tw
    # Encode labels as +1 (male) / -1 (anything else).
    T = np.array([1 if Tw[i][0] == "male" else -1 for i in range(len(Tw))], dtype=np.int32)
    print T
    # Closed-form least-squares solution: W = pinv(Xa) . T
    Xapi = np.linalg.pinv(Xa)
    print Xapi
    W = np.dot(Xapi, T)
    print W
    # Re-classify the training set with the learned weights.
    T_validating = np.dot(Xa, W)
    print T_validating
    T_validating = np.array(
        [1 if T_validating[i] > 0 else -1 for i in range(len(T_validating))],
        dtype=np.int32)
    print T_validating
    # 2x2 confusion matrix: row = truth, column = prediction (0 -> -1, 1 -> +1).
    bccm = np.zeros((2, 2), dtype=np.int32)
    for i in range(len(T)):
        row = 1 if T[i] == 1 else 0
        col = 1 if T_validating[i] == 1 else 0
        # print row, col
        bccm[row, col] += 1
    print bccm
    # Derived metrics, in order: accuracy, sensitivity (TP rate),
    # specificity (TN rate), precision (positive predictive value).
    metrics = [
        np.divide(bccm[0, 0] + bccm[1, 1],
                  bccm[0, 0] + bccm[0, 1] + bccm[1, 0] + bccm[1, 1],
                  dtype=np.float64),
        np.divide(bccm[1, 1], bccm[1, 0] + bccm[1, 1], dtype=np.float64),
        np.divide(bccm[0, 0], bccm[0, 0] + bccm[0, 1], dtype=np.float64),
        np.divide(bccm[1, 1], bccm[0, 1] + bccm[1, 1], dtype=np.float64)
    ]
    print metrics
    # Write results into the working (non-original) template workbook in place.
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Final Project/Final_Project_Data_and_Template.xlsx"
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    sheetname = "Classifiers"
    # print W
    df = DataFrame(W)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=0,
                header=False, index=False)
    sheetname = "Performance"
    # print bccm
    df = DataFrame(bccm)
    df.to_excel(writer, sheet_name=sheetname, startrow=9, startcol=2,
                header=False, index=False)
    # print metrics
    df = DataFrame(metrics)
    df.to_excel(writer, sheet_name=sheetname, startrow=7, startcol=6,
                header=False, index=False)
    writer.save()
    writer.close()
def main():
    # Fit a 3-component Gaussian mixture (via EM) to the data, name the
    # components Male / Female / Child by their mean of the first feature,
    # and write labels, confidences and per-class totals to the template.
    # NOTE: Python 2 syntax (print statements) throughout.
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 5/Assignment_5_Data_and_Template_Original.xlsx"
    sheetname = "Data"
    # Observations: sheet rows 2..951, first 2 columns.
    xs = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=951, endcol=2), dtype=np.float64)
    # print data
    # print data[0]
    # print data[-1]
    dims = xs.shape
    n = dims[0]   # number of observations
    d = dims[1]   # number of features
    # print d
    c = 3         # number of mixture components
    # initial random guesses for parameters
    np.random.seed(0)  # fixed seed -> reproducible initialization
    # Equal mixing weights 1/c to start.
    scales = np.array([np.divide(1, c, dtype=np.float64)] * c)
    # print scales
    maxs = np.amax(xs, axis=0)
    # print maxs
    mins = np.amin(xs, axis=0)
    # print mins
    # Means drawn uniformly within each feature's observed range.
    mus = np.zeros((c, d))
    for i in range(c):
        for j in range(d):
            mus[i, j] = (maxs[j] - mins[j]) * np.random.random() + mins[j]
            # print j, maxs[j], mins[j], mus[i, j]
    # print mus
    vs = np.var(xs, axis=0)
    # print vars
    # Diagonal covariances initialized from the per-feature sample variance.
    sigmas = np.zeros((c, d, d))
    for i in range(c):
        for j in range(d):
            sigmas[i, j, j] = vs[j]
    # print sigmas
    # Run EM; ps holds the per-observation component responsibilities.
    ps, scales, mus, sigmas = em_gmm_orig(xs, scales, mus, sigmas,
                                          iterations=10000)
    print scales
    # Expected number of observations per component.
    amounts = np.round(scales * n)
    print amounts
    print mus
    # Rank components by mean of feature 0: max -> male, min -> child,
    # and the remaining index (0+1+2 = 3) -> female.
    idx_male = np.argmax(mus[:, 0])
    idx_children = np.argmin(mus[:, 0])
    idx_female = 3 - idx_male - idx_children
    print idx_male
    print idx_female
    print idx_children
    labelnames = ["F", "F", "F"]
    labelnames[idx_male] = "M"
    labelnames[idx_female] = "F"
    labelnames[idx_children] = "C"
    print labelnames
    print ps
    # Hard assignment: most responsible component per observation.
    label_indices = np.argmax(ps, axis=1)
    print label_indices
    labels = np.array([labelnames[label_idx] for label_idx in label_indices])
    print labels
    # Confidence = responsibility of the chosen component.
    confs = np.array(
        [ps[idx, label_idx] for idx, label_idx in enumerate(label_indices)])
    print confs
    totals = np.array(
        [amounts[idx_male], amounts[idx_female], amounts[idx_children]])
    print totals
    # In the below plots the white dots represent the observed heights.
    # Write results into the working (non-original) template workbook.
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 5/Assignment_5_Data_and_Template.xlsx"
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    sheetname = "Results"
    # print labels
    df = DataFrame(labels)
    df.to_excel(writer, sheet_name=sheetname, startrow=1, startcol=0,
                header=False, index=False)
    # print confs
    df = DataFrame(confs)
    df.to_excel(writer, sheet_name=sheetname, startrow=1, startcol=1,
                header=False, index=False)
    # print totals
    df = DataFrame(totals)
    df.to_excel(writer, sheet_name=sheetname, startrow=1, startcol=5,
                header=False, index=False)
    writer.save()
    writer.close()
from tkinter import *
from pandas import DataFrame
from pandas import ExcelWriter
import datetime
import openpyxl
from openpyxl import load_workbook

# Main application window for the employee time clock.
root = Tk()
root.title("Travel Transport Employee Clock")
root.geometry("600x600")
# Open the existing spreadsheet for appending clock records in place.
writer = ExcelWriter(r'test.xlsx', engine='openpyxl')
book = load_workbook(r'test.xlsx')
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}
# Per-shift state: three clock-in/clock-out time pairs plus their flags.
check1 = False
check2 = False
check3 = False
inTime1 = 0
outTime1 = 0
inTime2 = 0
outTime2 = 0
inTime3 = 0
outTime3 = 0


#Functions
def enterval():
    # NOTE(review): this function appears truncated in this chunk -- only
    # its global declarations are visible; the body presumably continues
    # beyond this excerpt.
    global check1
    global check2
    global check3
    global inTime1
    global outTime1
    global inTime2
    global outTime2
def export_excel(data, metadata, tables, config):
    """
    Export full data and results summary tables to Excel template

    Note on template file:
    Ideally, pivot tables would read full data (i.e. un-rounded)
    and band with a grouping of 0.5-99.5 at 1dB inc.
    Upon testing, it was found that openpyxl.load_workbook
    is unable to read grouped fields in pivot tables
    Solution will be to apply banding within Python function if possible
    If not possible, solution will be to include extra columns,
    rounded to 0 d.p.

    Returns a (writer, config_out) tuple: the primed ExcelWriter and the
    path of the companion config text file.
    """
    file_out = config["output"][0] + ".xlsm"
    config_out = file_out.replace(".xlsm", "_config.txt")
    # Create summary
    # Flag_* columns mark the user-defined time periods.
    flags = data.filter(regex='^Flag_').columns.to_list()
    s1 = Series(
        index=[
            'Number of Time Samples',
            'Number of Recorded Metrics',
            'Number of User-defined Time Periods',
        ],
        data=[
            len(data),
            len(data.columns) - len(flags),
            len(flags),
        ]
    )
    # One row per time period, with the 'Flag_' prefix stripped.
    s2 = Series(
        index=['Time Period ' + str(f + 1) for f in range(len(flags))],
        data=flags
    ).str.replace('Flag_', '')
    # Spacer row plus a heading before the monitor metadata.
    s3 = Series(
        index=[
            '',
            'Monitor Metadata:'
        ],
        data=[
            None,
            None
        ]
    )
    summary = concat([s1, s2, s3, metadata])
    # Re-order columns: address/duration first, then main metrics,
    # then spectral (Hz) columns, then flags, then everything else.
    cols = ['Address', 'Duration']
    cols += data.filter(regex='_Main$').columns.to_list()
    cols += data.filter(regex='Hz$').columns.to_list()
    cols += flags
    for c in data.columns:
        if c not in cols:
            cols += [c]
    data_out = data[cols].copy()
    # Replace flags' start times with booleans
    for f in flags:
        data_out_tmp = data_out[f].dropna()
        if data_out_tmp.empty:
            data_out[f] = False
        else:
            # First non-null value is the period's start time marker.
            t_start = data_out_tmp[0]
            data_out[f] = data_out[f].replace({t_start: True, None: False})
    # Collate tables
    # Blank out the index of every table except the first in each group of
    # four (indices 0, 4, 8), so stacked groups read cleanly.
    for i, t in enumerate(tables):
        if i not in list(range(12))[0::4]:
            t.index = len(t) * ['']
    # Tables 0-3: main summary, separated by empty spacer rows.
    table_main = concat([
        tables[0],
        DataFrame(columns=tables[0].columns, index=['']),
        tables[1],
        DataFrame(columns=tables[0].columns, index=['']),
        tables[2],
        DataFrame(columns=tables[0].columns, index=['']),
        tables[3]
    ])
    # Tables 4-7: Leq spectral tables.
    table_leq_spec = concat([
        tables[4],
        DataFrame(columns=tables[4].columns, index=['']),
        tables[5],
        DataFrame(columns=tables[4].columns, index=['']),
        tables[6],
        DataFrame(columns=tables[4].columns, index=['']),
        tables[7]
    ])
    # Tables 8-12: Lmax spectral tables.
    table_lmax_spec = concat([
        tables[8],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[9],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[10],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[11],
        DataFrame(columns=tables[8].columns, index=['']),
        tables[12]
    ])
    # Read template
    file_template = "OutputExcelTemplate_v09_20200922.xlsm"
    # keep_vba preserves the template's macros in the output .xlsm.
    wb = load_workbook(file_template, keep_vba=True)
    # Write full data
    # Disable pandas' default header styling so the template's styles win.
    ExcelFormatter.header_style = None
    writer = ExcelWriter(
        file_out,
        engine='openpyxl',
        datetime_format='dd/mm/yyyy hh:mm:ss',
        date_format='dd/mm/yyyy'
    )
    writer.book = wb
    writer.sheets = dict((ws.title, ws) for ws in wb.worksheets)
    print("Exporting to Excel...")
    # Write data
    summary.to_excel(writer, "Summary", header=False)
    data_out.to_excel(writer, "Full_Data")
    table_main.to_excel(writer, "Summary_Tables")
    table_leq_spec.to_excel(writer, "Leq_Spectral_Tables")
    table_lmax_spec.to_excel(writer, "Lmax_Spectral_Tables")
    return writer, config_out
# Make the directory of split PDF pages accessible to later steps.
os.system('chmod -R 777 input')
#Converting the splitted PDF into list
pdflist = sorted(os.listdir('input'))
#Create Excel Workbook
# xlsxwriter is used only to materialize an empty workbook that openpyxl
# can then re-open for appending.
workbook = xlsxwriter.Workbook('tables.xlsx')
worksheet = workbook.add_worksheet()
workbook.close()
os.system('chmod -R 777 tables.xlsx')
#Load excel Workbook using openpyxl
book = load_workbook('tables.xlsx')
writer = ExcelWriter('tables.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
#Iterating through the PDF list
for pdf_file in pdflist:
    # Sheet label derived from the page file name: pg_<n>.pdf -> <n>
    pdfn = pdf_file.replace("pg_", "").replace(".pdf", "")
    #extracting table from PDF
    tables = camelot.read_pdf(os.path.join('input', pdf_file))
    #check if pdf contains table to be extracted
    # camelot's repr reports the table count; "1" appears when >= 1 found.
    if "1" in str(tables):
        print ("table found in " + pdf_file)
        #converting a pdf into a DataFrame
        tables[0].df.to_excel(writer, sheet_name="table_" + str(pdfn),
                              index=False, header=False)
def summarize_reg(gene_set, n_data_matrix):
    """
    The SUMMARIZE_REG operation summarizes all the data analysis results, by collecting them in convenient tables that exported locally in Excel files.

    :param gene_set: the set of genes of interest to summarize
    :param n_data_matrix: number identifying the data matrix to summarize  (only 2,3 and 5 values are permitted)

    Example::

        import genereg as gr
        gr.SummaryResults.summarize_reg(gene_set='DNA_REPAIR', n_data_matrix=2)
        gr.SummaryResults.summarize_reg(gene_set='DNA_REPAIR', n_data_matrix=3)
        gr.SummaryResults.summarize_reg(gene_set='DNA_REPAIR', n_data_matrix=5)
    """
    # Check input parameters
    if n_data_matrix not in [2, 3, 5]:
        raise ValueError('Data Matrix ERROR! Possible values: {2,3,5}')

    # Define the model to summarize
    model = str(n_data_matrix)
    # Define the previous model to check
    # (model 3 builds on model 2; model 5 builds on model 3)
    if model == '3':
        previous_model = str(int(model) - 1)
    elif model == '5':
        previous_model = str(int(model) - 2)

    # Import the dictionary of genes of interest with their candidate regulatory genes
    dict_RegulGenes = pickle.load(
        open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb'))

    # Import the list of genes of interest and extract in a list the Gene Symbols of all the genes belonging to the current gene set
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })
    SYMs_current_pathway = []
    for index, row in EntrezConversion_df.iterrows():
        sym = row['GENE_SYMBOL']
        path = row['GENE_SET']
        if path == gene_set:
            SYMs_current_pathway.append(sym)

    if (model == '3') or (model == '5'):
        # Create a list containing the Gene Symbols of the regulatory genes of the genes in the current gene set
        current_regulatory_genes = []
        for key, value in dict_RegulGenes.items():
            if key in SYMs_current_pathway:
                for gene in value:
                    if gene not in current_regulatory_genes:
                        current_regulatory_genes.append(gene)

    if (model == '5'):
        # Create a list containing the Gene Symbols of genes in the other gene sets
        SYMs_other_pathways = []
        for index, row in EntrezConversion_df.iterrows():
            sym = row['GENE_SYMBOL']
            path = row['GENE_SET']
            if not (path == gene_set):
                SYMs_other_pathways.append(sym)
        # Create a list containing the Gene Symbols of the regulatory genes of the genes in the other gene sets
        regulatory_genes_other = []
        for key, value in dict_RegulGenes.items():
            if key not in SYMs_current_pathway:
                for gene in value:
                    if gene not in regulatory_genes_other:
                        regulatory_genes_other.append(gene)

    # Create a dataframe to store final summary results of feature selection and linear regression for each gene of interest
    # (note: 'Inital' is a long-standing typo preserved because it is used
    # as a real column name in the exported files)
    if model == '2':
        lr_summary_df = pd.DataFrame(index=SYMs_current_pathway,
                                     columns=[
                                         'Inital N° Features',
                                         'Discarded Features',
                                         'N° Features Selected', 'R2',
                                         'Adj.R2'
                                     ])
    else:
        lr_summary_df = pd.DataFrame(
            index=SYMs_current_pathway,
            columns=[
                'Inital N° Features',
                'N° New Features w.r.t. Previous Model',
                'Discarded Features', 'Features Available for Selection',
                'N° Features Selected', 'R2', 'Adj.R2'
            ])

    for current_gene in SYMs_current_pathway:
        # Import the current and, if present, the previous model of the current gene
        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL']
                                          == current_gene,
                                          'ENTREZ_GENE_ID'].iloc[0]
        model_gene_df = pd.read_excel(
            './4_Data_Matrix_Construction/Model' + model + '/Gene_' +
            gene_ID + '_[' + current_gene + ']' + '_(' + gene_set +
            ')-Model_v' + model + '.xlsx',
            sheetname='Sheet1',
            header=0)
        if not (model == '2'):
            previous_model_df = pd.read_excel(
                './4_Data_Matrix_Construction/Model' + previous_model +
                '/Gene_' + gene_ID + '_[' + current_gene + ']' + '_(' +
                gene_set + ')-Model_v' + previous_model + '.xlsx',
                sheetname='Sheet1',
                header=0)

        # Extract the list of new features, added to the current model, w.r.t. the previous one
        if not (model == '2'):
            current_model_col_names = set(list(model_gene_df.columns.values))
            previous_model_col_names = set(
                list(previous_model_df.columns.values))
            new_features = list(current_model_col_names -
                                previous_model_col_names)
            lr_summary_df.set_value(current_gene,
                                    'N° New Features w.r.t. Previous Model',
                                    len(new_features))

        # Import the feature selection and linear regression summary tables
        feature_sel_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                       '/FeatureSelection/M' + model +
                                       '/Feature_Selection_SUMMARY.xlsx',
                                       sheetname='Sheet1',
                                       header=0)
        lin_reg_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                   '/LinearRegression/M' + model +
                                   '/Linear_Regression_R2_SCORES.xlsx',
                                   sheetname='Sheet1',
                                   header=0)

        # Extract and store the results in the summary dataframe
        n_features = feature_sel_df.get_value(current_gene,
                                              'TOT Inital N° Features')
        n_feat_discarded = feature_sel_df.get_value(current_gene,
                                                    'Discarded Features')
        if not (model == '2'):
            n_features_available = feature_sel_df.get_value(
                current_gene, 'Features Available for Selection')
        n_feat_selected = feature_sel_df.get_value(current_gene,
                                                   'N° Features Selected')
        lin_reg_r2_adj = lin_reg_df.get_value(current_gene, 'Adj.R2')
        lin_reg_r2 = lin_reg_df.get_value(current_gene, 'R2')

        lr_summary_df.set_value(current_gene, 'Inital N° Features',
                                n_features)
        lr_summary_df.set_value(current_gene, 'Discarded Features',
                                n_feat_discarded)
        if not (model == '2'):
            lr_summary_df.set_value(current_gene,
                                    'Features Available for Selection',
                                    n_features_available)
        lr_summary_df.set_value(current_gene, 'N° Features Selected',
                                n_feat_selected)
        lr_summary_df.set_value(current_gene, 'Adj.R2', lin_reg_r2_adj)
        lr_summary_df.set_value(current_gene, 'R2', lin_reg_r2)

    # Export the summary dataframe in an Excel file
    lr_summary_df = lr_summary_df.sort_values(by=['Adj.R2'],
                                              ascending=[False])
    filename = './5_Data_Analysis/' + gene_set + '/Feature_Selection_and_Linear_Regression.xlsx'
    writer = ExcelWriter(filename, engine='openpyxl')
    try:
        # Re-attach an existing workbook so this model's sheet is appended.
        writer.book = load_workbook(filename)
        writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
    except IOError:
        # if the file does not exist yet, I will create it
        pass
    lr_summary_df.to_excel(writer, 'M' + model)
    writer.save()

    # Extract relevant features for each gene of the current gene set and store them in a summary table and define a dataframe to summarize the features selected for each model gene
    features_summary_df = pd.DataFrame(index=SYMs_current_pathway)

    for current_gene in SYMs_current_pathway:

        gene_ID = EntrezConversion_df.loc[EntrezConversion_df['GENE_SYMBOL']
                                          == current_gene,
                                          'ENTREZ_GENE_ID'].iloc[0]

        # Import the regression coefficients
        coeff_df = pd.read_excel(
            './5_Data_Analysis/' + gene_set + '/LinearRegression/M' + model +
            '/Coefficients/Coefficients_(M' + model + ')-Gene_' + gene_ID +
            '_[' + current_gene + '].xlsx',
            sheetname='Sheet1',
            header=0)
        # Import the confidence intervals
        ci_df = pd.read_excel(
            './5_Data_Analysis/' + gene_set + '/LinearRegression/M' + model +
            '/ConfidenceIntervals/Confidence_Intervals_(M' + model +
            ')-Gene_' + gene_ID + '_[' + current_gene + '].xlsx',
            sheetname='Sheet1',
            header=0)
        # Import the correlation matrix
        corr_df = pd.read_excel(
            './5_Data_Analysis/' + gene_set + '/LinearRegression/M' + model +
            '/CorrelationMatrix/Correlation_Matrix_(M' + model + ')-Gene_' +
            gene_ID + '_[' + current_gene + '].xlsx',
            sheetname='Sheet1',
            header=0)

        # Select the relevant features on the basis of the confidence intervals (i.e. if the confidence interval does not contain 0, then the feature is significant for the model)
        relevant_features = []
        for index, row in ci_df.iterrows():
            s = row['Significant Feature?']
            if s == 'YES':
                relevant_features.append(index)

        # Create a dataframe to store the results and fill it with requested information
        relevant_features_df = pd.DataFrame(
            index=relevant_features,
            columns=[
                'Regression Coefficient', 'Feature Description',
                'Correlation with EXPRESSION (' + current_gene + ')'
            ])

        for index, row in coeff_df.iterrows():
            gene = row['feature']
            if gene in relevant_features:
                coeff = row['coefficient']
                relevant_features_df.set_value(gene,
                                               'Regression Coefficient',
                                               coeff)

        for index, row in corr_df.iterrows():
            if index in relevant_features:
                corr_with_target = row['EXPRESSION (' + current_gene + ')']
                relevant_features_df.set_value(
                    index,
                    'Correlation with EXPRESSION (' + current_gene + ')',
                    corr_with_target)

        # Add the features descriptions
        # (each model version recognizes progressively more feature origins)
        if model == '2':
            for f in relevant_features:
                if f in SYMs_current_pathway:
                    descr = 'Gene of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif 'METHYLATION' in f:
                    descr = 'Methylation of the model gene [' + current_gene + '] in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in dict_RegulGenes[current_gene]:
                    descr = 'Candidate regulatory gene of the model gene [' + current_gene + '] of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
        elif model == '3':
            for f in relevant_features:
                if f in SYMs_current_pathway:
                    descr = 'Gene of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif 'METHYLATION' in f:
                    descr = 'Methylation of the model gene [' + current_gene + '] in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in dict_RegulGenes[current_gene]:
                    descr = 'Candidate regulatory gene of the model gene [' + current_gene + '] of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif not (f in dict_RegulGenes[current_gene]) and (
                        f in current_regulatory_genes):
                    descr = 'Candidate regulatory gene of the genes in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
        elif model == '5':
            for f in relevant_features:
                if f in SYMs_current_pathway:
                    descr = 'Gene of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif 'METHYLATION' in f:
                    descr = 'Methylation of the model gene [' + current_gene + '] in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in dict_RegulGenes[current_gene]:
                    descr = 'Candidate regulatory gene of the model gene [' + current_gene + '] of the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif not (f in dict_RegulGenes[current_gene]) and (
                        f in current_regulatory_genes):
                    descr = 'Candidate regulatory gene of the genes in the ' + gene_set + ' set'
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in SYMs_other_pathways:
                    df_temp = EntrezConversion_df.loc[
                        EntrezConversion_df['GENE_SYMBOL'] == f].copy()
                    f_pathways = (df_temp.GENE_SET.unique()).tolist()
                    descr = 'Gene of the gene sets: ' + (', '.join(f_pathways))
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)
                elif f in regulatory_genes_other:
                    regulated_genes_other = []
                    for key, value in dict_RegulGenes.items():
                        if key in SYMs_other_pathways:
                            if f in value:
                                regulated_genes_other.append(key)
                    df_temp = EntrezConversion_df.loc[EntrezConversion_df[
                        'GENE_SYMBOL'].isin(regulated_genes_other)].copy()
                    f_pathways = (df_temp.GENE_SET.unique()).tolist()
                    descr = 'Candidate regulatory gene of the gene sets: ' + (
                        ', '.join(f_pathways))
                    relevant_features_df.set_value(f, 'Feature Description',
                                                   descr)

        # Export the dataframe in an Excel file
        relevant_features_df = relevant_features_df.sort_values(
            by=['Regression Coefficient'], ascending=[False])
        filename = './5_Data_Analysis/' + gene_set + '/Relevant_Features-Gene_' + gene_ID + '_[' + current_gene + '].xlsx'
        writer = ExcelWriter(filename, engine='openpyxl')
        try:
            writer.book = load_workbook(filename)
            writer.sheets = dict(
                (ws.title, ws) for ws in writer.book.worksheets)
        except IOError:
            # if the file does not exist yet, I will create it
            pass
        relevant_features_df.to_excel(writer, 'M' + model)
        writer.save()

        # Record each gene's features by their (1-based) relevance rank.
        relevance_order = 0
        for index, row in relevant_features_df.iterrows():
            relevance_order = relevance_order + 1
            str_order = str(relevance_order)
            features_summary_df.set_value(current_gene, index, str_order)

    # Export the summary dataframe in an Excel file
    filename = './5_Data_Analysis/' + gene_set + '/Order_of_Features_Selected.xlsx'
    writer = ExcelWriter(filename, engine='openpyxl')
    try:
        writer.book = load_workbook(filename)
        writer.sheets = dict((ws.title, ws) for ws in writer.book.worksheets)
    except IOError:
        # if the file does not exist yet, I will create it
        pass
    features_summary_df.to_excel(writer, 'M' + model)
    writer.save()
def best_genes(gene_set):
    """
    The BEST_GENES operation collects the target genes with the best linear fit (Adjusted R2 >= 0.6) in the three regression models, storing them locally in a single Excel file.

    :param gene_set: the set of genes of interest to summarize

    Example::

        import genereg as gr
        gr.SummaryResults.best_genes(gene_set='DNA_REPAIR')
    """
    # Define the models to summarize
    models = ['2', '3', '5']

    # Import the list of genes of interest and extract in a list the Gene Symbols of all the genes belonging to the current gene set
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',
                                        sheetname='Sheet1',
                                        header=0,
                                        converters={
                                            'GENE_SYMBOL': str,
                                            'ENTREZ_GENE_ID': str,
                                            'GENE_SET': str
                                        })
    SYMs_current_pathway = []
    for index, row in EntrezConversion_df.iterrows():
        sym = row['GENE_SYMBOL']
        path = row['GENE_SET']
        if path == gene_set:
            SYMs_current_pathway.append(sym)

    for model in models:
        # Import the summary table cointaining the value of the R2 for each model and for each gene of interest in the current gene set
        # and extract the list of "good" genes, the ones that have R2 >= 0.6 in the current model
        summary_r2_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                      '/R2_and_Adj.R2_Scores.xlsx',
                                      sheetname='Sheet1',
                                      header=0)
        summary_r2_df = summary_r2_df.sort_values(
            by=['Adj.R2 (M' + model + ')'], ascending=[False])
        good_genes = []
        for index, row in summary_r2_df.iterrows():
            current_model_r2 = row['Adj.R2 (M' + model + ')']
            if current_model_r2 >= 0.6:
                good_genes.append(index)

        # Create a dataframe to the store the results, indexed by the "good" genes and a progressive number for each significant feature extracted during the regression process
        # Up to 19 feature slots are reserved per gene.
        num_features = []
        for i in list(range(1, 20)):
            num_features.append(i)

        # Cartesian product to generate tuples for multi-indexing
        import itertools
        tuples = []
        for i in itertools.product(good_genes, num_features):
            tuples.append(i)

        # Set the multiple indexes to be used in the dataframe
        index = pd.MultiIndex.from_tuples(tuples, names=['GENE', '#'])

        # Create the dataframe and initialize the empty cells as empty strings
        final_summary_df = pd.DataFrame('',
                                        index=index,
                                        columns=[
                                            'Significant Feature', 'Adj.R2',
                                            'Regression Coefficient',
                                            'Feature Description'
                                        ])

        # Fill the dictionary
        # Adjusted R2
        # (stored only in the gene's first row of the multi-index)
        for current_gene in good_genes:
            r2 = summary_r2_df.get_value(current_gene,
                                         'Adj.R2 (M' + model + ')')
            final_summary_df.loc[(current_gene, 1), 'Adj.R2'] = r2

        # Features
        for current_gene in good_genes:
            # Import the table containing the significant features extracted for the current gene
            gene_ID = EntrezConversion_df.loc[
                EntrezConversion_df['GENE_SYMBOL'] == current_gene,
                'ENTREZ_GENE_ID'].iloc[0]
            features_df = pd.read_excel('./5_Data_Analysis/' + gene_set +
                                        '/Relevant_Features-Gene_' + gene_ID +
                                        '_[' + current_gene + '].xlsx',
                                        sheetname='M' + model,
                                        header=0)
            feature_counter = 1
            for index, row in features_df.iterrows():
                coeff = row['Regression Coefficient']
                descr = row['Feature Description']
                final_summary_df.loc[(current_gene, feature_counter),
                                     'Significant Feature'] = index
                final_summary_df.loc[(current_gene, feature_counter),
                                     'Regression Coefficient'] = coeff
                final_summary_df.loc[(current_gene, feature_counter),
                                     'Feature Description'] = descr
                feature_counter = feature_counter + 1

        # Remove the empty rows in the dataframe
        # (unused slots of the 19 reserved per gene)
        for index, row in final_summary_df.iterrows():
            feat = row['Significant Feature']
            coeff = row['Regression Coefficient']
            descr = row['Feature Description']
            if (feat == '') & (coeff == '') & (descr == ''):
                final_summary_df.drop(index, inplace=True)

        # Export the summary dataframe in an Excel file
        filename = './5_Data_Analysis/' + gene_set + '/Best_Genes.xlsx'
        writer = ExcelWriter(filename, engine='openpyxl')
        try:
            # Re-attach an existing workbook so each model gets its own sheet.
            writer.book = load_workbook(filename)
            writer.sheets = dict(
                (ws.title, ws) for ws in writer.book.worksheets)
        except IOError:
            # if the file does not exist yet, I will create it
            pass
        final_summary_df.to_excel(writer, 'M' + model)
        writer.save()
def append_df_to_excel(self, df, sheet_name='Sheet1', startrow=None,
                       truncate_sheet=False, **to_excel_kwargs):
    '''
    Append a DataFrame [df] to the existing Excel file at [self.file_name]
    into [sheet_name] Sheet.
    If the file doesn't exist, this function will create it.

    Parameters:
      df : dataframe to save to workbook
      sheet_name : Name of sheet which will contain DataFrame.
                   (default: 'Sheet1')
      startrow : upper left cell row to dump data frame.
                 Per default (startrow=None) calculate the last row
                 in the existing DF and write to the next row...
      truncate_sheet : truncate (remove and recreate) [sheet_name]
                       before writing DataFrame to Excel file
      to_excel_kwargs : arguments which will be passed to `DataFrame.to_excel()`
                        [can be dictionary]
    Returns: None
    '''
    from openpyxl import load_workbook

    # The engine is fixed to openpyxl below; silently drop a caller-supplied
    # [engine] instead of letting to_excel() raise a duplicate-keyword error.
    to_excel_kwargs.pop('engine', None)

    # NOTE(review): assigning writer.book / writer.sheets relies on
    # pandas < 1.5 (newer pandas makes these read-only and offers
    # ExcelWriter(mode='a') instead) — confirm the pinned pandas version.
    writer = ExcelWriter(self.file_name, engine='openpyxl')

    try:
        # try to open an existing workbook
        writer.book = load_workbook(self.file_name)

        # Truncate FIRST so that any startrow computed below reflects the
        # recreated (empty) sheet. The previous version computed startrow
        # from the old sheet's max_row and truncated afterwards, leaving a
        # gap of blank rows above the appended data.
        if truncate_sheet and sheet_name in writer.book.sheetnames:
            # index of [sheet_name] sheet, so it can be recreated in place
            idx = writer.book.sheetnames.index(sheet_name)
            # remove [sheet_name]
            writer.book.remove(writer.book.worksheets[idx])
            # create an empty sheet [sheet_name] using old index
            writer.book.create_sheet(sheet_name, idx)
            if startrow is None:
                startrow = 0  # fresh sheet: write from the top

        # get the last row in the existing Excel sheet
        # if it was not specified explicitly
        if startrow is None and sheet_name in writer.book.sheetnames:
            startrow = writer.book[sheet_name].max_row

        # copy existing sheets so to_excel() writes into them in place
        writer.sheets = {ws.title: ws for ws in writer.book.worksheets}
    except FileNotFoundError:
        # file does not exist yet, we will create it
        pass

    if startrow is None:
        startrow = 0

    # write out the new sheet
    df.to_excel(writer, sheet_name, startrow=startrow, **to_excel_kwargs)

    # save the workbook
    writer.save()
def CreateFileExcel(pathin, pathout):
    # Build the general-member Excel report: read configuration and the
    # Left/Right "Genneral_All" CSVs from [pathin], write their sections into
    # the bundled 'DataALL - Template.xlsx' workbook, and save the result as
    # 'new_big_file.xlsx' in [pathout].
    #filename = "Config_Setting.csv"
    pathconf = PathSteel(dir_path=pathin, FileName="Config_Setting.csv")\
        .refpath()
    #fullpath = os.path.join(pathin,filename)
    # get path store data to handling
    Right_Genneral_All_path = PathFromFileNameAndDirpath(dir_path=pathin,
                                                         filename="Right_Genneral_All.csv")
    Left_Genneral_All_path = PathFromFileNameAndDirpath(dir_path=pathin,
                                                        filename="Left_Genneral_All.csv")
    # Configuration keys to read from Config_Setting.csv ...
    keyvalue = ["ValueGeneral",
                "Columnmove",
                "LocationCellForMoveColumn",
                "GenneralColumnNotChange",
                "GeneralConcernRaffter",
                "Genneral_Select",
                "LocationOfRowLeft",
                "LocationOfRowRight",
                "ExcelCellForMoveColumnRight",
                "LocationOfPurlin",
                "startrow",
                ]
    # ... and the row locations of those keys inside the config file
    # (keyvalue[i] is read at row locvalue[i]).
    locvalue = [12, 16, 17, 19, 20, 21, 23, 24, 25, 26, 30]
    credict_c = credict(KeyValues=keyvalue, LocConf=locvalue,
                        Config_Setting_Path=pathconf)
    credict_list = credict_c.Dictfromkeyandvalueconf()
    # unpack the configuration dict into locals (empty string if missing)
    valgen = credict_list.get("ValueGeneral", "")
    colmv = credict_list.get("Columnmove", "")
    locmvcol = credict_list.get("LocationCellForMoveColumn", "")
    gencolnotchg = credict_list.get("GenneralColumnNotChange", "")
    genconraf = credict_list.get("GeneralConcernRaffter", "")
    gensel = credict_list.get("Genneral_Select", "")
    locrowleft = credict_list.get("LocationOfRowLeft", "")
    locrowright = credict_list.get("LocationOfRowRight", "")
    excemvcolright = credict_list.get("ExcelCellForMoveColumnRight", "")
    locpur = credict_list.get("LocationOfPurlin", "")
    strow = credict_list.get("startrow", "")
    # Locate the workbook template: inside the PyInstaller bundle when frozen,
    # otherwise next to the templatexc module.
    if IsRunningInPyinstallerBundle():
        #NameFile = ExtractFileNameFromPath(DataExcel)
        DataExcel = resource_path_is_from_pyinstall_and_dev(
            FileName='DataALL - Template.xlsx', Subfolder="Data",
            Is_Directory_Path_To_SubFolder=True, dir_path=sys._MEIPASS)
    else:
        # get file excel from template excel (full path)
        DataExcel = PathSteel(modulename=templatexc,
                              FileName='DataALL - Template.xlsx')\
            .getpathmodule()
    """ DataExcel = os.path.join((os.path.dirname(templatexc.__file__)), 'DataALL - Template.xlsx') """
    # Open the template and attach it to a pandas writer so to_excel() calls
    # below edit the existing sheets in place (pandas < 1.5 pattern).
    book = load_workbook(DataExcel)
    writer = ExcelWriter(DataExcel, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    # Write the Left side first, then the Right side, into 'General Member'.
    for path in [Left_Genneral_All_path, Right_Genneral_All_path]:
        df1 = pd.read_csv(path, delimiter=',', index_col=0)
        dfCount = df1.shape  # NOTE(review): computed but never used — confirm
        # round-trip: rewrite the CSV without its original index column
        df1.to_csv(path)
        # write "General" value row to Excel at the configured start row
        dfValueGeneral = pd.read_csv(path, delimiter=',', usecols=valgen,
                                     nrows=1)
        dfValueGeneral.to_excel(writer, 'General Member', index=False,
                                header=True, startcol=0, startrow=strow[0])
        worksheet = writer.sheets['General Member']
        # create frame helper bound to this worksheet / csv pair
        excellframe = toexcel(worksheet=worksheet, path=path,
                              path_conf=pathconf,
                              lpath=Left_Genneral_All_path,
                              rpath=Right_Genneral_All_path)
        # pick the side-specific row locations and move-column target
        if path == Left_Genneral_All_path:
            usecolsArr = locrowleft
            LocationMoveColumn = locmvcol
            # write path to excel
            excellframe.wrivaltoexc()
        else:
            usecolsArr = locrowright
            LocationMoveColumn = excemvcolright
            # write path to excel
            excellframe.wrivaltoexc()
        # Write general (unchanged columns) to excel
        DfChangegenneral = pd.read_csv(path, delimiter=',',
                                       usecols=gencolnotchg, nrows=1)
        DfChangegenneral.to_excel(writer, 'General Member', index=False,
                                  header=True, startcol=0,
                                  startrow=int(usecolsArr[1]))
        # write General Concern Raffter (all rows, no nrows limit)
        DfChangegenneral = pd.read_csv(path, delimiter=',', usecols=genconraf)
        DfChangegenneral.to_excel(writer, 'General Member', index=False,
                                  header=True, startcol=0,
                                  startrow=int(usecolsArr[2]))
        # write general selected
        DfChangegenneral = pd.read_csv(path, delimiter=',', usecols=gensel,
                                       nrows=1)
        DfChangegenneral.to_excel(writer, 'General Member', index=False,
                                  header=True, startcol=0,
                                  startrow=int(usecolsArr[0]))
        # write purlin roof
        DfChangegenneral = pd.read_csv(path, delimiter=',', usecols=locpur,
                                       nrows=1)
        DfChangegenneral.to_excel(writer, 'General Member', index=False,
                                  header=True, startcol=0,
                                  startrow=int(usecolsArr[3]))
        worksheet = writer.sheets['General Member']
        # Write move Column to excel
        excellframe.writemovecol(LocationMoveColumn, colmv)
    # create full path from dirpath and save the edited workbook there
    # (saved via the openpyxl book directly; the writer is only the binding)
    path = PathSteel(dir_path=pathout, FileName='new_big_file.xlsx')\
        .refpath()
    #path = os.path.join(pathout,'new_big_file.xlsx')
    book.save(path)
def generate_ferc_intr_excel(mongo, start_date, end_date, dataframes):
    # Fill the FERC "Page 422-423" sheet of the template workbook with the
    # overhead/underground conductor and kW summaries in [dataframes], save a
    # year-stamped copy into each GRC output directory, and return its name.
    # NOTE(review): `mongo` and `start_date` are not used in this body —
    # confirm whether they are part of a shared generator signature.
    source_file = conf.FERC_TEMPLATE_FILE
    book = openpyxl.load_workbook(source_file)
    start_row_index = 23  # first data row of the conductor section in the template
    sheet_name = conf.FERC_INTR_FILE_NAME
    sheet_name = "{}_{}.xlsx".format(sheet_name, end_date.year)
    file_path = os.path.join(conf.GRC_FILE_DIRS[0], sheet_name)
    # NOTE(review): 'strings_to_numbers' is an xlsxwriter option; with the
    # openpyxl engine it is likely ignored (or rejected by newer pandas) —
    # confirm intent and pandas version.
    writer = ExcelWriter(file_path, engine='openpyxl',
                         options={'strings_to_numbers': True})
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    oh_conductor_df = dataframes.get("oh_conductor_df")
    ug_conductor_df = dataframes.get("ug_conductor_df")
    oh_kw_df = dataframes.get("oh_kw_df")
    ug_kw_df = dataframes.get("ug_kw_df")
    # Output column order; "" is a blank spacer column (added below) and is
    # intentionally repeated to leave blank template columns untouched.
    conductor_df_columns = [
        "", "TLINE_NM", "DESIGNATION_B", "RATEDKV", "NOMINAL_VOLTAGE",
        "STRUC_TYPE", "CIRCUIT_MI", "", "NUMBER_CIR", "CONDUCTOR SEGMENT"
    ]
    # --- overhead conductor rows ---
    oh_conductor_df[""] = ""
    oh_conductor_df = oh_conductor_df.apply(lambda x: set_designation(x),
                                            axis=1)
    print(oh_conductor_df)
    oh_conductor_df_new = oh_conductor_df[conductor_df_columns]
    oh_conductor_df_new.to_excel(writer, sheet_name="Page 422-423",
                                 startrow=start_row_index, startcol=0,
                                 header=False, index=False)
    # next section starts one row below the overhead block
    start_row_index = start_row_index + oh_conductor_df.shape[0] + 1
    # --- underground conductor rows ---
    ug_conductor_df[""] = ""
    ug_conductor_df = ug_conductor_df.apply(lambda x: set_designation(x),
                                            axis=1)
    ug_conductor_df_new = ug_conductor_df[conductor_df_columns]
    ug_conductor_df_new.to_excel(writer, sheet_name="Page 422-423",
                                 startrow=start_row_index, startcol=0,
                                 header=False, index=False)
    # 5-row gap before the kW summary section
    start_row_index = start_row_index + ug_conductor_df.shape[0] + 5
    # --- overhead kW summary (formatted as display strings) ---
    oh_kw_df[""] = ""
    oh_kw_df = oh_kw_df[["", "", "", "RATEDKV", "", "", "SUM_Mi"]]
    oh_kw_df["RATEDKV"] = oh_kw_df["RATEDKV"].apply(
        lambda x: str(x) + "KV" if x else "")
    oh_kw_df["SUM_Mi"] = oh_kw_df["SUM_Mi"].apply(
        lambda x: "{:.2f}".format(x) if x else x)
    oh_kw_df.to_excel(writer, sheet_name="Page 422-423",
                      startrow=start_row_index, startcol=0,
                      header=False, index=False)
    old_start_row_index = start_row_index
    start_row_index = start_row_index + oh_kw_df.shape[0] + 2
    # --- underground kW summary ---
    # (formatted before the column re-selection, unlike the overhead block;
    #  the result is the same since the selection keeps both columns)
    ug_kw_df[""] = ""
    ug_kw_df["RATEDKV"] = ug_kw_df["RATEDKV"].apply(
        lambda x: str(x) + "KV" if x else "")
    ug_kw_df["SUM_Mi"] = ug_kw_df["SUM_Mi"].apply(
        lambda x: "{:.2f}".format(x) if x else x)
    ug_kw_df = ug_kw_df[["", "", "", "RATEDKV", "", "", "SUM_Mi"]]
    ug_kw_df.to_excel(writer, sheet_name="Page 422-423",
                      startrow=start_row_index, startcol=0,
                      header=False, index=False)
    max_rows = start_row_index + ug_kw_df.shape[0] + 2
    # apply project styling (borders/labels/year header) to the filled region
    formatter.format_ferc_report(writer.sheets["Page 422-423"],
                                 old_start_row_index, start_row_index,
                                 max_rows, end_date.year)
    writer.save()
    # fan the finished report out to the remaining output directories
    for ffname in conf.GRC_FILE_DIRS[1:]:
        dest_path = os.path.join(ffname, sheet_name)
        shutil.copy(file_path, dest_path)
    return sheet_name
def main():
    # Assignment-4 driver (Python 2 — note the `print` statements): trains
    # binary and 6-class linear classifiers via the pseudoinverse, evaluates
    # them on the training data, classifies the held-out sheet, and writes
    # all results back into a copy of the assignment workbook.
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 4/Assignment_4_Data_and_Template_Original.xlsx"
    sheetname = "Training Data"
    # df = readExcel(excelfile, sheetname=sheetname, startrow=2, endrow=6601, endcol=17)
    # print df
    # print df[0]
    # print df[-1]
    # feature matrix (columns 1..15 of the training sheet)
    X = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2,
                           endrow=6601, endcol=15), dtype=np.int32)
    # print X
    # augment with a leading column of ones (bias term)
    Xa = np.insert(X, 0, 1, axis=1)
    # print Xa
    # binary target (column 16)
    T = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2,
                           endrow=6601, startcol=16, endcol=16),
                 dtype=np.int32)
    # print T
    # multi-class target as a class index 0..5 (column 17)
    T2n = np.array(readExcel(excelfile, sheetname=sheetname, startrow=2,
                             endrow=6601, startcol=17, endcol=17),
                   dtype=np.int32)
    # print T2n
    # one-vs-rest encoding: +1 for the true class, -1 elsewhere
    mapping = [
        [1, -1, -1, -1, -1, -1],
        [-1, 1, -1, -1, -1, -1],
        [-1, -1, 1, -1, -1, -1],
        [-1, -1, -1, 1, -1, -1],
        [-1, -1, -1, -1, 1, -1],
        [-1, -1, -1, -1, -1, 1]
    ]
    T2 = np.array([mapping[row[0]] for row in T2n], dtype=np.int32)
    # print T2
    # least-squares weights via the Moore-Penrose pseudoinverse
    Xapi = np.linalg.pinv(Xa)
    # print Xapi
    W = np.dot(Xapi, T)    # binary classifier weights
    # print W
    W2 = np.dot(Xapi, T2)  # 6-class classifier weights
    # print W2
    # classify the held-out sheet
    sheetname = "To be classified"
    Xt = np.array(readExcel(excelfile, sheetname=sheetname, startrow=5,
                            endrow=54, endcol=15), dtype=np.int32)
    # print Xt
    # print Xt[0]
    Xta = np.insert(Xt, 0, 1, axis=1)
    # print Xta
    T_testing = np.dot(Xta, W)
    # print T_testing
    # threshold the linear output at 0 to get the +/-1 label
    T_testing = np.array([1 if T_testing[i][0] > 0 else -1
                          for i in range(len(T_testing))], dtype=np.int32)
    # print T_testing
    T2_testing = np.dot(Xta, W2)
    # print T2_testing
    # multi-class decision: highest one-vs-rest score wins
    T2_testing = np.argmax(T2_testing, axis=1)
    # print T2_testing
    # re-classify the training data to measure training performance
    T_validating = np.dot(Xa, W)
    # print T_validating
    T_validating = np.array([1 if T_validating[i][0] > 0 else -1
                             for i in range(len(T_validating))],
                            dtype=np.int32)
    # print T_validating
    T2_validating = np.dot(Xa, W2)
    # print T2_validating
    T2_validating = np.argmax(T2_validating, axis=1)
    # print T2_validating
    # binary-classifier confusion matrix (rows: truth, cols: prediction;
    # index 1 = positive class)
    bccm = np.zeros((2, 2), dtype=np.int32)
    for i in range(len(T)):
        row = 1 if T[i, 0] == 1 else 0
        col = 1 if T_validating[i] == 1 else 0
        # print row, col
        bccm[row, col] += 1
    print bccm
    # accuracy, sensitivity (TPR), specificity (TNR), precision (PPV)
    metrics = [
        np.divide(bccm[0, 0] + bccm[1, 1],
                  bccm[0, 0] + bccm[0, 1] + bccm[1, 0] + bccm[1, 1],
                  dtype=np.float64),
        np.divide(bccm[1, 1], bccm[1, 0] + bccm[1, 1], dtype=np.float64),
        np.divide(bccm[0, 0], bccm[0, 0] + bccm[0, 1], dtype=np.float64),
        np.divide(bccm[1, 1], bccm[0, 1] + bccm[1, 1], dtype=np.float64)
    ]
    print metrics
    # multi-class confusion matrix
    mcccm = np.zeros((6, 6), dtype=np.int32)
    for i in range(len(T2n)):
        row = T2n[i, 0]
        col = T2_validating[i]
        # print row, col
        mcccm[row, col] += 1
    print mcccm
    # per-class positive predictive value = diagonal / column sum
    sumup = np.sum(mcccm, axis=0)
    # print sumup
    ppv = np.zeros(6)
    for i in range(6):
        ppv[i] = np.divide(mcccm[i, i], sumup[i], dtype=np.float64)
    print ppv
    # best and worst PPV with the class index they belong to
    ppvmetrics = [[max(ppv), np.argmax(ppv)], [min(ppv), np.argmin(ppv)]]
    print ppvmetrics
    # write everything back into the submission workbook, preserving its
    # existing sheets (pandas < 1.5 append pattern)
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 4/Assignment_4_Data_and_Template_Updated.xlsx"
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    sheetname = "Classifiers"
    # print W
    df = DataFrame(W)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=0,
                header=False, index=False)
    # print W2
    df = DataFrame(W2)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=4,
                header=False, index=False)
    sheetname = "To be classified"
    # print Tt
    df = DataFrame(T_testing)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=15,
                header=False, index=False)
    # print Tt2
    df = DataFrame(T2_testing)
    df.to_excel(writer, sheet_name=sheetname, startrow=4, startcol=16,
                header=False, index=False)
    sheetname = "Performance"
    # print bccm
    df = DataFrame(bccm)
    df.to_excel(writer, sheet_name=sheetname, startrow=9, startcol=2,
                header=False, index=False)
    # print metrics
    df = DataFrame(metrics)
    df.to_excel(writer, sheet_name=sheetname, startrow=7, startcol=6,
                header=False, index=False)
    # print mcccm
    df = DataFrame(mcccm)
    df.to_excel(writer, sheet_name=sheetname, startrow=18, startcol=2,
                header=False, index=False)
    # print ppvmetrics
    df = DataFrame(ppvmetrics)
    df.to_excel(writer, sheet_name=sheetname, startrow=19, startcol=11,
                header=False, index=False)
    writer.save()
    writer.close()
def main():
    # Assignment-3 driver (Python 2 — note the `print` statements): PCA on
    # MNIST digits 6 vs 9, then 2-D histogram and Gaussian (Bayesian)
    # classifiers on the first two principal components; results are written
    # into the submission workbook.
    selected_digits = [6, 9]
    X, T = load_mnist("training", selected_digits)
    X = np.array(X, np.float64)
    T = np.array(T, np.float64)
    # print type(X)
    # print type(T)
    # print X.shape
    # print T.shape
    # Xn = np.array([[72., 101., 94.], [50., 96., 70.], [14., 79., 10.], [8., 70., 1.]], np.float64)
    # U: mean vector, V: principal directions, P: principal components
    # (presumably — names follow the course helper XZCVPR; confirm)
    U, V, P = XZCVPR(X, T, selected_digits)
    # class sample counts (priors)
    n = [0, 0]
    n[0] = len(T[T == selected_digits[0]])
    n[1] = len(T[T == selected_digits[1]])
    # B = int(np.ceil(np.log2(n[0] + n[1]))) + 1
    B = 32  # fixed number of histogram bins per dimension
    # print B
    # keep only the first two principal components
    Prec = P[:, 0:2]
    # print Prec[:, 0]
    # print Prec[:, 1]
    histograms, maxs, mins = build_2d_histograms(Prec, T, B, selected_digits)
    mus, Sigmas = build_2d_beysian(Prec, T, selected_digits)
    # project the test set with the training-set PCA basis
    Xt, Tt = load_mnist("testing", selected_digits)
    Xt = np.array(Xt, np.float64)
    Tt = np.array(Tt, np.float64)
    # print Tt
    # Get principal component matrix of Xt
    Pt = PCA(Xt, U, V)
    # print Pt
    # Get reduced principal component matrix of Pt
    Ptrec = Pt[:, 0:2]
    # print Ptrec[0]
    # print Ptrec[1]
    # per-sample (label, posterior) for the histogram and Bayesian classifiers
    Tth = np.zeros((len(Tt), 2))
    Ttb = np.zeros((len(Tt), 2))
    for i in range(len(Tt)):
        ccps_by_histograms = get_ccps_by_2d_histograms(B, histograms, maxs,
                                                       mins, n, Ptrec[i])
        # print ccps_by_histograms
        pps_by_histograms = posterior_probability(ccps_by_histograms, n)
        # print pps_by_histograms
        if pps_by_histograms[0] == pps_by_histograms[1]:
            Tth[i] = (-1., 0.)  # tie: undecidable marker
        else:
            if pps_by_histograms[0] > pps_by_histograms[1]:
                Tth[i] = (str(selected_digits[0]), pps_by_histograms[0])
            else:
                Tth[i] = (str(selected_digits[1]), pps_by_histograms[1])
        ccps_by_beysian = get_ccps_by_2d_beysian(Ptrec[i], mus, Sigmas)
        # print ccps_by_beysian
        pps_by_beysian = posterior_probability(ccps_by_beysian, n)
        # print pps_by_beysian
        if pps_by_beysian[0] == pps_by_beysian[1]:
            # NOTE(review): Ttb is a float ndarray, so assigning the string
            # "Undecidable" would raise ValueError if this tie branch is ever
            # hit (the histogram branch above uses -1. instead) — confirm.
            Ttb[i] = ("Undecidable", 0.)
        else:
            if pps_by_beysian[0] > pps_by_beysian[1]:
                Ttb[i] = (str(selected_digits[0]), pps_by_beysian[0])
            else:
                Ttb[i] = (str(selected_digits[1]), pps_by_beysian[1])
    # print Tth
    # print Ttb
    # print len(Tt[Tt == Tth])
    # print len(Tt[Tt == Ttb])
    # pick one sample of each class for the reconstruction demo
    idxp = 2
    idxn = 1
    # vector_to_img(Xt[idxp], Xt[idxn])
    xp = Xt[idxp]
    xn = Xt[idxn]
    # print xp
    # print xn
    # center, project to 2 components, reconstruct, and un-center
    zp = xp - U
    zn = xn - U
    # print zp
    # print zn
    Pp = np.dot(zp, V.T)
    Pn = np.dot(zn, V.T)
    # print Pp
    # print Pn
    Precp = Pp[0:2]
    Precn = Pn[0:2]
    # print Precp
    # print Precn
    Rp = np.dot(Precp, V[0:2, :])
    Rn = np.dot(Precn, V[0:2, :])
    # print Rp
    # print Rn
    Xrecp = Rp + U
    Xrecn = Rn + U
    # print Xrecp
    # print Xrecn
    # classifier outputs for the two demo samples
    Thp = Tth[idxp]
    Tbp = Ttb[idxp]
    # print Thp
    # print Tbp
    Thn = Tth[idxn]
    Tbn = Ttb[idxn]
    # print Thn
    # print Tbn
    # count correct predictions for both classifiers
    # (assumes Tt rows are indexable with [0] — i.e. column-vector labels;
    # confirm against load_mnist)
    ch = 0
    cb = 0
    for i in range(len(Tt)):
        # print Tth[i][0], Tt[i][0], Tth[i][0] == Tt[i][0]
        if Tth[i][0] == Tt[i][0]:
            ch += 1
        # print Ttb[i][0], Tt[i][0], Ttb[i][0] == Tt[i][0]
        if Ttb[i][0] == Tt[i][0]:
            cb += 1
    # print len(Tt), ch, cb
    accuracyh = np.divide(ch, len(Tt), dtype=np.float64)
    accuracyb = np.divide(cb, len(Tt), dtype=np.float64)
    # print accuracyh
    # print accuracyb
    # write all results into the submission workbook, preserving its sheets
    # (pandas < 1.5 append pattern)
    from pandas import DataFrame, ExcelWriter
    from openpyxl import load_workbook
    excelfile = "/Users/ChiYuChen/Intro to Machine Learning and Data Mining/Assignment 3/Assignment_3_Submission_Template_Copy.xlsx"
    sheetname = "Results"
    book = load_workbook(excelfile)
    writer = ExcelWriter(excelfile, engine='openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    # print [U]
    df = DataFrame([U])
    df.to_excel(writer, sheet_name=sheetname, startrow=1, startcol=1,
                header=False, index=False)
    # print [V[0, :]]
    df = DataFrame([V[0, :]])
    df.to_excel(writer, sheet_name=sheetname, startrow=2, startcol=1,
                header=False, index=False)
    # print [V[1, :]]
    df = DataFrame([V[1, :]])
    df.to_excel(writer, sheet_name=sheetname, startrow=3, startcol=1,
                header=False, index=False)
    # print [n[0]]
    df = DataFrame([n[0]])
    df.to_excel(writer, sheet_name=sheetname, startrow=5, startcol=1,
                header=False, index=False)
    # print [n[1]]
    df = DataFrame([n[1]])
    df.to_excel(writer, sheet_name=sheetname, startrow=6, startcol=1,
                header=False, index=False)
    # print [mus[0]]
    df = DataFrame([mus[0]])
    df.to_excel(writer, sheet_name=sheetname, startrow=8, startcol=1,
                header=False, index=False)
    # print [mus[1]]
    df = DataFrame([mus[1]])
    df.to_excel(writer, sheet_name=sheetname, startrow=9, startcol=1,
                header=False, index=False)
    # print Sigmas[0]
    df = DataFrame(Sigmas[0])
    df.to_excel(writer, sheet_name=sheetname, startrow=11, startcol=1,
                header=False, index=False)
    # print Sigmas[1]
    df = DataFrame(Sigmas[1])
    df.to_excel(writer, sheet_name=sheetname, startrow=13, startcol=1,
                header=False, index=False)
    # print maxs[0], mins[0]
    df = DataFrame([[maxs[0], mins[0]]])
    df.to_excel(writer, sheet_name=sheetname, startrow=16, startcol=1,
                header=False, index=False)
    # print maxs[1], mins[1]
    df = DataFrame([[maxs[1], mins[1]]])
    df.to_excel(writer, sheet_name=sheetname, startrow=17, startcol=1,
                header=False, index=False)
    # print histograms[0]
    # print histograms[0][0, :]
    df = DataFrame(histograms[0])
    df.to_excel(writer, sheet_name=sheetname, startrow=19, startcol=1,
                header=False, index=False)
    # print histograms[1]
    # print histograms[1][0, :]
    df = DataFrame(histograms[1])
    df.to_excel(writer, sheet_name=sheetname, startrow=52, startcol=1,
                header=False, index=False)
    # print xp
    df = DataFrame([xp])
    df.to_excel(writer, sheet_name=sheetname, startrow=87, startcol=1,
                header=False, index=False)
    # print zp
    df = DataFrame([zp])
    df.to_excel(writer, sheet_name=sheetname, startrow=88, startcol=1,
                header=False, index=False)
    # print Precp
    df = DataFrame([Precp])
    df.to_excel(writer, sheet_name=sheetname, startrow=89, startcol=1,
                header=False, index=False)
    # print Rp
    df = DataFrame([Rp])
    df.to_excel(writer, sheet_name=sheetname, startrow=90, startcol=1,
                header=False, index=False)
    # print Xrecp
    df = DataFrame([Xrecp])
    df.to_excel(writer, sheet_name=sheetname, startrow=91, startcol=1,
                header=False, index=False)
    # print xn
    df = DataFrame([xn])
    df.to_excel(writer, sheet_name=sheetname, startrow=93, startcol=1,
                header=False, index=False)
    # print zn
    df = DataFrame([zn])
    df.to_excel(writer, sheet_name=sheetname, startrow=94, startcol=1,
                header=False, index=False)
    # print Precn
    df = DataFrame([Precn])
    df.to_excel(writer, sheet_name=sheetname, startrow=95, startcol=1,
                header=False, index=False)
    # print Rn
    df = DataFrame([Rn])
    df.to_excel(writer, sheet_name=sheetname, startrow=96, startcol=1,
                header=False, index=False)
    # print Xrecn
    df = DataFrame([Xrecn])
    df.to_excel(writer, sheet_name=sheetname, startrow=97, startcol=1,
                header=False, index=False)
    # print selected_digits[0]
    df = DataFrame([selected_digits[0]])
    df.to_excel(writer, sheet_name=sheetname, startrow=101, startcol=1,
                header=False, index=False)
    # print Thp[0], Thp[1]
    df = DataFrame([[Thp[0], Thp[1]]])
    df.to_excel(writer, sheet_name=sheetname, startrow=102, startcol=1,
                header=False, index=False)
    # print Tbp[0], Tbp[1]
    df = DataFrame([[Tbp[0], Tbp[1]]])
    df.to_excel(writer, sheet_name=sheetname, startrow=103, startcol=1,
                header=False, index=False)
    # print selected_digits[1]
    df = DataFrame([selected_digits[1]])
    df.to_excel(writer, sheet_name=sheetname, startrow=105, startcol=1,
                header=False, index=False)
    # print Thn[0], Thn[1]
    df = DataFrame([[Thn[0], Thn[1]]])
    df.to_excel(writer, sheet_name=sheetname, startrow=106, startcol=1,
                header=False, index=False)
    # print Tbn[0], Tbn[1]
    df = DataFrame([[Tbn[0], Tbn[1]]])
    df.to_excel(writer, sheet_name=sheetname, startrow=107, startcol=1,
                header=False, index=False)
    # print accuracyh
    df = DataFrame([[accuracyh]])
    df.to_excel(writer, sheet_name=sheetname, startrow=110, startcol=1,
                header=False, index=False)
    # print accuracyb
    df = DataFrame([[accuracyb]])
    df.to_excel(writer, sheet_name=sheetname, startrow=111, startcol=1,
                header=False, index=False)
    writer.save()
    writer.close()