def DFtoExcel(df, FolderName, FileName):
    """Write three columns of *df* to a workbook and hyperlink column B.

    The "FileName", "hyperlink" and "Sheet Name" columns are written to the
    sheet "Result" of ``<FileName>.xlsx`` inside the ``FolderName`` sub-folder
    of the main Cell_Search_By_Key folder (created when missing). The saved
    workbook is then reopened and each cell of column B, starting at B2, gets
    a hyperlink whose target and display text are the cell's own value.

    Side effect: the process working directory is changed to the output folder.

    :param df: DataFrame holding at least the three columns listed above.
    :param FolderName: Name of the sub-folder that receives the workbook.
    :param FileName: Workbook name without the ``.xlsx`` extension.
    :return: The string "FINISH".
    """
    wanted_columns = ["FileName", "hyperlink", "Sheet Name"]
    write_df = df.loc[:, wanted_columns]

    # Path Cell_Search_By_Key
    MainFolder = "C:\\Cell_Search_By_Key"
    FolderPath = os.path.join(MainFolder, FolderName)
    if not os.path.exists(FolderPath):
        os.makedirs(FolderPath)
    os.chdir(FolderPath)

    ExcelName = "%s.xlsx" % FileName
    writer = ExcelWriter(ExcelName)
    write_df.to_excel(writer, "Result", index=False)
    writer.save()

    # turn path into hyperlink
    wb = Workbook(os.path.join(FolderPath, ExcelName))
    column_values = Range("B2").vertical.value
    for row_number, _ in enumerate(column_values, 2):
        link_cell = "B%d" % row_number
        name_cell = "A%d" % row_number
        address = Range(link_cell).value
        display_name = Range(name_cell).value  # read from the sheet but not used below
        try:
            Range(link_cell).add_hyperlink(address, text_to_display=address)
        except:
            # best effort: a cell that cannot be linked is left untouched
            pass
    wb.save()
    wb.close()
    return "FINISH"
def generate_report(title, description):
    """Generate Excel 1997 file from query.

    The data frame loaded for the current request is cut down to the column
    limit of the Excel 97 format, written to a temporary file with the xlwt
    engine, post-processed by ``add_header_and_footer`` and returned as an
    attachment. The temporary file is always closed (and thereby removed),
    even when writing or post-processing fails.

    :param title: Query title.
    :param description: Query description.
    :return: Response with Excel 1997 attachment.
    """
    df = load_data_frame(request)

    # Limit the columns to the maximum allowed in Excel 97. The slice stops
    # one short of the end, so the last column of the frame is always kept.
    max_length = 255
    index_len = len(df.index.names)
    lim_df = df.drop(df.columns[max_length - index_len - 1:len(df.columns) - 1], axis=1)

    extension = 'xls'
    engine = 'xlwt'
    encoding = 'utf-8'
    content_type = 'application/vnd.ms-excel'

    # Add content and read it back
    f = NamedTemporaryFile(suffix=extension)
    try:
        ew = ExcelWriter(f.name, engine=engine, encoding=encoding)
        lim_df.to_excel(ew)
        ew.save()

        show_legend = request.REQUEST.get('show_legend', '')
        table_description = request.REQUEST.get('table_description', '')
        add_header_and_footer(f.name, title, description, show_legend, table_description)

        data = f.read()
    finally:
        # Closing a NamedTemporaryFile also deletes it (default delete=True).
        f.close()

    title = title.strip().encode("UTF-8").replace(" ", '_')
    if len(title) > max_length_filename:
        title = title[:max_length_filename]
    filename = '%s.%s' % (title, extension)

    # Setup response
    response = HttpResponse(data)
    response["Content-Type"] = content_type
    response["Content-status_code"] = 200
    response['Content-Transfer-Encoding'] = 'binary'
    response['Content-Disposition'] = 'attachment; filename="%s"' % filename
    return response
class Excel(object):
    """Benchmark Excel reading and writing through in-memory buffers.

    Each timing method is parameterised by the writer engine listed in
    ``params``.
    """

    goal_time = 0.2
    params = ['openpyxl', 'xlsxwriter', 'xlwt']
    param_names = ['engine']

    def setup(self, engine):
        """Build the sample frame, a pre-filled read buffer and an empty writer."""
        n_rows = 2000
        n_cols = 5
        float_columns = ['float{}'.format(i) for i in range(n_cols)]
        hourly_index = date_range('20000101', periods=n_rows, freq='H')
        self.df = DataFrame(np.random.randn(n_rows, n_cols),
                            columns=float_columns,
                            index=hourly_index)
        self.df['object'] = tm.makeStringIndex(n_rows)

        # Buffer that already holds a saved workbook, used by the read timing.
        self.bio_read = BytesIO()
        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
        self.df.to_excel(self.writer_read, sheet_name='Sheet1')
        self.writer_read.save()
        self.bio_read.seek(0)

        # Fresh buffer and writer, used by the write timing.
        self.bio_write = BytesIO()
        self.bio_write.seek(0)
        self.writer_write = ExcelWriter(self.bio_write, engine=engine)

    def time_read_excel(self, engine):
        """Time reading the prepared workbook back from its buffer."""
        read_excel(self.bio_read)

    def time_write_excel(self, engine):
        """Time writing the sample frame and saving the workbook."""
        self.df.to_excel(self.writer_write, sheet_name='Sheet1')
        self.writer_write.save()
def create_output(regression_dist_dict, closest_curve_dict, reactor_name, name_add):
    '''Converts the dictionaries into dataframes and saves them as an excel file.

    The total results go on the first sheet ("Sheet1") and the closest curves
    on the second ("Sheet2") of
    ``data/<reactor names joined by "_">_regression_results_<name_add>.xlsx``.

    regression_dist_dict: maps a key to a sequence of
        (reactor, enrichment, distance) rows; every row is indexed by its key.
    closest_curve_dict: maps the same keys to one (reactor, enrichment,
        distance) row each.
    reactor_name: iterable whose items are joined with "_" in the file name.
    name_add: suffix used in the file name.

    Empty dictionaries produce two empty sheets that only carry the column
    headers.
    '''
    columns = ['reactor', 'enrichment', 'distance']
    print(regression_dist_dict)

    # Collect one frame per key and concatenate once at the end; this avoids
    # re-copying the accumulated frame on every iteration.
    total_frames = []
    closest_frames = []
    for key in regression_dist_dict:
        rows = regression_dist_dict[key]
        total_frames.append(
            pd.DataFrame(rows, index=[key] * len(rows), columns=columns))
        closest_frames.append(
            pd.DataFrame([closest_curve_dict[key]], index=[key], columns=columns))

    if total_frames:
        total_results = pd.concat(total_frames)
        closest_results = pd.concat(closest_frames)
    else:
        # pd.concat rejects an empty list, so build the empty frames directly.
        total_results = pd.DataFrame(columns=columns)
        closest_results = pd.DataFrame(columns=columns)

    print('total_results %s' % (total_results,))
    print('closest_results %s' % (closest_results,))

    file_name = 'data/%s_regression_results_%s.xlsx' % ('_'.join(map(str, reactor_name)), name_add)
    # The context manager saves the workbook on exit and releases it on error.
    with ExcelWriter(file_name) as writer:
        total_results.to_excel(writer, sheet_name='Sheet1')
        closest_results.to_excel(writer, sheet_name='Sheet2')
def build_aggregates():
    """Compute the aggregates for 2006 to 2009 and store one sheet per year.

    For every year a survey simulation is configured, its 'loyer' variable is
    inflated with the factor from ``get_loyer_inflator`` and the aggregates are
    computed. Each aggregate frame is printed and written to a sheet named
    after the year in the workbook ``fname_all``, which is saved at the end.
    """
    import gc

    writer = None
    for year in range(2006, 2010):
        yr = str(year)
        # fname = "Agg_%s.%s" %(str(yr), "xls")
        simu = SurveySimulation()
        simu.set_config(year=yr)
        simu.set_param()
        simu.set_survey()
        simu.inflate_survey({'loyer': get_loyer_inflator(year)})
        simu.compute()

        agg = Aggregates()
        agg.set_simulation(simu)
        agg.compute()

        # The workbook is only created once the first year has been computed.
        if writer is None:
            writer = ExcelWriter(str(fname_all))
        agg.aggr_frame.to_excel(writer, yr, index=False, header=True, float_format="%.2f")
        print(agg.aggr_frame.to_string())

        # Drop the per-year objects before starting the next simulation.
        del simu
        del agg
        gc.collect()
    writer.save()
def diag_aggregates():
    """Collect the relative-difference columns of every yearly sheet.

    Reads the sheets '2006' to '2009' of the workbook ``fname_all``, keeps the
    measure name and the two relative-difference columns, indexes the rows by
    (row number, measure, year), stacks all years, sorts on the first index
    level and writes the result to the sheet "diagnostics" of
    ``<fname_all minus its last five characters>_diag.xlsx``.
    """
    cols = [u"Mesure",
            u"Dépense \n(millions d'€)",
            u"Bénéficiaires \n(milliers)",
            u"Dépenses \nréelles \n(millions d'€)",
            u"Bénéficiaires \nréels \n(milliers)",
            u"Diff. relative \nDépenses",
            u"Diff. relative \nBénéficiaires"]
    selected_cols = [u"Mesure", u"Diff. relative \nDépenses", u"Diff. relative \nBénéficiaires"]

    df_final = None
    for yr in ['2006', '2007', '2008', '2009']:
        xls = ExcelFile(fname_all)
        df = xls.parse(yr, hindex_col=True)
        df = df[selected_cols]
        df['year'] = yr
        df['num'] = range(len(df.index))
        df = df.set_index(['num', u'Mesure', 'year'])
        if df_final is not None:
            df_final = df_final.append(df, ignore_index=False)
        else:
            df_final = df

    # DataFrame.groupby()
    df_final = df_final.sortlevel(0)

    diag_name = str(fname_all)[:-5] + '_diag.xlsx'
    print(diag_name)
    writer = ExcelWriter(diag_name)
    df_final.to_excel(writer, sheet_name="diagnostics", float_format="%.2f")
    writer.save()
def main():
    """Command-line entry point of the fantasy data visualization.

    Loads the cached statistics from ``.cache/res_avg.csv``, optionally dumps
    them to ``.cache/res_avg.xlsx`` (``-e``), plots the players whose ids were
    given and prints the top rows ordered by ``avg_2015`` (``-d`` rows).
    """
    parser = argparse.ArgumentParser(description='Fantasy Data Visualization')
    parser.add_argument('players', metavar='PLAYER', type=int, nargs='*',
                        help='ids of players to display')
    parser.add_argument('-d', '--display', type=int, choices=[10, 25, 50],
                        default=10, help='number of rows to display')
    parser.add_argument('-e', '--excel', dest='excel', action='store_true',
                        default=False, help='to excel')
    args = parser.parse_args()

    show = int(args.display)  # number of stats to show
    stats = pd.DataFrame.from_csv('.cache/res_avg.csv')

    # write all stats to excel file
    if args.excel:
        writer = ExcelWriter('.cache/res_avg.xlsx')
        stats.to_excel(writer, 'Sheet1')
        writer.save()

    # display plot
    if len(args.players) > 0:
        plot(stats=stats, players=args.players)

    # print short summary
    ranked = stats.sort_values(by=['avg_2015'], ascending=[False])
    print(ranked.head(show))
def save_xls_name(list_dfs, xls_path, sheet_name):
    '''Save a list of dataframes to one excel file, one named sheet each.

    list_dfs: dataframes to save. They are modified in place: non-ASCII
        characters are stripped from the index and from every column.
    xls_path: path of the excel file to write.
    sheet_name: sequence of sheet names; ``sheet_name[n]`` names the sheet of
        ``list_dfs[n]``.
    '''
    # remove non-ascii characters from the dataframes for saving
    for df in list_dfs:
        df.index = remove_non_ascii(df.index)
        for col in df.columns:
            df[col] = remove_non_ascii(df[col])

    # save the df's to an excel file
    writer = ExcelWriter(xls_path)
    for n, df in enumerate(list_dfs):
        df.to_excel(writer, sheet_name[n])
    writer.save()


def remove_non_ascii(col):
    '''Return the items of *col* as a list with non-ASCII characters removed.

    Text items lose every character whose code point is above 127. Items that
    are not text (numbers, None, ...) are returned unchanged.
    '''
    new_index = []
    for name in col:
        try:
            for letter in name:
                # ASCII ends at code point 127
                if ord(letter) > 127:
                    name = name.replace(letter, '')
        except (TypeError, AttributeError):
            # not a text value: keep it as it is
            pass
        new_index.append(name)
    return new_index
def data_total(DocName, HistoryPath, SavePath):
    """Merge the 'Sheet1' of every workbook in a folder into one table.

    Rows without a ``ProductID`` and duplicated rows are dropped. The result is
    saved twice in ``SavePath``: as ``<DocName>.xlsx`` (sheet 'Sheet1') and as
    the ';'-separated, utf-8 encoded ``<DocName>.txt``. Files that raise an
    ``IOError`` while being read are reported and skipped.

    :param DocName: Base name of the two output files.
    :param HistoryPath: Folder holding the workbooks to merge.
    :param SavePath: Folder that receives the output files.
    """
    TotalData = pd.DataFrame()
    for entry in os.listdir(HistoryPath):
        historyfile = os.path.join(HistoryPath, entry)
        try:
            HistorySheet = pd.ExcelFile(historyfile).parse('Sheet1', skiprows=0, index=None)
            TotalData = TotalData.append(HistorySheet)
        except IOError:
            print("Cannot read " + str(historyfile))

    TotalData.dropna(subset=['ProductID'], inplace=True)
    TotalData.drop_duplicates(inplace=True)

    excel_target = os.path.join(SavePath, DocName + '.xlsx')
    writer = ExcelWriter(excel_target)
    TotalData.to_excel(writer, 'Sheet1', index=False)
    writer.save()

    text_target = os.path.join(SavePath, DocName + '.txt')
    TotalData.to_csv(text_target, sep=';', index=False, encoding='utf-8')
def save_table(self, directory=None, filename=None, table_format=None):
    '''
    Saves the table to some format

    directory: target folder, the current directory when None.
    filename: target file name; when None it is built as
        ``Aggregates_<dd-mm-YYYY>.<table_format>``.
    table_format: 'xls' or 'csv'. When None it is 'csv' for a file name
        ending in '.csv' and 'xls' in every other case.

    The 'xls' format writes the sheets "aggregates" and "description", the
    'csv' format writes the aggregates only. An explicitly passed format other
    than these two saves nothing.

    Raises Exception("Aggregates: Error saving file", <message>) when saving
    fails.
    '''
    now = datetime.now()
    if table_format is None:
        # Always settle on a concrete format so that a file is written.
        if filename is not None and filename[-4:] == '.csv':
            table_format = 'csv'
        else:
            table_format = 'xls'

    if directory is None:
        directory = "."
    if filename is None:
        filename = 'Aggregates_%s.%s' % (now.strftime('%d-%m-%Y'), table_format)

    fname = os.path.join(directory, filename)

    try:
        df = self.aggr_frame
        if table_format == "xls":
            writer = ExcelWriter(str(fname))
            df.to_excel(writer, "aggregates", index=False, header=True)
            descr = self.create_description()
            descr.to_excel(writer, "description", index=False, header=False)
            writer.save()
        elif table_format == "csv":
            # to_csv has no sheet name; its second positional argument is the
            # separator, so nothing may be passed there.
            df.to_csv(fname, index=False, header=True)
    except Exception as e:
        raise Exception("Aggregates: Error saving file", str(e))
def AddSeqComp(mypath):
    """
    Loads TestLogsAll.h5 from the specified path, then calls
    MeasurementGroupTools.AddSeqComp for every data sub-directory listed in
    the log's 'DirName' column.

    Afterwards the log is stored back into TestLogsAll.h5 and exported to
    TestLogsAll.xlsx (sheet 'TestLogsAll') in the same directory.

    Input:
        Directory of the measurement campaign, e.g.: "aLabView2"
    """
    from pandas import HDFStore, ExcelWriter
    import MeasurementGroupTools as mgt

    base = mypath + "\\"
    h5logs = HDFStore(base + 'TestLogsAll.h5')
    TestLog = h5logs['TestLogsAll']

    for dname in TestLog[u'DirName'].unique():
        print("Processing: " + dname)
        mgt.AddSeqComp(base + dname, TestLog, dname)

    h5logs.put('TestLogsAll', TestLog)
    h5logs.close()

    writer = ExcelWriter(base + 'TestLogsAll.xlsx')
    TestLog.to_excel(writer, 'TestLogsAll')  # the second argument defines sheet name
    writer.save()
    return
def dataIO(self, args):
    """
    Write the raw data frames to ``<args.logFile>.xlsx`` with a linked index.

    The workbook starts with the empty sheets "Reports" and "Contents". Every
    raw frame named '1' to '12' that exists gets its own sheet plus an internal
    link on the "Contents" sheet; frames '2', '3', '5', '6', '8', '9', '11'
    and '12' are additionally passed to ``self.refine``.
    ``self.sheetLinkRow`` is advanced for every line written to "Contents".
    """
    writer = ExcelWriter("{}.xlsx".format(args.logFile), engine='xlsxwriter')
    pd.DataFrame().to_excel(writer, sheet_name="Reports")
    pd.DataFrame().to_excel(writer, sheet_name="Contents")

    contentSheet = writer.sheets["Contents"]
    title_cell = xl_rowcol_to_cell(self.sheetLinkRow, 0)
    contentSheet.write_string(
        title_cell,
        "link list for all choices and sub refines".format(args.logFile))
    self.sheetLinkRow += 1

    refined_names = ('2', '3', '5', '6', '8', '9', '11', '12')
    for number in range(1, 13):
        dfname = str(number)
        if dfname not in self._rawdf.keys():
            continue
        print("--save raw data for {}".format(dfname))
        self._rawdf[dfname].to_excel(writer, "{}".format(dfname))
        link_format = writer.book.add_format({'color': 'blue', 'underline': 1})
        link_cell = xl_rowcol_to_cell(self.sheetLinkRow, 0)
        contentSheet.write_url(link_cell, "internal:{}!A1".format(dfname), link_format, dfname)
        self.sheetLinkRow += 1
        if dfname in refined_names:
            self.refine(args, writer, dfname)

    # Close the Pandas Excel writer and output the Excel file.
    writer.save()
def build_and_send_email(self, data, options):
    """Build the weekly report workbook and mail it as an attachment.

    :param data: Iterable of dicts; ``datum['content']`` is turned into a data
        frame and written to the sheet ``datum['name']``.
    :param options: Mapping; its 'recipients' entry, when present, overrides
        ``settings.DEFAULT_WEEKLY_RECIPIENTS``.
    """
    date = timezone.now().date().strftime('%Y_%m_%d')

    has_custom_recipients = 'recipients' in options
    if has_custom_recipients:
        print('yes')
        recipients = options['recipients']
    else:
        print('no')
        recipients = settings.DEFAULT_WEEKLY_RECIPIENTS
    print('recipients: %s' % (recipients,))

    message = EmailMessage(subject='Kikar Hamedina, Weekly Report: %s' % date,
                           body='Kikar Hamedina, Weekly Report: %s.' % date,
                           to=recipients)

    report = ExcelWriter('Weekly_report_%s.xlsx' % date)
    for datum in data:
        frame = pd.DataFrame.from_dict(datum['content'])
        frame.to_excel(report, sheet_name=datum['name'])
    report.save()
    report.close()

    message.attach_file(report.path)
    message.send()
def generate_response_time_stats(state_name, state_slug):
    """Plot and export the response times of one jurisdiction.

    Uses the module-level frames ``df_requests`` and ``df_messages`` and the
    mask ``is_complete``. For the completed requests of ``state_name`` the
    response time of every request is computed with ``compute_response_time``;
    the resolved ones are averaged per public body (in days).

    Outputs, all suffixed with ``state_slug``:
      * a bar chart of the 15 public bodies with the highest average,
      * ``response_times_per_pbody<slug>.xlsx`` with the averages and counts,
      * ``response_times_raw<slug>.xlsx`` with the resolved requests.
    """
    is_state = df_requests['jurisdiction'] == state_name
    state_req_ids = df_requests.loc[is_state & is_complete, 'id']

    belongs_to_state = df_messages['request_id'].isin(state_req_ids.tolist())
    msg_resp_times = df_messages[belongs_to_state].groupby('request_id').apply(compute_response_time)

    resolved = msg_resp_times['status'] == 'resolved'
    msg_resp_times_resolved = msg_resp_times[resolved][['public_body', 'response_time']]
    msg_resp_times_resolved['days'] = msg_resp_times_resolved['response_time'].dt.days

    aggregations = {'avg_response_time': np.mean, 'n_requests': np.size}
    resp_times_by_pbody = msg_resp_times_resolved.groupby('public_body')['days'].agg(aggregations)
    resp_times_by_pbody.sort_values('avg_response_time', ascending=False, inplace=True)

    slowest = resp_times_by_pbody.iloc[0:15]['avg_response_time']
    slowest.plot(kind='barh')
    plt.xlabel('Tage')
    plt.legend([])
    plt.title('Durchschnittliche Antwortzeiten')
    plt.savefig(figures_path + 'response_times' + state_slug + '.png',
                bbox_inches='tight', dpi=120)
    plt.close()

    exports = (
        ('response_times_per_pbody', resp_times_by_pbody),
        ('response_times_raw', msg_resp_times_resolved),
    )
    for stem, frame in exports:
        writer = ExcelWriter(files_path + stem + state_slug + '.xlsx')
        frame.to_excel(writer)
        writer.save()
def to_mem_excel(dataframe, sheet_name='WorkSheet'):
    """Serialize *dataframe* to an Excel workbook held in memory.

    :param dataframe: Frame to write.
    :param sheet_name: Name of the single sheet.
    :return: The content of the workbook written with the xlwt engine.
    """
    buffer_ = BytesIO()
    excel_writer = ExcelWriter(buffer_, engine='xlwt')
    dataframe.to_excel(excel_writer, sheet_name=sheet_name)
    excel_writer.save()

    buffer_.flush()
    buffer_.seek(0)
    content = buffer_.getvalue()
    return content
def saveDialog(self):
    '''Saves the project as an .xls file.

    Asks for a file name with a save dialog starting at ``self.path`` and
    writes the table of every marker to a sheet named after the marker.
    '.xls' is appended to the chosen name. Nothing is written when the dialog
    is cancelled.
    '''
    title = 'Save project as...'
    fileName, f = QFileDialog.getSaveFileName(self, title, self.path)
    if not fileName:
        # The dialog returns an empty name on cancel; do not create '.xls'.
        return

    writer = ExcelWriter(fileName + '.xls')
    for marker in self.markers:
        marker.table.to_excel(writer, marker.name)
    writer.save()
def writeToExcel(fileName=''):
    """Write the collected CVE details to the sheet 'CVE Details' of *fileName*.

    The values come from module-level sequences (``cveIDNumber``,
    ``summaryText``, ...). The summary text is placed in the last column.
    """
    print("Writing to Excel File : " + fileName)

    column_order = [
        'CVE ID Number', 'Publish Date', 'Software Type', 'Vendor', 'Product',
        'Version', 'CVSS Score', 'Confidentiality Impact', 'Integrity Impact',
        'Availibility Impact', 'Access Complexity', 'Authentication',
        'Gained Access', 'Vulnerability Type', 'Summary Text',
    ]
    data = {
        'CVE ID Number': cveIDNumber,
        'Summary Text': summaryText,
        'Publish Date': publishDate,
        'Software Type': softwareType,
        'Vendor': vendor,
        'Product': product,
        'Version': version,
        'CVSS Score': cvssScore,
        'Confidentiality Impact': confidentialityImpact,
        'Integrity Impact': integrityImpact,
        'Availibility Impact': availibilityImpact,
        'Access Complexity': accessComplexity,
        'Authentication': authentication,
        'Gained Access': gainedAccess,
        'Vulnerability Type': vulnType,
    }
    df = pd.DataFrame(data, columns=column_order)

    writer = ExcelWriter(fileName)
    df.to_excel(writer, 'CVE Details', index=False)
    writer.save()
    print("Completed.")
def save_xlsx(list_dfs, xlsx_path):
    """Save each dataframe of *list_dfs* to its own sheet of *xlsx_path*.

    Sheets are named after the position of the frame in the list ('0', '1',
    ...). Progress is printed for every sheet.

    :return: None
    """
    writer = ExcelWriter(xlsx_path)
    position = 0
    for frame in list_dfs:
        frame.to_excel(writer, '%s' % position)
        print('Saving %s' % position)
        position += 1
    writer.save()
    print('Finished writing to file')
    return None
def corpus_to_excel(corpus_path, excel_path):
    '''Copy every entry of a corpus to its own sheet of an Excel file.

    The sheet name is the corpus key of the entry.

    NB! Make sure to use .xls file extension for Excel files.
    '''
    store = PyCorpus(corpus_path)
    workbook = ExcelWriter(excel_path)
    for entry_name in store:
        entry = store[entry_name]
        entry.to_excel(workbook, sheet_name=entry_name)
    workbook.save()
    store.close()
def extract_SHT1x_data_day_by_day(SHT1x_dataframe, days_list):
    """Write the SHT1x data of each day to its own sheet.

    Days later than today are skipped. The data of a day is selected with
    ``SHT1x_dataframe[str(day)]`` and stored in a sheet named ``str(day)`` of
    ``static/data/SHT1x.xlsx``.
    """
    # the 'with' statement dont work
    today = date.today()
    writer = ExcelWriter('static/data/SHT1x.xlsx')
    elapsed_days = (day for day in days_list if day <= today)
    for day in elapsed_days:
        label = str(day)
        SHT1x_dataframe[label].to_excel(writer, sheet_name=label)
    writer.save()
def save_peaks_excel(peakOnlyHdf5, xlsxFile):
    """Copy every dataset of an HDF5 file to its own sheet of an Excel file.

    Each dataset is written without header and index to a sheet named after
    its key. The HDF5 file is opened read-only and is closed again when the
    copy is done or fails.

    :param peakOnlyHdf5: Path of the HDF5 file to read.
    :param xlsxFile: Path of the Excel file to write.
    """
    with h5py.File(peakOnlyHdf5, 'r') as dsets:
        writer = ExcelWriter(xlsxFile)
        for _key in dsets.keys():
            dset = dsets[_key]
            # materialize the rows while the HDF5 file is still open
            _df = pd.DataFrame(list(dset))
            _df.to_excel(writer, _key, header=False, index=False)
            print(_key + 'sheet is created')
    writer.save()
    writer.close()
def slmode(sheet, size):
    """Export sliding windows of *size* columns of one sheet.

    The frame is taken from the module-level mapping ``dfs`` under the key
    ``str(sheet)``. Window ``i`` holds the first column followed by the
    columns at positions ``i`` to ``i + size - 1`` and is written, without
    index, to the sheet ``set_<i>`` of ``sw_mode_<size>t_<sheet>.xlsx``.
    """
    writer = ExcelWriter("sw_mode_" + str(size) + "t_" + sheet + ".xlsx")
    source = dfs[str(sheet)]
    columnas = source.columns  # store columns names
    n_columns = len(source.columns)

    first_start = 1
    stop = n_columns - (size - 1)
    for start in range(first_start, stop):
        window = pd.DataFrame(source.iloc[:, 0])
        for offset in range(0, size):
            position = offset + start
            window[str(columnas[position])] = source.iloc[:, position]
        window.to_excel(writer, "set_" + str(start), index=False)
    writer.save()
def export_to_xls(df, path, format_excel=None, engine='xlsxwriter', send=False):
    """Write *df* to an Excel file, then mail it or offer it for download.

    :param df: Frame to export.
    :param path: Path of the Excel file to write.
    :param format_excel: Optional callable applied to *path* after saving.
    :param engine: Writer engine handed to ``ExcelWriter``.
    :param send: When true the file is passed to ``send_file_by_email`` and
        None is returned.
    :return: The result of ``download_file(path)`` when *send* is false.
    """
    excel_writer = ExcelWriter(
        path,
        engine=engine,
        datetime_format='hh:mm:ss mmm d yyyy',
        date_format='mmmm dd yyyy',
    )
    df.to_excel(excel_writer)
    excel_writer.save()

    if format_excel:
        format_excel(path)

    if not send:
        return download_file(path)
    send_file_by_email(path)
def extract_thermo_data_day_by_day(thermo_dataframe, days_list):
    """Write the thermostat data of each day to its own sheet.

    Days later than today are skipped. The data of a day is selected with
    ``thermo_dataframe[str(day)]`` and stored in a sheet named ``str(day)`` of
    ``static/data/thermo.xlsx``.
    """
    # the 'with' statement dont work
    # replace dont work properly
    today = date.today()
    writer = ExcelWriter('static/data/thermo.xlsx')
    for day in days_list:
        if day > today:
            continue
        sheet_label = str(day)
        thermo_dataframe[sheet_label].to_excel(writer, sheet_name=sheet_label)
    writer.save()
def save_xls(self, dframe):
    """Write *dframe* to the sheet ``self.name`` of the industry workbook.

    The workbook is the file ``self.hangye`` inside the report folder below
    ``current_folder``. An existing workbook is loaded first so that its
    sheets are kept; otherwise a new workbook is created.
    """
    xls_path = os.path.join(current_folder, '筛选后股票的财务报表', self.hangye)
    workbook_exists = os.path.exists(xls_path)
    if not workbook_exists:
        # the file does not exist yet
        writer = ExcelWriter(xls_path)
    else:
        # the excel file already exists: attach its workbook and sheets
        book = load_workbook(xls_path)
        writer = pd.ExcelWriter(xls_path, engine='openpyxl')
        writer.book = book
        writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    dframe.to_excel(writer, self.name)
    writer.save()
def to_excel(self, filename='myfile.xlsx'):
    """Export informations to a excel file

    The classifier and statistics tables go to the sheets 'Classifier' and
    'Statistics'. The feature table goes to 'Features'; when it cannot be
    written a warning is issued and the sheet is left out.

    Kargs:
        filename: string
            Name of the excel file ex: filename='myfile.xlsx'
    """
    workbook = ExcelWriter(filename)
    self.clfinfo.to_excel(workbook, 'Classifier')
    self.statinfo.to_excel(workbook, 'Statistics')
    try:
        features = self.featinfo
        features.to_excel(workbook, 'Features')
    except:
        warn('Informations about features has been ignored. Run fit()')
    workbook.save()
def to_excel():
    """Export every row of the ``rep`` table to an Excel file.

    The rows are read from ``PRIVATE/result.sqlite`` and written to the sheet
    'data_fact' of ``fact_excel.xlsx`` inside ``conf_file.EXPORT_REP``. The
    location of the file is printed afterwards.
    """
    DR = data_recording.DataRecorder(db_name="PRIVATE/result.sqlite")
    DR.con.row_factory = sqlite3.Row
    cursor = DR.con.execute("Select * from rep")
    rows = cursor.fetchall()
    column_names = [item[0] for item in cursor.description]
    DF = pd.DataFrame(rows, columns=column_names)

    # Caution: the export directory must already exist.
    target = conf_file.EXPORT_REP + '/' + 'fact_excel.xlsx'
    writer = ExcelWriter(target)
    DF.to_excel(writer, sheet_name='data_fact')
    writer.save()
    print("Le fichier a été sauvé dans {}".format(target))
def networkset_2_spreadsheet(ntwkset, file_name=None, file_type='excel', *args, **kwargs):
    r'''
    Write a NetworkSet object to a spreadsheet, for your boss

    Write the s-parameters of each network in the networkset to a
    spreadsheet. If the `excel` file_type is used, then each network
    is written to its own sheet, with the sheet name taken from the
    network `name` attribute.
    This function makes use of the pandas module, which is imported
    during this function.

    Notes
    ------
    The frequency unit used in the spreadsheet is taken from
    `ntwk.frequency.unit`

    Parameters
    -----------
    ntwkset : :class:`~skrf.networkSet.NetworkSet` object
        the networks to write
    file_name : str, None
        the file_name to write. If None, ``ntwkset.name + '.xlsx'`` is
        used for the `excel` file_type.
    file_type : ['csv','excel','html']
        the type of file to write. See pandas.DataFrame.to_??? functions.
    form : 'db','ma','ri'
        format to write data (passed on through \*\*kwargs),
        * db = db, deg
        * ma = mag, deg
        * ri = real, imag
    \*args, \*\*kwargs :
        passed to pandas.DataFrame.to_??? functions.

    Raises
    ------
    ValueError
        if neither `ntwkset.name` nor `file_name` is given.

    See Also
    ---------
    network_2_spreadsheet : writes a spreadsheet for a single network
    '''
    from pandas import ExcelWriter  # delayed because its not a requirement

    if ntwkset.name is None and file_name is None:
        raise(ValueError('Either ntwkset must have name or give a file_name'))

    if file_type == 'excel':
        if file_name is None:
            # Documented fallback: name the workbook after the network set.
            file_name = ntwkset.name + '.xlsx'
        writer = ExcelWriter(file_name)
        for k in ntwkset:
            network_2_spreadsheet(k, writer, sheet_name=k.name, *args, **kwargs)
        writer.save()
    else:
        # Other file types: each network is written on its own.
        for k in ntwkset:
            network_2_spreadsheet(k, *args, **kwargs)
def main(project_id, dataset_id, table_names):
    """Write summary statistics of several tables to one workbook.

    :param project_id: Project handed to ``convert_gbq_to_df.run``.
    :param dataset_id: Dataset that holds the tables.
    :param table_names: Table names separated by ';'. Every table is queried
        completely and its transposed ``describe(include='all')`` result is
        written to a sheet named after the table in
        ``bq_sanity_check_reports.xlsx``.
    """
    # create a excel writer instance
    report = ExcelWriter('bq_sanity_check_reports.xlsx')
    names = table_names.split(";")
    for table_name in names:
        query = 'SELECT * FROM {0}.{1}'.format(dataset_id, table_name)
        table = convert_gbq_to_df.run(project_id, query)
        # Summarizing data, one row per column of the table
        summary = table.describe(include='all').transpose()
        # write to an excel sheet
        summary.to_excel(report, sheet_name=table_name)
    report.save()
def export(params, path, paramsToGroupBySize, hasCycles):
    """Formats extracted data and exports to Data.xlsx

    The files found by ``extractFolder`` are grouped by channel. Every channel
    gets a sheet 'Ch. <channel>' with one column per parameter (header
    '<parameter> (<unit>)'), preceded by the file name and, when *hasCycles*
    is true, the cycle. Column A is widened to the longest file name.
    The workbook is written to ``path + 'Data.xlsx'``.
    """
    paramToUnit, Files = extractFolder(params, path, paramsToGroupBySize, hasCycles)
    channelToFiles = groupFilesByChannel(Files)
    writer = ExcelWriter(path + 'Data.xlsx')  # Needed to save multiple sheets

    # Iterate through channels
    numOfChannels = len(channelToFiles)
    for channelNumber, channel in enumerate(channelToFiles, 1):
        extractedValues = {p: [] for p in params}
        names = []
        cyclesColumn = []

        # Obtain list of values and names from files in channel
        for File in channelToFiles[channel]:
            if not hasCycles:
                appendFileInfo(File, params, extractedValues, names)
            else:
                appendFileInfoCycles(File, params, extractedValues, names, cyclesColumn)

        # Create table / DataFrame
        table = {}
        for p in params:
            table['{} ({})'.format(p, paramToUnit[p])] = extractedValues[p]
        df = DataFrame(table)
        df.insert(0, 'File Name', names)
        if hasCycles:
            df.insert(1, 'Cycle', cyclesColumn)
        sheet = 'Ch. ' + channel

        # Add sheets and autofit column dimensions
        df.to_excel(writer, sheet_name=sheet, index=False)
        longestName = max(names, key=len)
        writer.sheets[sheet].column_dimensions['A'].width = len(longestName)

        # Message
        message = '--Successfully extracted from {} ({} of {})'
        print(message.format(sheet, channelNumber, numOfChannels))

    # Export
    writer.save()
    print('')
def test_all_nets(fold):
    """Run the restored synthesizing network over one data split and save the results.

    Builds the forked DenseNet graph and its averaged Huber cost, restores the
    latest checkpoint found in ``<parent_path>unet_checkpoints/`` and, for
    every case of the selected split, runs one forward pass, writes each output
    channel and the ground-truth volumes as ``.mha`` files below
    ``<parent_path>results/<case dir>/`` and collects loss, MSE and timing.
    A per-case summary is written to ``<parent_path>results/output.xlsx``.

    :param fold: Fold identifier; only used to build the log directory name
        (``.../synth-<fold>/``) in the non-'DL' server branch.

    Depends on the module-level names ``test_vali``, ``patch_window``,
    ``label_patchs_size`` and ``final_layer``, which are not defined here.
    ``test_vali`` selects the split: 1 = validation, 2 = train, else test.
    """
    data = 2
    # Hard-coded server choice: with 'shark' the 'DL' branch below never runs.
    Server = 'shark'
    if Server == 'DL':
        parent_path = '/srv/2-lkeb-17-dl01/syousefi/TestCode/EsophagusProject/sythesize_code/'
        data_path = '/srv/2-lkeb-17-dl01/syousefi/TestCode/EsophagusProject/Data-01/BrainWeb_permutation2_low/'
    else:
        parent_path = '/exports/lkeb-hpc/syousefi/Code/ASL_LOG/debug_Log/synth-' + str(
            fold) + '/'
        data_path = '/exports/lkeb-hpc/syousefi/Synth_Data/BrainWeb_permutation2_low/'

    img_name = ''
    label_name = ''

    _rd = _read_data(data=data,
                     img_name=img_name,
                     label_name=label_name,
                     dataset_path=data_path)
    '''read path of the images for train, test, and validation'''
    train_data, validation_data, test_data = _rd.read_data_path()

    # parent_path='/srv/2-lkeb-17-dl01/syousefi/TestCode/EsophagusProject/sythesize_code/Log/synth-12/'
    chckpnt_dir = parent_path + 'unet_checkpoints/'
    result_path = parent_path + 'results/'

    # Pick the split to evaluate.
    if test_vali == 1:
        test_set = validation_data
    elif test_vali == 2:
        test_set = train_data
    else:
        test_set = test_data

    # Eight input volumes and fourteen label volumes. An earlier, commented-out
    # version declared these placeholders with fixed batch/patch sizes; here
    # every dimension except the single channel is left open.
    img_row1 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    img_row2 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    img_row3 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    img_row4 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    img_row5 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    img_row6 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    img_row7 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    img_row8 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])

    label1 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label2 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label3 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label4 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label5 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label6 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label7 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label8 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label9 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label10 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label11 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label12 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label13 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])
    label14 = tf.placeholder(tf.float32, shape=[None, None, None, None, 1])

    is_training = tf.placeholder(tf.bool, name='is_training')
    input_dim = tf.placeholder(tf.int32, name='input_dim')
    # ave_huber = tf.placeholder(tf.float32, name='huber')

    # Build the network; it returns the output tensor `y` and rebinds the
    # eight input names.
    forked_densenet = _forked_densenet()
    y, img_row1, img_row2, img_row3, img_row4, \
        img_row5, img_row6, img_row7, img_row8 = \
        forked_densenet.densenet(img_row1=img_row1,
                                 img_row2=img_row2,
                                 img_row3=img_row3,
                                 img_row4=img_row4,
                                 img_row5=img_row5,
                                 img_row6=img_row6,
                                 img_row7=img_row7,
                                 img_row8=img_row8,
                                 input_dim=input_dim,
                                 is_training=is_training)

    # Cost: label k is paired with channel k-1 of `y` (last axis).
    loss_instance = _loss_func()
    with tf.name_scope('averaged_mean_squared_error'):  #
        [averaged_huber, perf_loss, angio_loss
         ] = loss_instance.averaged_huber(label1=label1,
                                          label2=label2,
                                          label3=label3,
                                          label4=label4,
                                          label5=label5,
                                          label6=label6,
                                          label7=label7,
                                          label8=label8,
                                          label9=label9,
                                          label10=label10,
                                          label11=label11,
                                          label12=label12,
                                          label13=label13,
                                          label14=label14,
                                          logit1=y[:, :, :, :, 0, np.newaxis],
                                          logit2=y[:, :, :, :, 1, np.newaxis],
                                          logit3=y[:, :, :, :, 2, np.newaxis],
                                          logit4=y[:, :, :, :, 3, np.newaxis],
                                          logit5=y[:, :, :, :, 4, np.newaxis],
                                          logit6=y[:, :, :, :, 5, np.newaxis],
                                          logit7=y[:, :, :, :, 6, np.newaxis],
                                          logit8=y[:, :, :, :, 7, np.newaxis],
                                          logit9=y[:, :, :, :, 8, np.newaxis],
                                          logit10=y[:, :, :, :, 9, np.newaxis],
                                          logit11=y[:, :, :, :, 10, np.newaxis],
                                          logit12=y[:, :, :, :, 11, np.newaxis],
                                          logit13=y[:, :, :, :, 12, np.newaxis],
                                          logit14=y[:, :, :, :, 13, np.newaxis])
        cost = tf.reduce_mean(averaged_huber, name="cost")

    # ========================================================================
    # These placeholders are fed with -1 below; no op built in this function
    # reads them.
    ave_loss = tf.placeholder(tf.float32, name='loss')
    ave_loss_perf = tf.placeholder(tf.float32, name='loss_perf')
    ave_loss_angio = tf.placeholder(tf.float32, name='loss_angio')
    average_gradient_perf = tf.placeholder(tf.float32, name='grad_ave_perf')
    average_gradient_angio = tf.placeholder(tf.float32, name='grad_ave_angio')

    # restore the model
    sess = tf.Session()
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(chckpnt_dir)
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Keep a copy of the test script next to the results.
    copyfile('./test_synthesizing_net.py',
             result_path + 'test_synthesizing_net.py')

    _image_class = image_class(train_data,
                               bunch_of_images_no=1,
                               is_training=1,
                               patch_window=patch_window,
                               sample_no_per_bunch=1,
                               label_patch_size=label_patchs_size,
                               validation_total_sample=0)

    learning_rate = 1E-5
    extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(extra_update_ops):
        # The optimizer op is created but never run in this function; only
        # `cost` and `y` are evaluated below.
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    # Per-case accumulators for the summary sheet.
    loss = 0
    mse_angio = []
    mse_perf = []
    test_list = []
    time_list = []
    for img_indx in range(len(test_set)):
        crush, noncrush, perf, angio, spacing, direction, origin = _image_class.read_image_for_test(
            test_set=test_set,
            img_indx=img_indx,
            input_size=patch_window,
            final_layer=final_layer)
        # The result of this split is discarded.
        test_set[img_indx][0][0].split('/')

        # Forward pass, timed. Inputs alternate crush/noncrush volumes; labels
        # 1-7 are the perf volumes and labels 8-14 the angio volumes. Every
        # volume gets a leading batch axis and a trailing channel axis.
        start = time.time()
        [loss_train1, out] =\
            sess.run([cost, y],
                     feed_dict={
                         img_row1: np.expand_dims(np.expand_dims(crush[0][:, :, :], axis=0), axis=-1),
                         img_row2: np.expand_dims(np.expand_dims(noncrush[1][:, :, :], axis=0), axis=-1),
                         img_row3: np.expand_dims(np.expand_dims(crush[2][:, :, :], axis=0), axis=-1),
                         img_row4: np.expand_dims(np.expand_dims(noncrush[3][:, :, :], axis=0), axis=-1),
                         img_row5: np.expand_dims(np.expand_dims(crush[4][:, :, :], axis=0), axis=-1),
                         img_row6: np.expand_dims(np.expand_dims(noncrush[5][:, :, :], axis=0), axis=-1),
                         img_row7: np.expand_dims(np.expand_dims(crush[6][:, :, :], axis=0), axis=-1),
                         img_row8: np.expand_dims(np.expand_dims(noncrush[7][:, :, :], axis=0), axis=-1),
                         label1: np.expand_dims(np.expand_dims(perf[0], axis=0), axis=-1),
                         label2: np.expand_dims(np.expand_dims(perf[1], axis=0), axis=-1),
                         label3: np.expand_dims(np.expand_dims(perf[2], axis=0), axis=-1),
                         label4: np.expand_dims(np.expand_dims(perf[3], axis=0), axis=-1),
                         label5: np.expand_dims(np.expand_dims(perf[4], axis=0), axis=-1),
                         label6: np.expand_dims(np.expand_dims(perf[5], axis=0), axis=-1),
                         label7: np.expand_dims(np.expand_dims(perf[6], axis=0), axis=-1),
                         label8: np.expand_dims(np.expand_dims(angio[0], axis=0), axis=-1),
                         label9: np.expand_dims(np.expand_dims(angio[1], axis=0), axis=-1),
                         label10: np.expand_dims(np.expand_dims(angio[2], axis=0), axis=-1),
                         label11: np.expand_dims(np.expand_dims(angio[3], axis=0), axis=-1),
                         label12: np.expand_dims(np.expand_dims(angio[4], axis=0), axis=-1),
                         label13: np.expand_dims(np.expand_dims(angio[5], axis=0), axis=-1),
                         label14: np.expand_dims(np.expand_dims(angio[6], axis=0), axis=-1),
                         is_training: False,
                         input_dim: patch_window,
                         ave_loss: -1,
                         ave_loss_perf: -1,
                         ave_loss_angio: -1,
                         average_gradient_perf: -1,
                         average_gradient_angio: -1
                     })
        end = time.time()
        elapsed_time = end - start

        # Save every output channel: channels 0-6 as perf_0..6, the remaining
        # ones as angi_0.. (index modulo 7).
        for i in range(np.shape(out)[-1]):
            image = out[0, :, :, :, i]
            sitk_image = sitk.GetImageFromArray(image)
            # The case directory is the second-to-last component of the path.
            res_dir = test_set[img_indx][0][0].split('/')[-2]
            if i == 0:
                # os.mkdir raises if the directory already exists, e.g. on a
                # second run into the same results folder.
                os.mkdir(parent_path + 'results/' + res_dir)
            if i < 7:
                nm = 'perf'
            else:
                nm = 'angi'
            sitk_image.SetDirection(direction=direction)
            sitk_image.SetOrigin(origin=origin)
            sitk_image.SetSpacing(spacing=spacing)
            sitk.WriteImage(
                sitk_image, parent_path + 'results/' + res_dir + '/' + nm +
                '_' + str(i % 7) + '.mha')
            print(parent_path + 'results/' + res_dir + '/' + nm + '_' +
                  str(i % 7) + '.mha done!')

        # Save the ground truth and compute the MSE values.
        mse_p = 0
        mse_a = 0
        for i in range(7):
            if i == 0:
                os.mkdir(parent_path + 'results/' + res_dir + '/GT/')
            sitk_angio = sitk.GetImageFromArray(angio[i])
            sitk_angio.SetDirection(direction=direction)
            sitk_angio.SetOrigin(origin=origin)
            sitk_angio.SetSpacing(spacing=spacing)
            sitk.WriteImage(
                sitk_angio, parent_path + 'results/' + res_dir +
                '/GT/angio_' + str(i) + '.mha')

            # mse_p / mse_a are overwritten on every pass, so only the values
            # of the last volume (i == 6) reach the summary.
            # NOTE(review): perf is compared with channel i + 7 and angio with
            # channel i, while the labels and file names above treat channels
            # 0-6 as perf and 7-13 as angio - looks swapped, please confirm.
            mse_p = np.mean(np.power(out[0, :, :, :, i + 7] - perf[i], 2))
            mse_a = np.mean(np.power(out[0, :, :, :, i] - angio[i], 2))

            sitk_perf = sitk.GetImageFromArray(perf[i])
            sitk_perf.SetDirection(direction=direction)
            sitk_perf.SetOrigin(origin=origin)
            sitk_perf.SetSpacing(spacing=spacing)
            sitk.WriteImage(
                sitk_perf, parent_path + 'results/' + res_dir + '/GT/perf_' +
                str(i) + '.mha')
        # plt.imshow(out[0, int(gt_cube_size / 2), :, :, 0])
        # plt.figure()
        loss += loss_train1
        test_list.append(res_dir)
        mse_angio.append(mse_a)
        mse_perf.append(mse_p)
        time_list.append(elapsed_time)
        print('Loss_train: ', loss_train1)

    # One summary row per evaluated case.
    df = pd.DataFrame({
        'test_list': test_list,
        'mse_angio': mse_angio,
        'mse_perf': mse_perf,
        'elapsed_time': time_list
    })
    writer = ExcelWriter(parent_path + 'results/output.xlsx')
    df.to_excel(writer, 'Sheet1', index=False)
    writer.save()
    # Mean cost over the evaluated cases.
    print('Total loss: ', loss / len(test_set))
def export_csv(self, request):
    """Return every e-learning session as an ``.xlsx`` download.

    One spreadsheet row is emitted per slide and per question of each
    ``ELearningSession``.  Slide rows carry the slide image in ``figure`` and
    the placeholder ``"n"`` in every question-only column; question rows carry
    ``"n"`` in ``figure``.  Up to three wrong answers are exported; missing
    ones are written as empty strings.

    A session is exported atomically: if anything fails while its rows are
    being collected (for example a question without a correct answer), the
    whole session is skipped and logged, and its ``id`` number is reused by
    the next session.  Rows containing missing values are dropped before
    writing.

    The workbook is built in memory, so concurrent requests cannot clobber
    each other's output.

    :param request: the incoming HTTP request (not inspected).
    :return: ``HttpResponse`` carrying the workbook as an attachment named
        ``db_elearning.xlsx``.
    """
    import logging
    from io import BytesIO

    field_names = [
        'id', 'quiz', 'session', 'category', 'sub_category', 'figure',
        'content', 'explanation', 'correct', 'answer1', 'answer2', 'answer3'
    ]
    rows = []
    count = 1
    for session in ELearningSession.objects.all():
        # Collect into a per-session buffer so a failure never leaves
        # half-written rows behind.
        session_rows = []
        try:
            for slide in session.slides.all().values():
                session_rows.append({
                    'id': count,
                    'quiz': session.elearning,
                    'session': session.number,
                    'category': "n",
                    'sub_category': "n",
                    'figure': slide['image'],
                    'content': "n",
                    'explanation': "n",
                    'correct': "n",
                    'answer1': "n",
                    'answer2': "n",
                    'answer3': "n",
                })
            for question in session.questions.all().values():
                answers = Answer.objects.filter(question=question['id'])
                right = [
                    text for (text,) in
                    answers.filter(correct=True).values_list('text')[:1]
                ]
                wrong = [
                    text for (text,) in
                    answers.filter(correct=False).values_list('text')[:3]
                ]
                wrong += [""] * (3 - len(wrong))
                session_rows.append({
                    'id': count,
                    'quiz': session.elearning,
                    'session': session.number,
                    'category': question['category'],
                    'sub_category': question['sub_category'],
                    'figure': "n",
                    'content': question['text'],
                    'explanation': question['explanation'],
                    # IndexError here (no correct answer) skips the session.
                    'correct': right[0],
                    'answer1': wrong[0],
                    'answer2': wrong[1],
                    'answer3': wrong[2],
                })
        except Exception:
            logging.getLogger(__name__).exception(
                "Skipping e-learning session %r during export", session)
            continue
        rows.extend(session_rows)
        count += 1

    df = pandas.DataFrame(rows, columns=field_names)
    df = df.dropna()

    buffer = BytesIO()
    # The context manager saves and closes the writer on every pandas version.
    with ExcelWriter(buffer) as writer:
        df.to_excel(writer, sheet_name='Elearning', index=False)

    response = HttpResponse(buffer.getvalue(),
                            content_type='application/vnd.ms-excel')
    response[
        'Content-Disposition'] = 'attachment; filename="db_elearning.xlsx"'
    return response
def write_to_excel():
    """Write ``df`` to the ``Denes`` sheet of ``Books.xlsx``.

    ``df`` is not defined in this function; it is read from the enclosing
    (module) scope.  The writer is used as a context manager so the workbook
    is saved and closed even if ``to_excel`` raises, and so the code does not
    depend on ``ExcelWriter.save()``, which newer pandas no longer provides.
    """
    with ExcelWriter('Books.xlsx') as writer:
        df.to_excel(writer, sheet_name='Denes')
def networkset_2_spreadsheet(ntwkset: 'NetworkSet',
                             file_name: str = None,
                             file_type: str = 'excel',
                             *args,
                             **kwargs):
    r"""
    Write a NetworkSet object to a spreadsheet, for your boss

    Write the s-parameters of each network in the networkset to a
    spreadsheet. If the `excel` file_type is used, then each network
    is written to its own sheet, with the sheetname taken from the
    network `name` attribute.
    This function makes use of the pandas module, which is imported
    inside this function because it is not a hard requirement.

    Parameters
    -----------
    ntwkset : :class:`~skrf.networkSet.NetworkSet` object
        the network set to write
    file_name : str, None
        the file_name to write. if None, `ntwkset.name` is used.
        For the `excel` file_type, ``.xlsx`` is appended when missing.
    file_type : ['csv','excel','html']
        the type of file to write. See pandas.DataFrame.to_??? functions.
    \*args, \*\*kwargs :
        forwarded to `network_2_spreadsheet` for every network
        (e.g. its `form` option).

    Raises
    ------
    ValueError
        if neither `ntwkset.name` nor `file_name` is given.

    See Also
    ---------
    network_2_spreadsheet : called once per network of the set
    """
    from pandas import ExcelWriter  # delayed because it's not a requirement

    if ntwkset.name is None and file_name is None:
        raise ValueError('Either ntwkset must have name or give a file_name')
    if file_name is None:
        file_name = ntwkset.name

    if file_type == 'excel':
        # add file extension if missing
        if not file_name.endswith('.xlsx'):
            file_name += '.xlsx'
        # Context manager: the workbook is saved and closed even when one of
        # the per-network writes raises.
        with ExcelWriter(file_name) as writer:
            for ntwk in ntwkset:
                network_2_spreadsheet(ntwk, writer, sheet_name=ntwk.name,
                                      *args, **kwargs)
    else:
        # NOTE(review): neither `file_name` nor `file_type` is forwarded here,
        # so each network is written with network_2_spreadsheet's own
        # defaults -- confirm this is intended.
        for ntwk in ntwkset:
            network_2_spreadsheet(ntwk, *args, **kwargs)
def main_f():
    """Suggest the next rock-paper-scissors move and log the played round.

    The game history is read from ``Sheet1`` of ``rockpaperseaser_new1.xlsx``
    (columns ``my pick``, ``computer pick``, ``Result``).  A move is chosen by
    the first rule that applies:

    1. the last two results were both ``loss``  -> a uniformly random move;
    2. my last two picks were identical         -> a fixed counter-move;
    3. otherwise                                -> a draw weighted by the
       predictions of the module-level models ``rf_p``, ``rf_r`` and ``rf_s``
       on the one-hot encoded (``enc``) lag features.

    The suggestion is printed, the user is asked what the opponent played and
    who won, and the round is appended to the workbook.

    The history needs at least five rows, otherwise no lag features can be
    built and the encoder/model calls fail.
    """
    workbook_path = 'rockpaperseaser_new1.xlsx'

    # Close the workbook before it is overwritten at the end of the function.
    with pd.ExcelFile(workbook_path) as workbook:
        history = workbook.parse("Sheet1")

    # `replace` returns new frames, so `history` keeps the raw r/p/s letters.
    df = history.replace('r', 1).replace('p', 2).replace('s', 3)
    n_rounds = len(df)
    my_picks = df['my pick']
    computer_picks = df['computer pick']

    # Rule 1: two losses in a row -> play randomly.
    out_priority1 = ''
    if (df['Result'].iloc[-1] == 'loss' and df['Result'].iloc[-2] == 'loss'):
        out_priority1 = {1: 'r', 2: 'p'}.get(random.randint(1, 3), 's')

    # Rule 2: I repeated my pick -> fixed answer depending on that pick.
    out_priority2 = ''
    if my_picks.iloc[-1] == my_picks.iloc[-2]:
        out_priority2 = {1: 's', 2: 'r'}.get(my_picks.iloc[-1], 'p')

    # Rule 3: model-based draw.  Only the lag-1 and lag-2 picks of both
    # players are used as features; rows start at index 4 as before.
    lag_rows = [[
        my_picks[i - 1], my_picks[i - 2],
        computer_picks[i - 1], computer_picks[i - 2]
    ] for i in range(4, n_rounds)]
    df_X = pd.DataFrame(
        lag_rows, columns=['my_p_1', 'my_p_2', 'com_p_1', 'com_p_2'])

    onehotlabels = enc.transform(df_X).toarray()
    new_data = onehotlabels[-1].reshape(1, -1)
    p_pred = rf_p.predict(new_data)
    r_pred = rf_r.predict(new_data)
    s_pred = rf_s.predict(new_data)

    # adding randomness in draw
    cond_prob = p_pred + r_pred + s_pred
    x = uniform(low=0, high=1, size=1)
    if x < p_pred / cond_prob:
        out_priority3 = 's'
    elif x < (p_pred + r_pred) / cond_prob:
        out_priority3 = 'p'
    else:
        out_priority3 = 'r'

    my = out_priority1 or out_priority2 or out_priority3
    print('play with ' + my)

    computer = input('What did the AI play?(r,p,s) ')
    res = input('Did you win(y/n/t) ')
    result = {'y': 'win', 'n': 'loss'}.get(res, 'tie')

    usr_input = pd.DataFrame([[my, computer, result]],
                             columns=['my pick', 'computer pick', 'Result'])
    # DataFrame.append no longer exists in current pandas; concat is the
    # equivalent.
    final = pd.concat([history, usr_input])

    with ExcelWriter(workbook_path) as writer:
        final.to_excel(writer, sheet_name='Sheet1', index=False)
def time_write_excel(self, engine):
    """Write ``self.df`` to an in-memory workbook using ``engine``.

    The writer is used as a context manager, which saves and closes it on
    exit; this avoids ``ExcelWriter.save()``, which newer pandas no longer
    provides.  Nothing is returned and the buffer is discarded.
    """
    bio_write = BytesIO()
    with ExcelWriter(bio_write, engine=engine) as writer_write:
        self.df.to_excel(writer_write, sheet_name="Sheet1")
def ModifyInputfiles():
    """Merge the raw input tables with the VM mapping and write one workbook.

    For every mapping file in ``/home/airflow/gcs/data/map`` all tables in
    ``/home/airflow/gcs/data/input`` are read (``;``-separated ``.csv`` or
    ``.xlsx``), their column names normalised, and the rows concatenated.
    The result is left-joined with the mapping on ``VM``, the three usage
    columns are coerced to floats rounded to 3 decimals, and the frame is
    written to ``in-data/FS_Input_<YYYY_MM_DD>.xlsx`` (created if needed).

    Files with any other extension are reported and skipped.  Because the
    output name only depends on the date, a later mapping file overwrites the
    output of an earlier one.
    """
    def get_excel_files(path):
        # Plain files only; sub-directories are ignored.
        return [
            f for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ]

    def read_table(folder, file_name):
        # Return the file as a DataFrame, or None for unsupported extensions.
        extension = os.path.splitext(file_name)[1].lower()
        full_path = os.path.join(folder, file_name)
        if extension == '.csv':
            return pd.read_csv(full_path, sep=';')
        if extension == '.xlsx':
            return pd.read_excel(full_path)
        return None

    dirpath = '/home/airflow/gcs/data/'
    print("Dirpath:" + dirpath)
    OutputFolder = os.path.join(dirpath, 'in-data')
    os.makedirs(OutputFolder, exist_ok=True)
    inputfilepath = '/home/airflow/gcs/data/input'
    mappingFilePath = '/home/airflow/gcs/data/map'
    mapping = get_excel_files(mappingFilePath)
    inputfilelist = get_excel_files(inputfilepath)

    disk_columns = (
        'Disk Average Read Bytes Per Second (kB/s)',
        'Disk Average Write Bytes Per Second (kB/s)',
    )
    usage_columns = ('Disk Usage (%)', 'Memory Usage (%)', 'CPU usage (%)')

    for map_file in mapping:  # if in future we need to take multiple files
        print("running file:" + map_file)
        mapping_df = read_table(mappingFilePath, map_file)
        if mapping_df is None:
            print("skipping unsupported mapping file: " + map_file)
            continue
        mapping_df = mapping_df.loc[:, mapping_df.columns.intersection([
            'VM', 'VNF Name', 'VNF Program name', 'VNF vendor name',
            'Site Name'
        ])]
        mapping_df = mapping_df.rename(columns={
            'VNF Program name': 'Program Name',
            'VNF vendor name': 'Vendor'
        })

        frames = []
        for input_file in inputfilelist:  # if in future we need to take multiple files
            print("Running file: " + input_file)
            input_df = read_table(inputfilepath, input_file)
            if input_df is None:
                print("skipping unsupported input file: " + input_file)
                continue
            input_df = input_df.rename(columns=lambda x: x.strip())
            for column in disk_columns:
                # NOTE(review): Series.replace('.', '') only replaces cells
                # that are exactly '.', it does not strip dots inside numbers.
                # Kept as-is; switch to .str.replace if removing thousands
                # separators was the intent.
                as_number = pd.to_numeric(
                    input_df[column].astype(str).replace('.', ''),
                    errors='coerce')
                input_df[column] = pd.to_numeric(as_number, downcast="float")
            input_df = input_df.rename(columns=lambda x: x.lower())
            frames.append(input_df)

        if not frames:
            print("no readable input files for: " + map_file)
            continue

        # pd.concat replaces the repeated DataFrame.append calls.
        final_input_df = pd.concat(frames)
        final_input_df = final_input_df.rename(
            columns={
                'site name': 'SiteName',
                'start time': 'timestamp',
                'vm': 'VM',
                'cpu usage %': 'CPU usage (%)',
                'memory usage %': 'Memory Usage (%)',
                'disk usage %': 'Disk Usage (%)',
                'disk average read bytes per second (kb/s)':
                'Disk Average Read Bytes Per Second (kB/s)',
                'disk average write bytes per second (kb/s)':
                'Disk Average Write Bytes Per Second (kB/s)'
            })
        final_input_df = pd.merge(final_input_df,
                                  mapping_df,
                                  on='VM',
                                  how='left')
        for column in usage_columns:
            # to_numeric(errors='coerce') stands in for the removed
            # convert_objects(convert_numeric=True).
            final_input_df[column] = np.round(
                pd.to_numeric(final_input_df[column],
                              errors='coerce').astype(float), 3)

        output_path = os.path.join(
            OutputFolder,
            'FS_Input_' + time.strftime("%Y_%m_%d") + '.xlsx')
        with ExcelWriter(output_path) as writer:
            final_input_df.to_excel(writer, index=False)

    print('end of process')
def write_xlsx(df, name_file):
    """Write ``df`` to sheet ``Sheet1`` of ``<name_file>.xlsx``.

    :param df: object exposing ``to_excel`` (a pandas DataFrame).
    :param name_file: output path without the ``.xlsx`` extension.
    :return: a fixed confirmation string (Russian for "file saved").

    The writer is used as a context manager so the file is saved and closed
    even when ``to_excel`` raises, without calling ``ExcelWriter.save()``,
    which newer pandas no longer provides.
    """
    with ExcelWriter(f'{name_file}.xlsx') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    return 'ФАЙЛ СОХРАНЕН'
def extract_expression(tumor, platform, gencode_version):
    """
    The EXTRACT_EXPRESSION operation extracts expression values from TCGA for all the genes of interest and their candidate regulatory genes. Intermediate results files are exported locally during the execution of the function, while the final dataframes are returned as Pandas dataframes and exported locally in the Excel files 'Gene Expression - InterestGenes.xlsx' and 'Gene Expression - RegulatoryGenes.xlsx'.

    The function reads './Genes_of_Interest.xlsx', './2_Regulatory_Genes/dict_RegulGenes.p' and './0_Genes_Mapping/Genes Mapping.xlsx' relative to the current working directory, queries the remote GMQL service, and writes its Excel outputs under './3_TCGA_Data/Gene_Expression/'.

    :param tumor: full name of the tumor of interest, encoded as a string (e.g. 'Ovarian Serous Cystadenocarcinoma', 'Breast Invasive Carcinoma', ...)
    :param platform: number identifying the sequencing platform (either 27 for the 27k probes sequencing platform or 450 for the 450k probes sequencing platform)
    :param gencode_version: number representing the GENCODE genomic annotations to use (currently, for assembly GRCh38, versions 22, 24 and 27 can be used)
    :return: two Pandas dataframes (genes of interest, candidate regulatory genes)
    :raises ValueError: if tumor, platform or gencode_version is not one of the supported values

    NOTE(review): several pandas calls used below (`sheetname=`, `iteritems`, `set_value`, `get_value`, `ExcelWriter.save`) have been removed from current pandas releases; this function presumably requires an old pandas version -- confirm before upgrading.

    Example::

        import genereg as gr
        expr_interest_df, expr_regul_df = gr.GeneExpression.extract_expression(tumor='Ovarian Serous Cystadenocarcinoma', platform=27, gencode_version=22)
    """

    # Check input parameters
    tcga_tumors = ["Acute Myeloid Leukemia","Adrenocortical Carcinoma","Bladder Urothelial Carcinoma","Brain Lower Grade Glioma" ,"Breast Invasive Carcinoma","Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma","Cholangiocarcinoma","Colon Adenocarcinoma","Esophageal Carcinoma","Glioblastoma Multiforme","Head and Neck Squamous Cell Carcinoma","Kidney Chromophobe","Kidney Renal Clear Cell Carcinoma","Kidney Renal Papillary Cell Carcinoma","Liver Hepatocellular Carcinoma","Lung Adenocarcinoma","Lung Squamous Cell Carcinoma","Lymphoid Neoplasm Diffuse Large B-cell Lymphoma","Mesothelioma","Ovarian Serous Cystadenocarcinoma","Pancreatic Adenocarcinoma","Pheochromocytoma and Paraganglioma","Prostate Adenocarcinoma","Rectum Adenocarcinoma","Sarcoma","Skin Cutaneous Melanoma","Stomach Adenocarcinoma","Testicular Germ Cell Tumors","Thymoma","Thyroid Carcinoma","Uterine Carcinosarcoma","Uterine Corpus Endometrial Carcinoma","Uveal Melanoma"]
    if tumor not in tcga_tumors:
        raise ValueError('PATHOLOGY NOT SUPPORTED! You can analyze one of these 33 types of TCGA tumors: '+(', '.join(tcga_tumors)))
    if platform not in [27, 450]:
        raise ValueError('PLATFORM NOT RECOGNIZED! Sequencing platforms available: 27 and 450')
    if gencode_version not in [22, 24, 27]:
        raise ValueError('GRCh38 GENCODE versions available are 22, 24 and 27')

    # Load the list of genes of interest
    # NOTE(review): `sheetname` is the old spelling of `sheet_name`.
    EntrezConversion_df = pd.read_excel('./Genes_of_Interest.xlsx',sheetname='Sheet1',header=0,converters={'GENE_SYMBOL':str,'ENTREZ_GENE_ID':str,'GENE_SET':str})

    # Create a list containing the distinct Gene Symbols of the genes of interest (first-seen order)
    genesSYM_of_interest = []
    for i, r in EntrezConversion_df.iterrows():
        sym = r['GENE_SYMBOL']
        if sym not in genesSYM_of_interest:
            genesSYM_of_interest.append(sym)

    # Import the dictionary of genes of interest with their candidate regulatory genes
    # NOTE(review): the file handle opened here is never closed explicitly.
    dict_RegulGenes = pickle.load(open('./2_Regulatory_Genes/dict_RegulGenes.p', 'rb'))

    # Import the gene-TFs mapping dataframe
    Mapping_df = pd.read_excel('./0_Genes_Mapping/Genes Mapping.xlsx',sheetname='Sheet1',header=0,converters={'ENTREZ_GENE_ID':str,'HGNC_ID':str})

    # Create a list containing the distinct Gene Symbols of the regulatory genes of genes of interest
    regulatory_genesSYM = []
    for key, value in dict_RegulGenes.items():
        for gene in value:
            if gene not in regulatory_genesSYM:
                regulatory_genesSYM.append(gene)

    # Extract the list of distinct Gene Symbols mapped in the mapping table
    # (this list is built but not read again in this function)
    mapped_gene_SYMs = []
    for index, row in Mapping_df.iterrows():
        sym = row['GENE_SYMBOL']
        if sym not in mapped_gene_SYMs:
            mapped_gene_SYMs.append(sym)

    # Execute the query for the extraction of gene expression values on the remote server, using the PyGMQL Python library
    gl.set_remote_address('http://gmql.eu/gmql-rest/')
    gl.login()
    gl.set_mode('remote')

    # Load the TCGA datasets to be used in the query
    methylation_dataset = gl.load_from_remote(remote_name='GRCh38_TCGA_methylation', owner='public')
    expression_dataset = gl.load_from_remote(remote_name='GRCh38_TCGA_gene_expression', owner='public')

    # Identify the sequencing platform to be used
    if platform == 27:
        seq_platform = 'Illumina Human Methylation 27'
    elif platform == 450:
        seq_platform = 'Illumina Human Methylation 450'

    # Extract all the samples for the current tumor and platform
    # (primary or recurrent tumors without neoadjuvant treatment)
    all_methyl = methylation_dataset.meta_select((methylation_dataset['manually_curated__cases__disease_type'] == tumor) & (methylation_dataset['manually_curated__platform'] == seq_platform) & ((methylation_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor') | (methylation_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor')) & (methylation_dataset['clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))
    all_expr = expression_dataset.meta_select((expression_dataset['manually_curated__cases__disease_type'] == tumor) & ((expression_dataset['biospecimen__bio__sample_type'] == 'Primary Tumor') | (expression_dataset['biospecimen__bio__sample_type'] == 'Recurrent Tumor')) & (expression_dataset['clinical__shared__history_of_neoadjuvant_treatment'] == 'No'))

    # Gene Expression: keep only the needed region fields, and only the samples
    # whose sample barcode also appears in the selected methylation samples
    expr_0 = all_expr.reg_project(field_list=['ensembl_gene_id','entrez_gene_id','gene_symbol','fpkm'])
    expr = expr_0.meta_select(semiJoinDataset=all_methyl, semiJoinMeta=['biospecimen__bio__bcr_sample_barcode'])

    # Materialize the results into a GDataframe
    expr_Gdf = expr.materialize('./(MaterializeResults)')

    # The result dataset is loaded as a GDataframe, an object containing two pandas dataframes, one for the region data and one for the metadata.
    # Get the two pandas dataframes:
    expr_df_regs = expr_Gdf.regs
    expr_df_meta = expr_Gdf.meta
    n_regs = len(expr_df_regs)
    n_samples = len(expr_df_meta)  # not read again in this function

    # Rename 'chr', 'start', and 'stop' columns header
    expr_df_regs.rename(columns={'chr':'chrom','start':'left','stop':'right'}, inplace=True)
    # Change index into progressive integer numbers and store the name of the sample in another column
    expr_df_regs['sample_id'] = expr_df_regs.index
    expr_df_regs.index = range(n_regs)

    # Convert unknown values (NaN) to empty strings
    expr_df_regs = expr_df_regs.fillna('')

    # Convert all the metadata values into strings, since they're encoded as lists in Python
    col_names = []
    # NOTE(review): `iteritems` is the old name of `items`.
    for name, values in expr_df_meta.iteritems():
        col_names.append(name)
    for index, row in expr_df_meta.iterrows():
        for c in col_names:
            list_val = row[c] # it's encoded as a list
            str_val = ''.join(list_val)  # convert the value stored as a list in a string
            # NOTE(review): `set_value` is the old spelling of `.at[index, c] = ...`.
            expr_df_meta.set_value(index,c,str_val)

    # Since we have to extract the expression values for each distinct sample barcode (aliquot), we create a list containing these distinct identifiers
    expr_sample_barcodes_all = []
    for index, row in expr_df_meta.iterrows():
        barcode = row['biospecimen__bio__bcr_sample_barcode']
        if barcode not in expr_sample_barcodes_all: # get distinct values
            expr_sample_barcodes_all.append(barcode)

    # Check which are repeated aliquots, if present
    all_aliqouts = []
    for index, row in expr_df_meta.iterrows():
        barcode = row['biospecimen__bio__bcr_sample_barcode']
        all_aliqouts.append(barcode)
    multiple_aliquots = [item for item, count in collections.Counter(all_aliqouts).items() if count > 1]

    samples_to_remove = []
    expr_sample_barcodes = []
    if len(multiple_aliquots) != 0:
        # Among the repeated aliquots, drop the samples shipped in 2011 (keeping the more recent ones)
        for index, row in expr_df_meta.iterrows():
            year = row['biospecimen__bio__year_of_shipment']
            barcode = row['biospecimen__bio__bcr_sample_barcode']
            if (barcode in multiple_aliquots) and year == '2011':
                expr_df_meta.drop(index, inplace=True)
                samples_to_remove.append(index)

        # Import the list of aliquots in the methylation dataset
        # NOTE(review): `common_aliquots` is not defined in this function; presumably a module-level path -- confirm.
        text_file = open(common_aliquots, 'r')
        aliquots = text_file.read().split('\n')
        # raises ValueError if the file contains no empty line (e.g. no trailing newline)
        aliquots.remove('')
        text_file.close()

        # Extract the new list of distinct TCGA Aliquots to extract; samples whose barcode is not in the imported list are dropped
        for index, row in expr_df_meta.iterrows():
            barcode = row['biospecimen__bio__bcr_sample_barcode']
            if barcode in aliquots:
                if barcode not in expr_sample_barcodes:
                    expr_sample_barcodes.append(barcode)
            else:
                expr_df_meta.drop(index, inplace=True)
                samples_to_remove.append(index)

        # Remove regions that corresponded to eliminated repeated aliquots
        expr_df_regs = expr_df_regs.loc[~(expr_df_regs['sample_id'].isin(samples_to_remove))].copy()

    else:
        expr_sample_barcodes = expr_sample_barcodes_all

    # Export the metadata dataframe setting the TCGA aliquots as indexes.
    Metadata_df = expr_df_meta.copy()
    Metadata_df['id_sample'] = Metadata_df.index
    Metadata_df.set_index('biospecimen__bio__bcr_sample_barcode', inplace=True)
    # NOTE(review): `writer.save()` is the old API; the writer is not closed if `to_excel` raises.
    writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/EXPR (Metadata).xlsx')
    Metadata_df.to_excel(writer,'Sheet1')
    writer.save()

    # Extract from the expression dataset all the regions that belong to genes of interest
    expr_df_regs_interest = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(genesSYM_of_interest)].copy()
    # Extract from the expression dataset all the regions that belong to regulatory genes of genes of interest
    expr_df_regs_regulatory = expr_df_regs.loc[expr_df_regs['gene_symbol'].isin(regulatory_genesSYM)].copy()

    # Gene expression values for each gene of interest:

    # Create a dictionary for storing all the gene expression values for each gene of interest and for each aliquot TCGA
    from collections import defaultdict
    dict_expr_interest = defaultdict(dict)
    # The dictionary is still empty here, so this loop body never runs.
    for key, value in dict_expr_interest.items():
        value = defaultdict(list)

    # The main dictionary has the Gene Symbols of the genes of interest as keys and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values.
    # The idea is having a list, containing all the fpkm values, for each gene in each TCGA aliquot.

    # Set the Gene Symbol as keys of the main dictionary
    for name in genesSYM_of_interest:
        dict_expr_interest[name] = {}

    # Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes)
    for sample in expr_sample_barcodes:
        for k, v in dict_expr_interest.items():
            v[sample] = []

    # Set the values by appending the expression values for each gene of interest: these expression values (fpkm) can be found in the 'expr_df_regs_interest' dataframe
    for index, row in expr_df_regs_interest.iterrows(): # iterating along the whole dataframe
        sym = row['gene_symbol'] # get the Gene Symbol of the gene
        fpkm = row['fpkm'] # get the gene expression value
        sample = row['sample_id'] # get the name of the sample
        # get the aliquot corresponding to current sample
        # NOTE(review): `get_value` is the old spelling of `.at[sample, column]`.
        aliq = expr_df_meta.get_value(sample, 'biospecimen__bio__bcr_sample_barcode')
        # add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers
        dict_expr_interest[sym][aliq].append(round(float(fpkm),6))

    # Convert the nested dictionary also into a dataframe

    # Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct genes of interest
    # NOTE(review): `columns` is given a list wrapped in another list; confirm the resulting column index is what is intended.
    expr_interest_df1 = pd.DataFrame(index = expr_sample_barcodes, columns = [genesSYM_of_interest])

    # Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, in order to have them available if we will need it
    expr_interest_df2 = pd.DataFrame(index = expr_sample_barcodes, columns = ['Sample_ID','Tumor','Patient_ID'])

    # Create the final dataframe
    expr_interest_df = expr_interest_df1.join(expr_interest_df2)

    # Fill the previously created dataframe with the correct gene expression values, for each gene of interest and for each TCGA aliquot
    # (only the first collected value of each gene/aliquot pair is used)
    for gene_sym, dict_value in dict_expr_interest.items():
        for tcga_aliq, exp_list in dict_value.items():
            if (len(exp_list) != 0):
                fpkm = exp_list[0]
                # add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers
                expr_interest_df.set_value(tcga_aliq,gene_sym,round(fpkm,6))

    # Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot
    for index, row in expr_df_meta.iterrows():
        aliquot = row['biospecimen__bio__bcr_sample_barcode']
        tumor_tag = row['clinical__admin__disease_code']
        patient_id = row['clinical__shared__patient_id']
        expr_interest_df.set_value(aliquot,'Sample_ID',index)
        expr_interest_df.set_value(aliquot,'Tumor',tumor_tag)
        expr_interest_df.set_value(aliquot,'Patient_ID',patient_id)

    # Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each gene of interest
    additional_index = ['ENTREZ_GENE_ID']
    expr_interest_df0_1 = pd.DataFrame(index = additional_index, columns = [genesSYM_of_interest])
    expr_interest_df0_2 = pd.DataFrame(index = additional_index, columns = ['Sample_ID','Tumor','Patient_ID'])
    expr_interest_df0 = expr_interest_df0_1.join(expr_interest_df0_2)

    frames = [expr_interest_df0, expr_interest_df]
    expr_interest_df = pd.concat(frames)

    # Add for each Gene Symbol of our genes of interest the corresponding Entrez Gene ID in the first row of the dataframe
    for i, r in EntrezConversion_df.iterrows():
        entrez_id = r['ENTREZ_GENE_ID']
        gene_name = r['GENE_SYMBOL']
        expr_interest_df.set_value('ENTREZ_GENE_ID',gene_name,entrez_id)

    # Set empty strings for NaN values in the 'ENTREZ_GENE_ID' row
    expr_interest_df.set_value('ENTREZ_GENE_ID','Sample_ID',"")
    expr_interest_df.set_value('ENTREZ_GENE_ID','Tumor',"")
    expr_interest_df.set_value('ENTREZ_GENE_ID','Patient_ID',"")

    # Export the dataframe with the gene expression values for our genes of interest for each TCGA aliquot
    writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/Gene Expression - InterestGenes.xlsx')
    expr_interest_df.to_excel(writer,'Sheet1')
    writer.save()

    # Gene expression values for each candidate regulatory gene of the genes of interest:

    # Create a dictionary for storing all the gene expression values for each candidate regulatory gene and for each aliquot TCGA
    from collections import defaultdict
    dict_expr_regulatory = defaultdict(dict)
    # The dictionary is still empty here, so this loop body never runs.
    for key, value in dict_expr_regulatory.items():
        value = defaultdict(list)

    # The main dictionary has the Gene Symbols of the candidate regulatory genes as keys and each gene has another dictionary as value, which, in turn, has the different aliquots as keys and lists as values.
    # The idea is having a list, containing all the fpkm values, for each gene in each TCGA aliquot.

    # Set the Gene Symbols as keys of the main dictionary
    for name in regulatory_genesSYM:
        dict_expr_regulatory[name] = {}

    # Set the names of the samples barcodes as keys for each dictionary set as value of a specific key (genes)
    for sample in expr_sample_barcodes:
        for k, v in dict_expr_regulatory.items():
            v[sample] = []

    # Set the values by appending the expression values for each candidate regulatory gene: these expression values (fpkm) can be found in the "expr_df_regs_regulatory" dataframe
    for index, row in expr_df_regs_regulatory.iterrows(): # iterating along the whole dataframe
        sym = row['gene_symbol'] # get the Gene Symbol of the gene
        ens_id = row['ensembl_gene_id'] # get the Ensembl Gene ID
        fpkm = row['fpkm'] # get the gene expression value
        sample = row['sample_id'] # get the name of the sample
        # get the aliquot corresponding to current sample
        aliq = expr_df_meta.get_value(sample, 'biospecimen__bio__bcr_sample_barcode')
        # add the value according to the correct gene ID and TCGA aliquot, rounding it to a float with maximum 6 decimal numbers
        # (for GENCODE 22, three specific Ensembl IDs are skipped)
        if (gencode_version == 22):
            if (ens_id not in ['ENSG00000277726.3','ENSG00000275895.3','ENSGR0000214717.8']):
                dict_expr_regulatory[sym][aliq].append(round(float(fpkm),6))
        else:
            dict_expr_regulatory[sym][aliq].append(round(float(fpkm),6))

    # Convert the nested dictionary also into a dataframe

    # Create a dataframe whose row indexes are the different TCGA samples and the columns are the distinct candidate regulatory genes
    expr_regulatory_df1 = pd.DataFrame(index = expr_sample_barcodes, columns = [regulatory_genesSYM])

    # Add three additional columns for the name of the sample and the ID and barcode of the patient corresponding to each aliquot, in order to have them available if we will need it
    expr_regulatory_df2 = pd.DataFrame(index = expr_sample_barcodes, columns = ['Sample_ID','Tumor','Patient_ID'])

    # Create the final dataframe
    expr_regulatory_df = expr_regulatory_df1.join(expr_regulatory_df2)

    # Fill the previously created dataframe with the correct gene expression values, for each candidate regulatory gene and for each TCGA aliquot
    for gene_sym, dict_value in dict_expr_regulatory.items():
        for tcga_aliq, exp_list in dict_value.items():
            if (len(exp_list) != 0):
                fpkm = exp_list[0]
                # add the expression value in the proper cell of the dataframe, rounding it to a float with maximum 6 decimal numbers
                expr_regulatory_df.set_value(tcga_aliq,gene_sym,round(fpkm,6))

    # Add to the dataframe the name of each sample, the tumor code and the patient's ID in correspondence of each TCGA aliquot
    for index, row in expr_df_meta.iterrows():
        aliquot = row['biospecimen__bio__bcr_sample_barcode']
        tumor_tag = row['clinical__admin__disease_code']
        patient_id = row['clinical__shared__patient_id']
        expr_regulatory_df.set_value(aliquot,'Sample_ID',index)
        expr_regulatory_df.set_value(aliquot,'Tumor',tumor_tag)
        expr_regulatory_df.set_value(aliquot,'Patient_ID',patient_id)

    # Add a row at the beginning of the dataframe to insert also the Entrez Gene ID of each regulatory gene
    additional_index = ['ENTREZ_GENE_ID']
    expr_regulatory_df0_1 = pd.DataFrame(index = additional_index, columns = [regulatory_genesSYM])
    expr_regulatory_df0_2 = pd.DataFrame(index = additional_index, columns = ['Sample_ID','Tumor','Patient_ID'])
    expr_regulatory_df0 = expr_regulatory_df0_1.join(expr_regulatory_df0_2)

    frames = [expr_regulatory_df0, expr_regulatory_df]
    expr_regulatory_df = pd.concat(frames)

    # Add for each Gene Symbol of the regulatory genes the corresponding Entrez Gene ID in the first row of the dataframe
    # ('PTRF' is looked up in the mapping table under the symbol 'CAVIN1'; `.iloc[0]` raises IndexError if a symbol is missing from the mapping)
    for i in regulatory_genesSYM:
        if i == 'PTRF':
            entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == 'CAVIN1', 'ENTREZ_GENE_ID'].iloc[0]
        else:
            entrez_id = Mapping_df.loc[Mapping_df['GENE_SYMBOL'] == i, 'ENTREZ_GENE_ID'].iloc[0]
        expr_regulatory_df.set_value('ENTREZ_GENE_ID',i,entrez_id)

    # Set empty strings for NaN values in the 'ENTREZ_GENE_ID' row
    expr_regulatory_df.set_value('ENTREZ_GENE_ID','Sample_ID',"")
    expr_regulatory_df.set_value('ENTREZ_GENE_ID','Tumor',"")
    expr_regulatory_df.set_value('ENTREZ_GENE_ID','Patient_ID',"")

    # Export the dataframe with the gene expression values for the regulatory genes of our genes of interest for each TCGA aliquot
    writer = ExcelWriter('./3_TCGA_Data/Gene_Expression/Gene Expression - RegulatoryGenes.xlsx')
    expr_regulatory_df.to_excel(writer,'Sheet1')
    writer.save()

    return expr_interest_df, expr_regulatory_df
def parse_func(path, filename):
    """Convert an MT940 statement file into a two-sheet Excel workbook.

    ``Transactional_Data`` holds one row per transaction, joined with the
    fields of the statement it belongs to.  ``Non_Transactional_Data`` holds
    one row per statement that has no transactions.

    Both sheets are best-effort, as before:

    * if the transaction frame lacks any of the expected columns it is written
      with its raw column names instead of the friendly ones;
    * if the non-transactional sheet cannot be built (for instance because no
      statement is empty), it is skipped and the reason is logged.

    :param path: MT940 source passed to ``mt940.parse``.
    :param filename: path of the ``.xlsx`` workbook to create.
    """
    import logging

    # Raw column -> sheet header.  The key order is also the column order of
    # the sheet (relies on insertion-ordered dicts).
    transaction_columns = {
        'account_identification': 'Bank account no.',
        'date': 'Transacton date',
        'amount.currency': 'Amount currency',
        'amount.amount': 'Amount',
        'status': 'Transaction type',
        'customer_reference': 'Transaction reference',
        'transaction_reference': 'Reference no.',
        'extra_details': 'Additional reference',
        'transaction_details': 'Remarks',
        'final_opening_balance.date': 'Opening balance date',
        'final_opening_balance.status': 'Opening balance status',
        'final_opening_balance.amount.amount': 'Opening balance amount',
        'final_opening_balance.amount.currency': 'Opening balance currency',
        'entry_date': 'Entry date',
        'funds_code': 'Fund code',
        'guessed_entry_date': 'Addl. Entry date',
        'id': 'ID',
        'available_balance.date': 'Available balance date',
        'available_balance.status': 'Available balance type',
        'available_balance.amount.amount': 'Available balance',
        'available_balance.amount.currency': 'Available balance currency',
        'final_closing_balance.date': 'Ledger balance date',
        'final_closing_balance.status': 'Ledger balance type',
        'final_closing_balance.amount.amount': 'Ledger balance amount',
        'final_closing_balance.amount.currency': 'Ledger balance currency',
        'sequence_number': 'Sequence no.',
        'statement_number': 'Statement no.',
    }
    # Note: here 'transaction_reference' maps to a different header than in
    # the transaction sheet, so the two maps cannot be shared.
    balance_columns = {
        'account_identification': 'Bank account no.',
        'final_opening_balance.date': 'Opening balance date',
        'final_opening_balance.status': 'Opening balance status',
        'final_opening_balance.amount.amount': 'Opening balance amount',
        'final_opening_balance.amount.currency': 'Opening balance currency',
        'final_closing_balance.date': 'Ledger balance date',
        'final_closing_balance.status': 'Ledger balance type',
        'final_closing_balance.amount.amount': 'Ledger balance amount',
        'final_closing_balance.amount.currency': 'Ledger balance currency',
        'available_balance.date': 'Available balance date',
        'available_balance.status': 'Available balance type',
        'available_balance.amount.amount': 'Available balance',
        'available_balance.amount.currency': 'Available balance currency',
        'sequence_number': 'Sequence no.',
        'statement_number': 'Statement no.',
        'transaction_reference': 'Transaction reference',
    }

    transactions = mt940.parse(path)
    df = json_normalize(transactions)
    # Explode the per-statement transaction lists and join the statement
    # level fields back onto every transaction row.
    df2 = (pd.concat(
        {i: json_normalize(x)
         for i, x in df.pop('transactions').items()},
        sort=False).reset_index(level=1, drop=True).join(
            df, lsuffix='_in_transactions',
            rsuffix='_if_opening_NA').reset_index(drop=True))
    try:
        df2 = df2[list(transaction_columns)].rename(
            columns=transaction_columns)
    except KeyError:
        # Some expected column is missing: export the frame untouched.
        pass

    # One writer for both sheets; the context manager saves and closes it.
    with ExcelWriter(filename) as writer:
        df2.to_excel(writer, sheet_name='Transactional_Data', index=False)
        try:
            # A fresh list, so this also works when the parsed object does
            # not support slice assignment.
            empty_statements = [
                item for item in transactions if not item['transactions']
            ]
            balances = json_normalize(empty_statements)
            balances = balances[list(balance_columns)].rename(
                columns=balance_columns)
            balances.to_excel(writer,
                              sheet_name='Non_Transactional_Data',
                              index=False)
        except Exception:
            logging.getLogger(__name__).warning(
                "Non_Transactional_Data sheet not written", exc_info=True)
def main():
    """Run the whole analysis: load, clean, summarise, plot and export.

    Reads the source spreadsheet named by ``DATAFILENAME``/``DATASHEETNAME``,
    cleans it, checks the status of each URL, builds value-count summaries,
    saves bar charts and writes every result table to the workbook named by
    ``EXCEL_RESULT_STORE``.
    """
    # Set up logging: root logger at DEBUG, file handler at INFO.
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    handler = logging.FileHandler(LOGGERLOCATION)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.info('Starting...')

    # The helpers write back into the original dataframe, which makes pandas
    # emit chained-assignment warnings; they are silenced on purpose.
    pd.options.mode.chained_assignment = None

    # Import dataframe from the original xls.
    df = import_xls_to_df(DATAFILENAME, DATASHEETNAME)
    print(len(df))
    logger.info('Raw dataframe length before any processing: %r', len(df))

    # Add a column for the URL pinging response, then clean the dataframe.
    add_column(df, 'URL status')
    df = clean_data(df, 'Year First Provided')

    # Root domains (i.e. netloc) of the URLs.
    rootdomainsdf = get_root_domains(df, 'URL')

    # Status of the URL at which each piece of software is stored.
    url_check = check_url_status(df, 'URL', 'URL status')
    url_df = pd.concat([url_check['URL'], url_check['URL status']],
                       axis=1,
                       keys=['URL', 'URL status'])

    # Count unique values per column: licence type, releasing university,
    # storage domain and year of registration.
    open_source_licence = produce_count_and_na(df, 'Open Source?')
    open_source_licence.index = open_source_licence.index.fillna('No response')
    universities = produce_count_and_na(df, 'RO')
    unique_rootdomains = produce_count_and_na(rootdomainsdf, 'rootdomains')
    year_of_return = produce_count(df, 'Year First Provided')
    url_status = produce_count(df, 'URL status')

    # Years should appear chronologically, not by descending count.
    year_of_return.sort_index(inplace=True)

    # Collate all impact statements into a text file for a later word cloud.
    impact_to_txt(df, 'Impact')

    # Plot results and save charts.
    plot_bar_charts(open_source_licence, 'opensource',
                    'Is the output under an open-source licence?', None,
                    'No. of outputs', 0)
    plot_bar_charts(universities, 'universities',
                    'Top 30 universities that register the most outputs', None,
                    'No. of outputs', 30)
    plot_bar_charts(unique_rootdomains, 'rootdomain',
                    '30 most popular domains for storing outputs', None,
                    'No. of outputs', 30)
    plot_bar_charts(year_of_return, 'returnyear',
                    'When was output first registered?', None,
                    'No. of outputs', 0)

    # Write every result table to one workbook. The context manager saves and
    # closes the file even if one of the sheets fails to serialise
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    sheets = (
        (open_source_licence, 'opensource'),
        (universities, 'universities'),
        (unique_rootdomains, 'rootdomain'),
        (year_of_return, 'returnyear'),
        (url_df, 'urlstatus'),
        (url_status, 'urlstatus_summ'),
        (df, 'Resulting_df'),
    )
    with ExcelWriter(EXCEL_RESULT_STORE) as writer:
        for table, sheet_name in sheets:
            table.to_excel(writer, sheet_name=sheet_name)
a = list(decomposition(df1["discounted_price"][j])) for i in range(0, 6): if (i == 0): df1["month_1"][c] = a[i] if (i == 1): df1["month_2"][c] = a[i] if (i == 2): df1["month_3"][c] = a[i] if (i == 3): df1["month_4"][c] = a[i] if (i == 4): df1["month_5"][c] = a[i] if (i == 5): df1["month_6"][c] = a[i] c = c + 1 print("ho ho ho ho") df1.insert(9, "cancel", 0) user_id = pd.Series([]) cancel = pd.Series([]) for k in range(len(df1)): if df1["user_id"][k] == '0': df1["user_id"][k] = (random.randrange(1, 150, 3)) df1["cancel"][k] = (random.randrange(0, 3, 1)) df2 = df1.groupby("user_id").sum() print(df2) writer = ExcelWriter('new.xls') df2.to_excel(writer, 'Sheet1') writer.save()
def fix_gene_protein_inconsistencies(config, hgnc_df_filename, log):
    """Create the curated antibody/gene/protein Excel workbook.

    The workbook ``antibody-gene-protein-map.xlsx`` is written under
    ``config['protein']['corrected_aa_file_dir']`` with two sheets:
    ``HGNC_validated_genes`` (output of ``parse_hgnc``) and
    ``antibody-gene-protein-map`` (the ranked, validated map).

    :param config: configuration mapping; ``config['protein']
        ['corrected_aa_file_dir']`` is used as a path prefix (it is
        concatenated, so it must end with a path separator).
    :param hgnc_df_filename: passed through to ``parse_hgnc``.
    :param log: logger-like object; only ``log.info`` is used here.
    """
    log.info('\tstart fixing gene/protein inconsistencies')
    corrected_aa_files_dir = config['protein']['corrected_aa_file_dir']
    workbook_path = corrected_aa_files_dir + 'antibody-gene-protein-map.xlsx'

    # One context-managed writer replaces the two explicit save() calls: the
    # workbook is saved exactly once and closed even when a step raises.
    with ExcelWriter(workbook_path) as writer:
        antibody_to_gene_map, antibody_to_protein_map = \
            get_antibody_gene_protein_map(config, log)

        # Create a dataframe to work with.
        log.info('\t\tcreate combined antibody to gene/protein map')
        aa_map = []
        for i in antibody_to_gene_map:
            # Count the non-empty names. len(filter(...)) only worked on
            # Python 2, where filter() returned a list.
            num_gene_names = len(
                [name for name in antibody_to_gene_map[i] if len(name)])
            num_protein_names = len(
                [name for name in antibody_to_protein_map[i] if len(name)])
            aa_map.append({
                'composite_element_ref': i,
                'gene_name': antibody_to_gene_map[i],
                'protein_name': antibody_to_protein_map[i],
                'num_genes': num_gene_names,
                'num_proteins': num_protein_names
            })
        data_library = pd.DataFrame(aa_map)

        # ---------------------------- part 1 ----------------------------
        # Check other potential protein and gene names.
        protein_lists = data_library['protein_name'].tolist()
        gene_lists = data_library['gene_name'].tolist()
        protein_names = [item for i in protein_lists for item in i]
        gene_names = [item for i in gene_lists for item in i]
        data_library.loc[:, 'other_protein_names'] = \
            data_library['protein_name'].map(
                lambda x: get_variants_of_name(x[0], protein_names))
        data_library.loc[:, 'other_gene_names'] = \
            data_library['gene_name'].map(
                lambda x: get_variants_of_name(x[0], gene_names))
        data_library.loc[:, 'final_curated_protein'] = \
            data_library['protein_name']

        # ---------------------------- part 2 ----------------------------
        # HGNC validation.
        hgnc_df = parse_hgnc(hgnc_df_filename, log)
        hgnc_df.to_excel(writer, sheet_name='HGNC_validated_genes')

        # HACK: a record may list several genes; validate each one and join
        # the per-gene results with spaces.
        log.info('\t\tcombine multiple genes in record')
        for idx, row in data_library.iterrows():
            record = row.to_dict()
            all_val_statuses = []
            all_val_genes = []
            additional_notes = ''
            for genelist in record['gene_name']:
                ind_val_status = []
                ind_val_gene = []
                ind_val_notes = []
                for gene in genelist.split():
                    val_status, val_gene, note = hgnc_validation(gene, hgnc_df)
                    ind_val_status.append(val_status)
                    ind_val_gene.append(val_gene)
                    ind_val_notes.append(note)
                all_val_statuses.append(" ".join(ind_val_status))
                all_val_genes.append(" ".join(ind_val_gene))
                # As in the original, only the notes of the last gene list
                # survive this loop.
                additional_notes = ";".join(list(set(ind_val_notes)))

            data_library.loc[idx, 'HGNC_validation_status'] = all_val_statuses
            data_library.loc[idx, 'final_curated_gene'] = all_val_genes
            additional_notes = additional_notes.strip()
            if additional_notes:
                data_library.loc[idx, 'additional_notes'] = additional_notes

        # ---------------------- rank the dataframe ----------------------
        log.info('\t\trank records')
        data_library = rank_dataframe(data_library)
        # DataFrame.sort() was removed from pandas; sort_values() is the
        # equivalent call.
        data_library = data_library.sort_values(['row_rank'], ascending=[True])
        col_order = [
            'composite_element_ref', 'num_genes', 'num_proteins', 'gene_name',
            'protein_name', 'HGNC_validation_status', 'other_protein_names',
            'other_gene_names', 'final_curated_gene', 'final_curated_protein',
            'row_rank', 'notes', 'additional_notes'
        ]
        data_library.to_excel(writer,
                              sheet_name='antibody-gene-protein-map',
                              index=False,
                              columns=col_order)

    log.info('\tdone fixing gene/protein inconsistencies')
def _save_xlsx_or_csv(frame, xlsx_path, csv_path, label):
    """Write *frame* to *xlsx_path*; on any failure fall back to a '|' CSV.

    Returns True when the XLSX file was written, False when the CSV fallback
    was used instead.
    """
    try:
        with ExcelWriter(xlsx_path) as writer:
            frame.to_excel(writer, index=False)
    except Exception as e:  # deliberate best-effort: any failure -> CSV
        print(f"Error saving {label} file as XLSX: {e}")
        frame.to_csv(csv_path, index=False, sep='|', encoding='UTF-8')
        return False
    return True


def _aggregate_vendors(data_dir):
    """Merge every ``vendorsOf_*`` workbook under *data_dir* and export it.

    Returns the number of distinct vendor rows that were exported.
    """
    columns = ['Name', 'URL', 'Phone', 'Address', 'Category', 'Industry']

    print("Scanning files ...")
    frames = []
    for root, _dirs, files in os.walk(data_dir):
        for name in files:
            if name.split('_')[0] != "vendorsOf":
                continue
            print(f"Adding : {name} ..")
            frames.append(pd.read_excel(os.path.join(root, name)))

    # DataFrame.append() was removed in pandas 2.0; concatenate once instead.
    if frames:
        agg_data_df = pd.concat(frames, ignore_index=True, sort=False)
    else:
        agg_data_df = pd.DataFrame(columns=columns)
    # Keep exactly the expected columns (missing ones become NaN).
    agg_data_df = agg_data_df.reindex(columns=columns)

    all_vendors_df = agg_data_df.sort_values('Name').drop_duplicates('URL')
    print(f"Found {len(all_vendors_df.index)} distinct product URLs")

    xlsx_path = os.path.join(data_dir, 'all_vendors.xlsx')
    csv_path = os.path.join(data_dir, 'all_vendors.csv')
    if _save_xlsx_or_csv(all_vendors_df, xlsx_path, csv_path, 'recap'):
        print(f"Found {len(all_vendors_df.index)} distinct vendors !\n"
              f"Recap file saved as XLSX at {xlsx_path}")
    else:
        print(f"Recap file saved as CSV at {csv_path}")

    # The sample is the first 500 rows of the recap.
    sample_vendors_df = all_vendors_df.iloc[0:500, :]
    xlsx_path = os.path.join(data_dir, 'sample_vendors.xlsx')
    csv_path = os.path.join(data_dir, 'sample_vendors.csv')
    if _save_xlsx_or_csv(sample_vendors_df, xlsx_path, csv_path, 'sample'):
        print(f"Sample file saved as XLSX at {xlsx_path}")
    else:
        print(f"Sample file saved as CSV at {csv_path}\nDONE")

    return len(all_vendors_df.index)


def PostProcess(data_dir='./out', log_dir=None):
    """Aggregate all ``vendorsOf_*`` workbooks into recap and sample files.

    Every file under *data_dir* whose name starts with ``vendorsOf_`` is read
    with ``pd.read_excel``; the rows are sorted by ``Name`` and de-duplicated
    on ``URL``. The result is saved as ``all_vendors.xlsx`` and its first 500
    rows as ``sample_vendors.xlsx``, both inside *data_dir*. If an XLSX file
    cannot be written, a ``|``-separated CSV with the same base name is
    written instead.

    :param data_dir: directory that is scanned and that receives the outputs.
    :param log_dir: optional directory; when given, progress output is
        appended to ``postProcess.log`` there instead of going to stdout.
    :return: 0 on completion.
    """
    log_file = None
    old_stdout = sys.stdout
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
        log_file = open(os.path.join(log_dir, "postProcess.log"), "a")
        sys.stdout = log_file
    try:
        vendor_count = _aggregate_vendors(data_dir)
    finally:
        # Always give stdout back, even when aggregation fails.
        if log_file is not None:
            sys.stdout = old_stdout
            log_file.close()

    print(f"Found {vendor_count} distinct vendor URLs")
    print("PostProcess : DONE.")
    return 0
def main(self):
    """Export the monthly lateness and missing-attendance workbooks.

    Calls ``self.lateAnalyzer()`` and ``self.missAnalyzer()``, relabels the
    returned frames with their report headers and writes each group of frames
    to its own workbook under the ``self.op`` path prefix. The file names are
    built from ``self.thisYear`` and ``self.thisMonth``.

    NOTE: the overtime report (``self.extraAnalyzer()``) used to be exported
    here as well; it was already disabled (commented out) and that dead code
    has been dropped from this method.
    """
    late_detail_columns = [
        '考勤号码', '姓名', '部门', '日期', '时间', '签到时间', '上班时间', '迟到时长', '迟到时长(分钟)'
    ]

    lateAllDf, late2HDf, late2HomitDf, lateSummary, lateSummaryDep = \
        self.lateAnalyzer()
    late_name = self.thisYear + '年' + self.thisMonth + '月迟到情况统计汇总表.xlsx'
    late_sheets = [
        (lateAllDf, late_detail_columns, '个人明细'),
        (late2HDf, late_detail_columns, '迟到2小时以内'),
        (late2HomitDf, late_detail_columns, '去除免责因素以外迟到'),
        (lateSummary, [
            '考勤号码', '姓名', '部门', '去除免责是否月均迟到超过3次', '去除免责迟到次数',
            '是否月均迟到2小时以内超过3次', '迟到2小时以内次数'
        ], '个人汇总'),
        (lateSummaryDep, [
            '部门', '人数', self.thisMonth + '月工作日数', '应出勤总天数', '去除免责总迟到人次',
            '去除免责月均迟到率', '去除免责月均迟到超过3次人数', '去除免责月均迟到超过3次人数占部门人数比例',
            '去除免责月均迟到超过3次人次', '去除免责月均迟到超过3次人次占部门迟到人次比例', '迟到2小时以内总人次',
            '月均2小时以内迟到率', '月均迟到2小时以内超过3次人数', '月均迟到2小时以内超过3次人数占部门人数比例',
            '月均迟到2小时以内超过3次人次', '月均迟到2小时以内超过3次人次占部门迟到人次比例'
        ], '部门汇总'),
    ]
    # The context manager saves once and closes the workbook even on error
    # (the old save()+close() pair saved twice and breaks on pandas >= 2.0).
    with ExcelWriter(self.op + late_name) as lateXlsx:
        for frame, headers, sheet_name in late_sheets:
            frame.columns = list(headers)
            frame.to_excel(lateXlsx, sheet_name=sheet_name, index=False)
    print('-----------' + late_name + ' 已生成-----------')

    # Original author's note: this works around missAnalyzer data that is
    # temporarily missing from wideTable.
    missAllDf, missPersionDf, missDepDf = self.missAnalyzer()
    miss_name = self.thisYear + '年' + self.thisMonth + '月无考勤情况统计汇总表.xlsx'
    miss_sheets = [
        # NOTE(review): the original author flagged "bug" next to this sheet
        # without further detail.
        (missDepDf, [
            '部门', '部门人数', '部门工作日无考勤人次', '人均无考勤人次', '当月累计无考勤天数超过10个工作日的人数'
        ], '部门汇总'),
        (missPersionDf, [
            '考勤号码', '姓名', '部门', '无考勤天数', '当月累计无考勤天数是否超过10个工作日'
        ], '个人汇总'),
        (missAllDf, [
            '考勤号码', '姓名', '部门', '日期', '时间', '签到时间', '上班时间', '迟到时长', '加班时长(分钟)'
        ], '考勤原始明细'),
    ]
    with ExcelWriter(self.op + miss_name) as missXlsx:
        for frame, headers, sheet_name in miss_sheets:
            frame.columns = list(headers)
            frame.to_excel(missXlsx, sheet_name=sheet_name, index=False)
    print('-----------' + miss_name + ' 已生成-----------')
def saveToExcel(df,filename,tab):
    """Write *df* to a new Excel workbook.

    :param df: object exposing ``to_excel`` (e.g. a pandas DataFrame).
    :param filename: path of the workbook to create.
    :param tab: name of the sheet that receives the data.
    """
    # The context manager saves and closes the file even if to_excel raises;
    # ExcelWriter.save() no longer exists in pandas >= 2.0.
    with ExcelWriter(filename) as writer:
        df.to_excel(writer, sheet_name=tab)
def write_to_excel(exl_list, cols):
    """Export rows to ``static/download/URLLookupService.xlsx``.

    :param exl_list: row data accepted by ``pd.DataFrame``.
    :param cols: column labels for the resulting frame.

    The data goes to a sheet named ``URLLookUpService`` without the index.
    The target path is relative to the current working directory.
    """
    df1 = pd.DataFrame(exl_list, columns=cols)
    # Context manager: saves and closes the workbook even on failure
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    with ExcelWriter('static/download/URLLookupService.xlsx') as writer:
        df1.to_excel(writer, sheet_name='URLLookUpService', index=False)
def create_excel(dt,name):
    """Write *dt* to sheet ``Sheet1`` of a new workbook at path *name*.

    :param dt: object exposing ``to_excel`` (e.g. a pandas DataFrame).
    :param name: path of the workbook to create.
    """
    # Context manager: saves and closes the workbook even on failure
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    with ExcelWriter(name) as writer:
        dt.to_excel(writer, sheet_name='Sheet1')
# identify 50 metro areas in our list df1['in50'] = df1.apply(metro_matching, axis=1) # Only retain rows for 50 metro areas in our list df1 = df1[df1['in50']==1] # create a column with names from our list for consistency df2 = df1.copy() df2['Metro name'] = df1['metro_name'].apply(std_name) # drop the unnecessary columns and bring the last column first df2.drop(columns=['Place','metro_name', 'state_name', 'in50'], inplace=True) cols = list(df2.columns) cols = [cols[-1]] + cols[:-1] df2 = df2[cols] return df2 # create individual dataframes and concatenate them along the index df_AL = dfCreator(HPI_AL) df_MZ = dfCreator(HPI_MZ) df_joined = pd.concat([df_AL, df_MZ], ignore_index=True) # write it in an Excel file w = ExcelWriter('House Price Index.xlsx') df_joined.to_excel(w) w.save()
copia1 = pd.DataFrame({
    'Nombre': nombres,
    'Primer Apellido': apellido,
    'Segundo Apellido': ['Medina', 'Montoya', 'Peña']
})
# Select the columns explicitly so they come out in a fixed order.
copia1 = copia1[['Nombre', 'Primer Apellido', 'Segundo Apellido']]
# Create and save the file. index=False avoids an extra column holding the
# row numbers. The context manager saves and closes the workbook; the
# previous save() + close() pair saved it twice.
with ExcelWriter('copia1.xls') as archivo:
    copia1.to_excel(archivo, sheet_name='Hoja Copia', index=False)

# EXERCISE 2
File = pd.ExcelFile('Libro2.xls')
hoja1 = File.parse('Hoja1')
paises = hoja1['Paises'].values
copia1 = pd.DataFrame({
    'Pais': paises,
    'Capital': ['Ottawa', 'Ciudad de México', 'Bogotá', 'Madrid', 'Moscú']
})
copia1 = copia1[['Pais', 'Capital']]
def main():
    """Build ``Report-1.xls`` from ``google.csv``.

    Steps:
      1. Read ``google.csv`` and derive ``Month``/``Year`` from its ``date``
         column.
      2. Pivot the ``Open`` and ``High`` counts by month and year and write
         the pivot to ``temp.xls``.
      3. Write the pivot twice, side by side, into ``temp1.xls``.
      4. Copy ``temp1.xls`` cell by cell into a styled sheet
         ``Sample_Report`` and save it as ``Report-1.xls``.
    """
    re_style = ReportStyles()
    csv_file = pd.read_csv('google.csv')

    # Parse the dates once; the original rebuilt the DatetimeIndex on every
    # row, which made this step quadratic.
    dates = pd.DatetimeIndex(csv_file['date'])
    frame = pd.DataFrame({
        'Stock': csv_file['Stock'].tolist(),
        'Month': dates.month.tolist(),
        'Year': dates.year.tolist(),
    })

    # Merge the derived columns back onto the raw data.
    # NOTE(review): merging on 'Stock' pairs every row with every row that
    # shares its 'Stock' value - confirm this is intended.
    csv_file = pd.merge(csv_file, frame, on='Stock')

    # 'rows'/'cols' were renamed to 'index'/'columns' in pandas.
    piv = csv_file.pivot_table(values=['Open', 'High'],
                               index='Month',
                               columns='Year',
                               margins=True,
                               aggfunc='count')
    piv.to_excel('temp.xls')

    # Width of the pivot sheet; used to place the second copy.
    col_cnt = open_workbook('temp.xls').sheet_by_index(0).ncols

    # 'sheetname' was renamed to 'sheet_name' in pandas. One read is enough:
    # both copies come from the same sheet.
    pivot_sheet = pd.read_excel(io='temp.xls', sheet_name='Sheet1')
    with ExcelWriter('temp1.xls') as writer:
        pivot_sheet.to_excel(writer, sheet_name='Sheet1', startcol=0,
                             startrow=2)
        pivot_sheet.to_excel(writer, sheet_name='Sheet1',
                             startcol=(col_cnt + 2), startrow=2)

    sheet0 = open_workbook('temp1.xls').sheet_by_index(0)
    col_cnt1 = sheet0.ncols
    row_cnt1 = sheet0.nrows

    currency = XFStyle()
    currency.borders = re_style.borders_light()
    currency.alignment = re_style.align_hor_right()
    currency.num_format_str = "[$$-409]#,##0.00;-[$$-409]#,##0.00"

    headings = XFStyle()
    headings.borders = re_style.borders_light()
    headings.alignment = re_style.align_hor_center()
    headings.font = re_style.text_bold()

    no_borders = XFStyle()
    no_borders.borders = re_style.no_borders()

    wb = Workbook()
    ws = wb.add_sheet('Sample_Report', cell_overwrite_ok=True)
    for row in range(row_cnt1):
        for col in range(col_cnt1):
            val = sheet0.cell_value(row, col)
            if row < 2:
                style = no_borders
            elif col == (col_cnt + 2):
                style = headings
            elif row > 4 and col > 0:
                # The original had a further
                # `row > 4 and col > col_cnt + 3` branch; it could never be
                # reached because this condition already covers it.
                style = currency
            else:
                style = headings
            ws.row(row).write(col, val, style)
    wb.save('Report-1.xls')
def __process_zhejiang_IMEI(callFailData, path, file_pre, cs_ps):
    """Export per-model and overall call-failure counts for Zhejiang IMEIs.

    For every ``<external model>_<file stem>`` entry in
    ``./config/云诊断内销浙江统计机型列表.txt`` the rows of *callFailData* whose
    ``外部机型`` column equals the external model are selected, and the IMEIs
    that also appear in ``./zhejiang_imei/<file stem>.txt`` are written, with
    their row counts, to a sheet named after the entry. A final sheet ``all``
    lists every match sorted by count, descending. The rows of the (up to) ten
    most frequent IMEIs are then written to a second workbook.

    :param callFailData: DataFrame with at least ``外部机型`` and ``imei``.
    :param path: output directory.
    :param file_pre: prefix of both output file names.
    :param cs_ps: suffix inserted before the file extension.
    """
    model_list_path = os.path.join('.', 'config', '云诊断内销浙江统计机型列表.txt')
    with open(model_list_path, 'r') as model_list_fp:
        # Blank lines would otherwise fail on the '_' split below.
        modelList = [line.strip() for line in model_list_fp if line.strip()]

    # NOTE(review): xlsxwriter produces the xlsx format although the file is
    # named '.xls'; the name is kept for compatibility with existing users.
    xls_fileName = os.path.join(path,
                                file_pre + '_数据分析结果_浙江IMEI' + cs_ps + '.xls')
    list_result = []
    before = str(callFailData.shape[0])
    with xlsxwriter.Workbook(xls_fileName) as workbook:
        # --- filter and compare each model, one sheet per model
        for model in modelList:
            parts = model.split('_')
            model0 = parts[0]
            model1 = parts[1]
            worksheet = workbook.add_worksheet(model)
            worksheet.set_column('A:A', 20)

            callFailData_after = callFailData[callFailData['外部机型'] == model0]
            after = str(callFailData_after.shape[0])
            print('开始过滤' + model + '...' + after + '/' + before)

            # IMEIs present in the dataframe, as stripped strings.
            imei_in_data = {
                str(imei).strip()
                for imei in callFailData_after['imei'].tolist()
            }
            # Zhejiang IMEIs listed in the per-model file.
            imei_path = os.path.join('.', 'zhejiang_imei', model1 + '.txt')
            with open(imei_path, 'r') as imeiFile_fp:
                imei_zhejiang = {line.strip() for line in imeiFile_fp}
            imei_common = imei_in_data & imei_zhejiang

            # value_counts() is sorted by count, so rows come out most
            # frequent first. Set membership replaces the nested scan.
            imei_counts = callFailData_after['imei'].value_counts()
            row_i = 0
            for imei, count in zip(imei_counts.index.tolist(),
                                   imei_counts.values):
                if imei in imei_common:
                    worksheet.write(row_i, 0, imei)
                    worksheet.write(row_i, 1, count)
                    list_result.append((imei, count))
                    row_i += 1

        # --- overall ranking of all matched Zhejiang IMEIs
        print('ouput all...')
        worksheet = workbook.add_worksheet('all')
        worksheet.set_column('A:A', 20)
        mylist = sorted(list_result, key=lambda t: t[1], reverse=True)
        for i, (imei, count) in enumerate(mylist):
            worksheet.write(i, 0, imei)
            worksheet.write(i, 1, count)

    # Detail rows of the ten most frequent IMEIs. The original combine loop
    # started at index 1 and therefore dropped the top-ranked IMEI.
    top_frames = [
        callFailData[callFailData['imei'] == imei] for imei, _ in mylist[:10]
    ]
    if top_frames:
        callFailData_internal = pd.concat(top_frames, ignore_index=True)
    else:
        callFailData_internal = pd.DataFrame(columns=callFailData.columns)

    xls_fileName1 = os.path.join(
        path, file_pre + '_数据分析结果_浙江IMEI详细信息' + cs_ps + '.xlsx')
    with ExcelWriter(xls_fileName1) as writer:
        callFailData_internal.to_excel(writer, sheet_name='data')
# Basic arithmetic works directly on Series.
tamaño2=tamaño1+5
# Import ExcelWriter so a new workbook with new sheets can be created.
from pandas import ExcelWriter
# Save the series. The context manager saves and closes the workbook even if
# one of the writes fails.
# NOTE(review): tamaño1 and tamaño2 are both written to 'Hoja2' - confirm
# whether the second one was meant to go to its own sheet.
with ExcelWriter('Copia1.xls') as file:
    tamaño0.to_excel(file,sheet_name='Hoja1')
    tamaño1.to_excel(file,sheet_name='Hoja2')
    tamaño2.to_excel(file,sheet_name='Hoja2')
File=pd.ExcelFile('Libro2.xls')
hoja1=File.parse('Hoja1')
paises=hoja1['Paises'].values
print(paises)
# Assign one variable to each element of the column.
a=paises[0]
b=paises[1]
c=paises[2]
buggroup=bug.getElementsByTagName('BugGroup')[0].firstChild.nodeValue bugcode=bug.getElementsByTagName('BugCode')[0].firstChild.nodeValue bugmessage=bug.getElementsByTagName('BugMessage')[0].firstChild.nodeValue buildid=bug.getElementsByTagName('BugTrace')[0].getElementsByTagName('BuildId')[0].firstChild.nodeValue assessmentreportfile=bug.getElementsByTagName('BugTrace')[0].getElementsByTagName('AssessmentReportFile')[0].firstChild.nodeValue for buglocation in buglocations: locations=buglocation.getElementsByTagName('Location') for location in locations: loc_id=location.getAttribute('id') is_primary=location.getAttribute('primary') sourcefile=location.getElementsByTagName('SourceFile')[0].firstChild.nodeValue startline=location.getElementsByTagName('StartLine')[0].firstChild.nodeValue endline=location.getElementsByTagName('EndLine')[0].firstChild.nodeValue parasoftbugs.set_value(rowcount,'SourceFile',sourcefile) parasoftbugs.set_value(rowcount,'Bug Group',buggroup) parasoftbugs.set_value(rowcount,'Bug Code',bugcode) parasoftbugs.set_value(rowcount,'Bug Message',bugmessage) parasoftbugs.set_value(rowcount,'Build ID',buildid) parasoftbugs.set_value(rowcount,'AssessmentReportFile',assessmentreportfile) parasoftbugs.set_value(rowcount,'Location ID',loc_id) parasoftbugs.set_value(rowcount,'Primary',is_primary) parasoftbugs.set_value(rowcount,'StartLine',startline) parasoftbugs.set_value(rowcount,'EndLine',endline) # if location.hasElement('EndLine'): # endline=location.getElementsByTagName('EndLine')[0].firstChild.nodeValue # parasoftbugs.set_value(rowcount,'EndLine',endline) rowcount+=1 xclwrite=ExcelWriter('sql_cpp_bugs.xlsx') parasoftbugs.to_excel(xclwrite,'Sheet 1',index=True) xclwrite.save()
def generar_excel(dfautos, nombre):
    """Write *dfautos* to sheet ``muestras`` of ``<nombre>.xlsx``.

    :param dfautos: object exposing ``to_excel`` (e.g. a pandas DataFrame).
    :param nombre: output path without the ``.xlsx`` extension.

    Prints ``ok`` once the workbook has been written.
    """
    nombre_final = nombre + '.xlsx'
    # Context manager: saves and closes the workbook even on failure
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    with ExcelWriter(nombre_final) as writer:
        dfautos.to_excel(writer, sheet_name='muestras')
    print('ok')
def generar_df_excel(df, nombre_archivo):
    """Write *df* to sheet ``Muestras`` of ``<nombre_archivo>.xlsx``.

    :param df: object exposing ``to_excel`` (e.g. a pandas DataFrame).
    :param nombre_archivo: output path without the ``.xlsx`` extension.

    Prints ``OK`` once the workbook has been written.
    """
    nombre_final = nombre_archivo + '.xlsx'
    # Context manager: saves and closes the workbook even on failure
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    with ExcelWriter(nombre_final) as writer:
        df.to_excel(writer, sheet_name='Muestras')
    print('OK')
def func_tt_vardescrible(train, test, train_cols, save_path, tag, file_tag: str):
    """Compare per-variable statistics between two samples and export them.

    For every name in *train_cols* that is a column of *train* (except
    ``'intercept'``), ``cal_ks_tt`` is run on both samples. The two grouped
    tables are placed side by side under the keys ``TRAIN`` and ``CROSS``, a
    per-bin PSI column is added, and the table is appended to the sheet
    ``var_details``. A sheet ``summary`` lists KS, IV, group count, PSI and a
    monotonicity flag per variable, sorted by ``ks_test`` descending.

    :param train: first sample (DataFrame).
    :param test: second sample (DataFrame).
    :param train_cols: iterable of column names to analyse.
    :param save_path: prefix of the output path; the workbook is
        ``<save_path>_train_test_compare_<file_tag>.xlsx``.
    :param tag: passed through to ``cal_ks_tt`` (presumably the target
        column - confirm against ``cal_ks_tt``).
    :param file_tag: suffix used in the workbook name.
    """
    from pandas import ExcelWriter

    train_i = train.copy()
    test_i = test.copy()
    varname_list = []
    varks_list = []
    ks_j_ = []
    ks_i_ = []
    iv_i_ = []
    iv_j_ = []
    check = []
    group = []
    psi_all = []
    num = 0  # next free row on the 'var_details' sheet

    # Context manager: saves and closes the workbook even on failure
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    with ExcelWriter(save_path + '_train_test_compare_%s.xlsx' % file_tag) as writer:
        for i in train_cols:
            print('turn to ', i)
            if i not in train.columns or i == 'intercept':
                continue

            ks_i, iv_i, df_gp1 = cal_ks_tt(train_i, i, tag)
            ks_j, iv_j, df_gp2 = cal_ks_tt(test_i, i, tag)
            # The last four characters of the column name are dropped
            # (presumably a suffix such as '_woe' - confirm).
            varname_list.append(i[:-4])
            varks_list.append(abs(ks_i - ks_j))
            ks_j_.append(ks_j)
            ks_i_.append(ks_i)
            iv_i_.append(iv_i)
            iv_j_.append(iv_j)
            group.append(df_gp1.shape[0])

            # Align both tables on the variable's value before joining.
            df_gp1 = df_gp1.reset_index()
            df_gp2 = df_gp2.reset_index()
            df_gp1.index = df_gp1[i]
            df_gp2.index = df_gp2[i]
            df_describle = pd.concat([df_gp1, df_gp2],
                                     axis=1,
                                     keys=['TRAIN', 'CROSS'],
                                     sort=False)
            df_describle = df_describle.reset_index(drop=True)

            train_pct = df_describle[('TRAIN', 'pct_bin')]
            cross_pct = df_describle[('CROSS', 'pct_bin')]
            df_describle['PSI'] = (train_pct - cross_pct) * np.log(
                train_pct / cross_pct)
            psi = sum(ii for ii in df_describle['PSI'] if not pd.isnull(ii))
            psi_all.append(psi)

            # The second sample can turn out non-monotonic in WoE, so flag it.
            # The frame is keyed 'CROSS' (see the concat above); the original
            # looked up 'TEST', which does not exist and raised KeyError.
            test_woe = df_describle[('CROSS', 'Woe')].tolist()
            if pd.Series(test_woe).is_monotonic_decreasing or pd.Series(
                    test_woe).is_monotonic_increasing:
                check.append(0)
            else:
                check.append(1)

            df_describle.to_excel(writer, sheet_name='var_details', startrow=num)
            num += len(df_describle) + 4  # leave a gap before the next table

        test_ks = pd.DataFrame({
            'var': varname_list,
            'ks_train': ks_i_,
            'ks_test': ks_j_,
            'ks_dif': varks_list,
            'iv_train': iv_i_,
            'iv_test': iv_j_,
            'check': check,
            'group': group,
            'PSI': psi_all
        })
        ks_sort = test_ks.sort_values('ks_test', ascending=False)[[
            'var', 'iv_train', 'iv_test', 'ks_train', 'ks_test', 'ks_dif',
            'group', 'check', 'PSI'
        ]]
        ks_sort.to_excel(writer, sheet_name='summary', startrow=0)
def f(v):
    """Callback: refresh the screen results and render the result tables.

    Loads the current screen results with ``get_stocks(filePath1)``, merges
    them with the ``previous`` database table, writes the merge to
    ``Temp.xlsx`` and the current results to ``ScreenOutput.xlsx`` (both next
    to ``filePath2``), replaces the ``previous`` table with the current
    results and returns a row with two table cards.

    :param v: triggering value; ``None`` aborts via ``PreventUpdate``.
    :return: ``dbc.Row`` holding the "changes" table and the results table.
    """
    if v is None:
        raise PreventUpdate
    print(v)

    # df holds every stock that passed the screen.
    df = get_stocks(filePath1)
    # df2 holds the results of the previous run.
    df2 = pd.read_sql_table('previous',
                            'postgresql://*****:*****@localhost/test')
    # Only needed by the disabled new-stock detection described below.
    date_added = dt.date.today()

    temp = pd.merge(df,
                    df2,
                    on='Stock',
                    how='left',
                    suffixes=('_left', '_right'))
    print("Temp 2:")
    print(temp)

    newFile = os.path.dirname(filePath2) + "/Temp.xlsx"
    print(filePath2)
    print(newFile)
    # Context manager: saves and closes the workbook even on failure
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    with ExcelWriter(newFile) as writer:
        temp.to_excel(writer, sheet_name="Sheet1")

    # TODO: the comparison against the previous run is disabled, so this
    # frame is always empty. Intended logic per the original notes:
    #   - stock present in both runs            -> unchanged
    #   - stock only in the new results         -> new, "Date added" = today
    #   - stock only in the previous run        -> no longer valid, delete it
    exportListNew = pd.DataFrame(columns=['Stock', "Date added"])
    print(exportListNew)

    if exportListNew.empty:
        print("Keine Änderungen seit gestern.")
        nochange = {
            'Stock': ['Keine Änderung seit '],
            'Date added': ['1900-01-01']
        }
        changes = pd.DataFrame(nochange)
    else:
        changes = exportListNew
    table2 = dbc.Table.from_dataframe(changes,
                                      striped=True,
                                      bordered=True,
                                      responsive=True,
                                      hover=True)
    table1 = dbc.Table.from_dataframe(df,
                                      striped=True,
                                      bordered=True,
                                      responsive=True,
                                      hover=True)

    # Clear the db table before saving the latest stocks. Failure is
    # tolerated (best effort), but only for ordinary exceptions.
    try:
        db.session.query(Previous).delete()
        db.session.commit()
        print("Clear db committed")
    except Exception:
        db.session.rollback()
        print("DB rollback scenario")

    # Save the current results to the db.
    pg = pd.DataFrame(df)
    pg.to_sql("previous", con=db.engine, if_exists='replace', index=False)

    newFile = os.path.dirname(filePath2) + "/ScreenOutput.xlsx"
    print(filePath2)
    print(newFile)
    with ExcelWriter(newFile) as writer:
        df.to_excel(writer, sheet_name="Sheet1")

    return dbc.Row(children=[dbc.Card(table2), dbc.Card(table1)])
portfolios = ['Warrant Trading', 'CBBC Trading']
portfolio_data = {
    portfolio: make_portfolio_data(portfolio)
    for portfolio in portfolios
}

# In[ ]:

# Export every table of every portfolio. The context manager saves and closes
# the workbook even if one of the writes fails.
# NOTE(review): `pivot` and `pivot_to_display` are both written to the sheet
# '<portfolio> pivot' - confirm whether the second one needs its own sheet.
with ExcelWriter(output_file_path) as excel_writer:
    for portfolio, (pivot, pivot_to_display,
                    plot_pivot) in portfolio_data.items():
        pivot.to_excel(excel_writer, sheet_name='{} pivot'.format(portfolio))
        pivot_to_display.to_excel(excel_writer,
                                  sheet_name='{} pivot'.format(portfolio))
        plot_pivot.to_excel(excel_writer,
                            sheet_name='{} plot_pivot'.format(portfolio))

# In[ ]:

# One bar chart per portfolio, side by side.
with sns.axes_style('dark', {'font.family': ['SimHei'], 'axes.grid': False}):
    fig, axes = plt.subplots(ncols=2)
    fig.set_figheight(5)
    fig.set_figwidth(15)
    for portfolio, ax in zip(portfolios, axes):
        _, _, plot_pivot = portfolio_data[portfolio]
        sns.barplot(x='Broker', y='Turnover', data=plot_pivot, ax=ax)
        ax.set_yticklabels([])
        ax.set_ylim(100)
def _read_csv_rows(path, n_fields):
    """Return the comma-split fields of every non-blank line of *path*.

    Raises ValueError when a line does not have exactly *n_fields* fields.
    """
    rows = []
    with open(path, 'r') as file_reader:
        for line in file_reader:
            if not line.strip():
                continue
            fields = line.split(",")
            if len(fields) != n_fields:
                raise ValueError(
                    "%s: expected %d comma-separated fields, got %d" %
                    (path, n_fields, len(fields)))
            rows.append(fields)
    return rows


def _read_last_float(path, n_fields):
    """Return the last field of every record in *path* as a float."""
    return [float(fields[-1]) for fields in _read_csv_rows(path, n_fields)]


def extract_into_excel():
    '''
    Collect the benchmark result files of the current run into one workbook.

    The prefix is the name of the parent of the current working directory.
    The files ``<prefix>-*.txt`` are read from the current directory and the
    combined table is written to ``<prefix>.xlsx``.

    Requires pandas plus an Excel engine (openpyxl or xlsxwriter).
    If you are using python3 @ Ubuntu, then:
        apt install python3-pandas
        apt install python3-xlsxwriter

    Raises AssertionError when the files do not all hold the same number of
    records, and ValueError when a line has an unexpected number of fields.
    '''
    # Name of the parent directory of the cwd (portable form of
    # os.getcwd().split("/")[-2]).
    prefix = os.path.basename(os.path.dirname(os.getcwd()))

    # End-to-end FPS file: five configuration fields followed by the value.
    sparsity_list = []
    batch_size_list = []
    data_parallel_list = []
    model_parallel_list = []
    thread_num_list = []
    fifo_size_list = []
    end2end_fps_list = []
    for fields in _read_csv_rows(prefix + '-end2end_fps.txt', 6):
        sparsity, batch_size, data_parallel, model_parallel, thread_num, \
            end2end_fps = fields
        sparsity_list.append(float(sparsity))
        batch_size_list.append(int(batch_size))
        data_parallel_list.append(int(data_parallel))
        model_parallel_list.append(int(model_parallel))
        thread_num_list.append(int(thread_num))
        fifo_size_list.append(2)  # the fifo size is not recorded; always 2
        end2end_fps_list.append(float(end2end_fps))

    # Two-field files: the value is the second field.
    hardware_fps_list = _read_last_float(prefix + '-hardware_fps.txt', 2)
    total_exe_time_list = _read_last_float(prefix + '-total-exe-time.txt', 2)
    final_acc_list = _read_last_float(prefix + '-global-acc.txt', 2)

    # Six-field files: the value is the last field.
    prepare_input_time_list = _read_last_float(prefix + '-prepare_input.txt', 6)
    copyin_time_list = _read_last_float(prefix + '-copyin_time.txt', 6)
    execution_time_list = _read_last_float(prefix + '-execution_time.txt', 6)
    copyout_time_list = _read_last_float(prefix + '-copyout_time.txt', 6)
    post_process_time_list = _read_last_float(
        prefix + '-post_process_time.txt', 6)

    ordered_dict = collections.OrderedDict()
    ordered_dict['sparsity'] = sparsity_list
    ordered_dict['batch size'] = batch_size_list
    ordered_dict['data parallel'] = data_parallel_list
    ordered_dict['model parallel'] = model_parallel_list
    ordered_dict['thread num'] = thread_num_list
    ordered_dict['fifo size'] = fifo_size_list
    ordered_dict['End to end FPS'] = end2end_fps_list
    ordered_dict['Hardware FPS'] = hardware_fps_list
    ordered_dict['Total execution time(ms)'] = total_exe_time_list
    ordered_dict['Top-1 accuracy'] = final_acc_list
    ordered_dict['Prepare input time(ms)'] = prepare_input_time_list
    ordered_dict['Copyin time(ms)'] = copyin_time_list
    ordered_dict['Execution time(ms)'] = execution_time_list
    ordered_dict['Copyout time(ms)'] = copyout_time_list
    ordered_dict['Post process time(ms)'] = post_process_time_list

    # Every column must have the same number of records. Raised explicitly
    # (not via `assert`) so the check survives `python -O`; the exception
    # type is kept for existing callers.
    if len({len(column) for column in ordered_dict.values()}) > 1:
        raise AssertionError(" Error! Must have same records length!")

    df = pd.DataFrame(ordered_dict)
    excel_file_name = prefix + '.xlsx'
    # Context manager: saves and closes the workbook even on failure
    # (ExcelWriter.save() no longer exists in pandas >= 2.0).
    with ExcelWriter(excel_file_name) as writer:
        df.to_excel(writer, sheet_name='Sheet1', index=False)