def train_predict(all_df, features, prod_features, str_date, cv):
    # all_df: the combined data
    # features: variables used for training
    # prod_features: the 24 financial product variables
    # str_date: the date for which predictions are produced.
    #   For 2016-05-28 the date is part of the training data and the answers
    #   are known, so this means cross-validation.
    #   For 2016-06-28 this produces test-set predictions for upload to Kaggle.
    # cv: whether to run cross-validation

    # Use str_date to fix the date for which predictions are produced.
    test_date = date_to_int(str_date)
    # The training data is everything before test_date.
    train_df = all_df[all_df.int_date < test_date]
    # Split the test data out of the combined data.
    test_df = pd.DataFrame(all_df[all_df.int_date == test_date])

    # Extract only customers with new purchases as training data.
    X = []
    Y = []
    for i, prod in enumerate(products):
        prev = prod + '_prev1'
        # Store the new-purchase customers in prX.
        prX = train_df[(train_df[prod] == 1) & (train_df[prev] == 0)]
        # prY holds the label for the new purchase.
        prY = np.zeros(prX.shape[0], dtype=np.int8) + i
        X.append(prX)
        Y.append(prY)

    XY = pd.concat(X)
    y = np.hstack(Y)
    # XY contains only the new-purchase data.
    XY['y'] = y

    # Free the variables from memory.
    del train_df
    del all_df

    # Create a new key (ncodpers + fecha_dato) to compute per-row weights.
    XY['ncodepers_fecha_dato'] = XY['ncodpers'].astype(str) + XY['fecha_dato']
    uniqs, counts = np.unique(XY['ncodepers_fecha_dato'], return_counts=True)
    # Using the natural exponential, give rows with a high count a low weight.
    weights = np.exp(1 / counts - 1)

    # Attach the weights to the XY data.
    wdf = pd.DataFrame()
    wdf['ncodepers_fecha_dato'] = uniqs
    wdf['counts'] = counts
    wdf['weights'] = weights
    XY = XY.merge(wdf, on='ncodepers_fecha_dato')
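# A minimal sketch (synthetic counts, not from the original) of how the
# duplicate-count weighting in train_predict above behaves: a customer-date
# key seen once keeps weight exp(0) = 1.0, while heavily duplicated keys
# decay toward the floor exp(-1) ~= 0.368.
import numpy as np

counts = np.array([1, 2, 4, 8])
weights = np.exp(1 / counts - 1)
print(weights)  # [1.0, ~0.607, ~0.472, ~0.417]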
def __init__(self, leverage, fee, test_df, cols_features, hmm_model,
             long_states, random_states, short_states, **kwargs):
    super(HmmStrategy, self).__init__(
        leverage=leverage,
        fee=fee,
        test_df=test_df,
        cols_features=cols_features,
        hmm_model=hmm_model,
        long_states=long_states,
        random_states=random_states,
    )
    self.short_states = short_states
    self.ret = pd.DataFrame()
    self.signal_state_all = pd.DataFrame()
def f主机表(a设备):
    """Applies to: TP-Link WDR5620."""
    def fe主机():
        a设备.f切换模式(模式.C模式wdr5620.c设备管理)
        i = 0
        while True:
            va元素 = list(
                a设备.fe查找("//*[@id='eptMngList']/div[@class='eptConC']"))
            if i >= len(va元素):
                break
            v元素 = va元素[i]
            w管理 = v元素.f查找("div/div/input[1]")
            w管理.f点击()
            w详细 = a设备.f查找("//*[@id='eptMngDetail']")
            w名称 = w详细.f查找("div/p/span/pre")
            v名称 = w名称.fg文本()
            w标题 = w详细.f查找("div/span")
            v网络地址s, v物理地址s, v连接方式s = w标题.fg文本().split("|")
            v物理地址 = 地址.S物理地址.fc字符串(v物理地址s.strip()[4:])
            yield {数据表.E字段.e对端名称: v名称,
                   数据表.E字段.e对端物理地址: v物理地址}
            # end
            w主人网络 = a设备.f查找("//*[@id='linkedEpt_rsMenu']")
            w主人网络.f点击()
            i += 1
    return pandas.DataFrame(fe主机())
def read_file_to_pdf(file, file_type):
    try:
        pdf = file_type_read_functions[file_type](file)
    except Exception as e:
        logger.error(f'Error reading file to pandas: {str(e)}')
        return pd.DataFrame()
    return pdf
def pca_map(self, umap):
    if self.area.isEmpty():
        self.__city_and_province()
    else:
        if (self.area.name not in SuperMap.rep_areas) or umap.get(self.area.name):
            if umap.get(self.area.name):
                temp = umap.get(self.area.name)
            else:
                temp = SuperMap.area_city_mapper.get(self.area.name)
            if self.city.isEmpty() and self.city.precision == 1:
                if not self.area.isBelong(self.city.name) and umap.get(self.area.name) != self.city.name:
                    self.area.reset()
                    self.__city_and_province()
        else:
            SuperMap.rep_area_set.add(self.area.name)
            if self.city.isNotEmpty():
                self.__city_and_province()
    if self.city.name.isdigit():
        self.city.reset()
    return pd.DataFrame({'Province': [self.province.name],
                         'City': [self.city.name],
                         'Area': [self.area.name]})
def on_bar(self, bar):
    # This function is called on the streaming triggers; we handle our
    # state in here and submit trades based on it.
    self._bars = self._bars.append(
        pd.DataFrame(
            {
                'open': bar.open,
                'high': bar.high,
                'low': bar.low,
                'close': bar.close,
                'volume': bar.volume,
            },
            index=[bar.start]))
    bar_len = len(self._bars)
    self._logger.info(
        f'received bar. start = {bar.start}, close = {bar.close}, len = {bar_len}'
    )
    if bar_len < 21:
        return
    if self._outOfMarket():
        return
    if self._state == 'TO_BUY':
        if self._calc_buy_signal():
            self._submit_buy()
def plot_age_curve_params(fit, output_dir):
    df = pd.DataFrame(fit.extract(permuted=True)["beta_age_curve"],
                      columns=["Race_Num_Adj_%d" % i for i in range(3)])
    fig = sns.pairplot(df, vars=list(df.columns), diag_kind="kde",
                       plot_kws={"alpha": 0.1})
    fn = os.path.join(output_dir, "age_curve_args.png")
    print("writing < %s >" % fn)
    fig.savefig(fn, bbox_inches="tight")
def save_learner_multiple_run(names_list, master_dict, output_location):
    num = len(names_list)
    mean_v_df = pd.DataFrame(columns=['Iteration'])
    for i in range(num):
        run_stats_df = master_dict['run_stats_df']
        policy_df = master_dict['policy_df']
        key_metrics = pd.DataFrame(master_dict['key_metrics_dict'])
    return
def __init__(self, data, target, classifier, filename=None):
    if data is not None:
        self.data = data
    elif filename[-3:] == 'csv':
        self.data = pd.read_csv(filename)
    elif filename[-3:] == 'txt':
        # read the plain-text file into a DataFrame
        self.data = pd.read_table(filename)
    else:
        print("Data Invalid")
    self.target = target
    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        self.data, self.target, test_size=0.7, train_size=0.3)
    self.classifier = classifier
def _save_excel(self, new_good_list):
    if os.path.exists(GOODS_EXCEL_PATH):
        df = pd.read_excel(GOODS_EXCEL_PATH)
        df = df.append(new_good_list)
    else:
        df = pd.DataFrame(new_good_list)
    writer = pd.ExcelWriter(GOODS_EXCEL_PATH)
    df.to_excel(
        excel_writer=writer,
        columns=['title', 'price', 'location', 'sales', 'comment_url'],
        index=False,
        encoding='utf-8',
        sheet_name='Sheet')
    writer.save()
    writer.close()
def load_db():
    try:
        db = pd.read_pickle("DB/DB.pkl")
    except FileNotFoundError:
        db = pd.DataFrame()
    try:
        sdb = pd.read_pickle("./DB/item/items.pkl")
    except FileNotFoundError:
        sdb = pd.DataFrame()
    try:
        nlp_log = pd.read_pickle('nlp_log.pkl')
    except FileNotFoundError:
        nlp_log = pd.DataFrame()
    return db, sdb, nlp_log
def data_extraction(file_list, main_file):
    result = pd.read_csv('./' + main_file)
    df = pd.DataFrame()
    # file_list = [[name, type], ...]
    for f, site_type in file_list:
        if site_type == 'yogiyo':
            df = yogiyo_data_extraction(f)
        elif site_type == 'mangoplate':
            df = mangoplate_data_extraction(f)
        elif site_type == 'tripadvisor':
            df = tripadvisor_data_extraction(f)
        elif site_type == 'diningcode':
            df = diningcode_data_extraction(f)
        elif site_type == 'menupan':
            df = menupan_data_extraction(f)
        result = pd.concat([result, df])
    result.to_csv('total_review.csv')
    merge_same_restaurant('total_review.csv')
def read_data(self, path, flag):
    samples = pd.DataFrame(columns=['基站名称', '告警数量', '曾经退服', 'label'])
    delta_time = np.timedelta64(1, 'D')
    for csvs in tqdm(os.listdir(path)):
        # Handle each base station separately for now; consider later
        # whether they can be combined.
        data = pd.read_csv(os.path.join(path, csvs))
        n_rows = len(data)
        if n_rows <= 1:
            continue
        data['告警开始时间'] = pd.to_datetime(data['告警开始时间'],
                                        format='%Y-%m-%d %H:%M:%S')
        # Draw samples in proportion to the number of alarms.
        n_samples = int(n_rows * self.sample_rate)
        for n in range(n_samples):
            rand = random.randint(1, len(data) - 1)
            pre = rand - 1
            n_warnings = 0
            was_out_service = 0
            while pre >= 0 and (data['告警开始时间'][rand] -
                                data['告警开始时间'][pre] <= delta_time):
                if data['告警名称'][pre] in ['网元连接中断', '小区不可用告警']:
                    was_out_service = 1
                n_warnings += 1
                pre -= 1
            if data['告警名称'][rand] in ['网元连接中断', '小区不可用告警']:
                label = 1
            else:
                label = 0
            samples = samples.append(pd.DataFrame(
                [[data['基站名称'][rand], n_warnings, was_out_service, label]],
                columns=['基站名称', '告警数量', '曾经退服', 'label']),
                ignore_index=True)
    return samples
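# Aside (a hedged sketch with synthetic rows, not part of the original):
# several snippets here grow DataFrames row by row with DataFrame.append,
# which was removed in pandas 2.0 and is quadratic in the number of rows.
# The idiomatic replacement is to collect plain dicts in a list and build
# the frame once at the end.
import pandas as pd

rows = []
rows.append({'基站名称': 'site-A', '告警数量': 3, '曾经退服': 1, 'label': 0})
rows.append({'基站名称': 'site-B', '告警数量': 0, '曾经退服': 0, 'label': 1})
samples = pd.DataFrame(rows, columns=['基站名称', '告警数量', '曾经退服', 'label'])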
def _save_events(self):
    """ Write all events in event_db to calendar_db.csv. """
    out_data = []
    out_cols = ['NAME', 'DATE', 'TIME', 'DURATION', 'LOCATION', 'DESCRIPTION']
    for year in event_db.keys():
        for month in event_db[year].keys():
            for day in event_db[year + '.' + month].keys():
                for event in event_db[year + '.' + month + '.' + day]:
                    out_data.append([event.name, event.date, event.time,
                                     event.duration, event.location,
                                     event.description])
    out_df = pd.DataFrame(data=out_data, columns=out_cols)
    out_df.to_csv('calendar_db.csv')
    return
def aggregate(self):
    '''
    Returns a new DataFrame where equal symbols are grouped into one record,
    and mean/median/mode/total bid/ask price is provided, as well as the same
    statistics for volume (bid/ask size). IQR is reported as a tuple as well.
    Min(high), max(low), (max-min)/min, (open-close)/close, total outstanding.
    Should this basically be a .featurize() for all stocks?
    '''
    assert self.process
    aggr_df = pd.DataFrame()
    self.stocks = self.df['Symbol_Root'].unique()
    for stock in self.stocks:
        # Do aggregation
        stock_rows = self.df.loc[self.df['Symbol_Root'] == stock]
        # continue
    # Some columns may be irrelevant depending on type of file used
    return None
def random_stock_data(environ, asset_db_writer, minute_bar_writer,
                      daily_bar_writer, adjustment_writer, calendar,
                      start_session, end_session, cache, show_progress,
                      output_dir):
    # Get the list of files from the path, slicing off the extension:
    # 'example.csv'[:-4] = 'example'
    symbols = [f[:-4] for f in listdir(path)]
    if not symbols:
        raise ValueError("No symbols found in the folder")

    # Prepare an empty DataFrame for dividends
    divs = pd.DataFrame(columns=[
        'sid', 'amount', 'ex_date', 'record_date', 'declared_date', 'pay_date'
    ])
    # Prepare an empty DataFrame for splits
    splits = pd.DataFrame(columns=['sid', 'ratio', 'effective_date'])
    # Prepare an empty DataFrame for metadata
    metadata = pd.DataFrame(columns=[
        'start_date', 'end_date', 'auto_close_date', 'symbol', 'exchange'
    ])
    # Check valid trading dates, according to the selected exchange calendar
    sessions = calendar.sessions_in_range(start_session, end_session)
    # Get data for all stocks and write it to Zipline
    daily_bar_writer.write(process_stocks(symbols, sessions, metadata, divs))
    # Write the metadata
    asset_db_writer.write(equities=metadata)
    # Write splits and dividends
    adjustment_writer.write(splits=splits, dividends=divs)

"""Generator function to iterate stocks,
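# Hedged usage sketch (not from the original): an ingest function like
# random_stock_data above is typically registered as a Zipline bundle, e.g.
# in extension.py. The bundle name 'random_stock_data' is illustrative.
from zipline.data.bundles import register

register('random_stock_data', random_stock_data, calendar_name='NYSE')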
def test(model, test_loader, classnum=5, cvmodeoutput=False):
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    target_num = torch.zeros((1, classnum))
    predict_num = torch.zeros((1, classnum))
    acc_num = torch.zeros((1, classnum))
    for data, target in test_loader:
        with torch.no_grad():
            data, target = Variable(data), Variable(target)
            data = data.cuda()
            target = target.cuda()
            output = model(data)
            # accumulate the loss over the test set
            test_loss += F.nll_loss(output, target).data.item()
            # the argmax is the prediction
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).sum()
            _, predicted = torch.max(output.data, 1)
            pre_mask = torch.zeros(output.size()).scatter_(
                1, predicted.view(-1, 1), 1.)
            predict_num += pre_mask.sum(0)
            tar_mask = torch.zeros(output.size()).scatter_(
                1, target.data.view(-1, 1), 1.)
            target_num += tar_mask.sum(0)
            acc_mask = pre_mask * tar_mask
            acc_num += acc_mask.sum(0)
    if not cvmodeoutput:
        recall = acc_num / target_num
        precision = acc_num / predict_num
        F1 = 2 * recall * precision / (recall + precision)
        accuracy = acc_num.sum(1) / target_num.sum(1)
        recall = (recall.cpu().numpy()[0] * 100).round(3)
        precision = (precision.cpu().numpy()[0] * 100).round(3)
        F1 = (F1.cpu().numpy()[0] * 100).round(3)
        accuracy = (accuracy.cpu().numpy()[0] * 100).round(3)
        print('recall', " ".join('%s' % id for id in recall))
        print('precision', " ".join('%s' % id for id in precision))
        print('F1', " ".join('%s' % id for id in F1))
        print('accuracy', accuracy)
    test_loss /= len(test_loader.dataset)
    # the output looks like:
    # Test set: Average loss: 0.0163, Accuracy: 6698/10000 (67%)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.
          format(test_loss, correct, len(test_loader.dataset),
                 100. * correct / len(test_loader.dataset)))
    # record the correctly predicted samples (from the last batch)
    pred = pd.DataFrame(pred.cpu().numpy())
    target = pd.DataFrame(target.data.cpu().numpy())
    correctpred = pred.loc[pred[0] == target[0]]
    print(correctpred.count())
    return 100. * correct / len(test_loader.dataset)
import pandas as pd

a = range(10)
b = [item * 2 for item in a]
data = pd.DataFrame({"idx": a, "value": b})
data.to_csv("result.csv", sep=",")
df = pd.read_csv("../raw/quaterfinal_gy_cmp_training_traveltime.txt",
                 delimiter=";", dtype={"link_ID": object})

# Exploratory data analysis (EDA)
# Feature transformation
df["travel_time"] = np.log1p(df["travel_time"])


# Data smoothing
def quantile_clip(group):
    # group.plot()
    group[group < group.quantile(0.05)] = group.quantile(0.05)
    group[group > group.quantile(0.95)] = group.quantile(0.95)
    # group.plot()
    # plt.show()
    return group


df["travel_time"] = df.groupby(["link_ID", "date"])["travel_time"] \
    .transform(quantile_clip)

# Missing-value completion
date_range = pd.date_range("2016-07-01 00:00:00", "2016-07-31 00:00:00", freq="2min") \
    .append(pd.date_range("2017-04-01 00:00:00", "2017-07-31 00:00:00", freq="2min"))
new_index = pd.MultiIndex.from_product(
    [link_df["link_ID"].unique(), date_range],
    names=["link_ID", "time_interval_begin"])
df1 = pd.DataFrame(index=new_index).reset_index()
df3 = pd.merge(df1, df, on=["link_ID", "time_interval_begin"], how="left")
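# A minimal demonstration (synthetic data, not from the original) of the
# quantile-clipping transform above: within each group, values outside the
# 5th/95th percentiles are clamped to those percentiles.
import pandas as pd

demo = pd.DataFrame({"g": ["a"] * 5, "v": [1.0, 2.0, 3.0, 4.0, 100.0]})
clipped = demo.groupby("g")["v"].transform(
    lambda s: s.clip(s.quantile(0.05), s.quantile(0.95)))
print(clipped.tolist())  # the outlier 100.0 is pulled down to the 95th percentile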
import pandas as pd

hdt = pd.read_csv('DeloitteWeekly_v3_LJS_04182021.csv')
yolo_hdt = hdt[hdt['County'] == 'Yolo']
yolo_hdt
yolo_hdt['week'] = pd.PeriodIndex(yolo_hdt['ResultDate'], freq='W')
yolo_hdt.query('Result=="Detected"').groupby('week').sum()
yolo_hdt.query('Result=="Detected"').groupby('week').count()
yolo_hdt.query('Result=="Detected"').groupby('week').count()['Row']
pos = yolo_hdt.query('Result=="Detected"').groupby('week').count()['Row']
neg = yolo_hdt.query('Result=="Not Detected"').groupby('week').count()['Row']
pos
neg
pd.DataFrame({'pos result': pos, 'neg result': neg})
pd.DataFrame({'pos result': pos, 'neg result': neg})['2020-12-1':'2021-2-1']
pd.DataFrame({'pos result': pos, 'neg result': neg})['2020-12-1':'2021-2-1'].to_csv('dec_jan_hdt.csv')
cdph = pd.read_excel('CDPH_testing_data_4_24.xlsx')
cdph
cdph['lab_result_date']
yolo = cdph[cdph['county'].str.lower() == 'yolo']
yolo
yolo.query('lab_result_date == "2020-12-15"')
yolo['week'] = pd.PeriodIndex(yolo['lab_result_date'], freq='W')
yolo.groupby('week').sum()
def targetstock(means, data, targetstock, purdic):
    # Daily target stocks
    if means == '1':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] > 0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '2':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] > 0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '3':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] < 0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '4':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] > 0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '5':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] < 0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '6':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] > 0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '7':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] < 0) & (data.iloc[i, [7]][0] > 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '8':
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] < 0) & (data.iloc[i, [7]][0] < 0):
                targetstock = targetstock.append(data.iloc[i])
    if means == '9':  # 5 + 6
        tem = pd.DataFrame()
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] < 0) & (
                    data.iloc[i, [7]][0] < 0):  # 5
                tem = tem.append(data.iloc[i])
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] > 0) & (
                    data.iloc[i, [7]][0] < 0):  # 6
                targetstock = targetstock.append(data.iloc[i])
        tem = tem.sort_values(by=['dif'], ascending=False)
        for i in range(min(len(tem), 10)):
            targetstock = targetstock.append(tem.iloc[i])
    if means == '0':  # 4 + 6
        tem = pd.DataFrame()
        for i in range(len(data)):
            if (data.iloc[i, [4]][0] > 0) & (data.iloc[i, [6]][0] > 0) & (
                    data.iloc[i, [7]][0] < 0):  # 4
                tem = tem.append(data.iloc[i])
            if (data.iloc[i, [4]][0] < 0) & (data.iloc[i, [6]][0] > 0) & (
                    data.iloc[i, [7]][0] < 0):  # 6
                targetstock = targetstock.append(data.iloc[i])
        '''
        if len(targetstock) < 8:
            num = 8 - len(targetstock)
            if len(tem) != 0:
                tem = tem.sort_values(by='dif', ascending=True)
                for i in range(min(len(tem), num)):
                    targetstock = targetstock.append(tem.iloc[i])
        '''
        for i in range(len(tem)):
            targetstock = targetstock.append(tem.iloc[i])
    return targetstock
def train(self):
    #====================================== Training ===========================================#
    #===========================================================================================#
    unet_path = os.path.join(
        self.model_path, '%s-%d-%.4f-%d-%.4f.pkl' %
        (self.model_type, self.num_epochs, self.lr, self.num_epochs_decay,
         self.augmentation_prob))
    train_history_path = os.path.join(
        self.model_path, 'train-%s-%d-%.4f-%d-%.4f.csv' %
        (self.model_type, self.num_epochs, self.lr, self.num_epochs_decay,
         self.augmentation_prob))
    valid_history_path = os.path.join(
        self.model_path, 'valid-%s-%d-%.4f-%d-%.4f.csv' %
        (self.model_type, self.num_epochs, self.lr, self.num_epochs_decay,
         self.augmentation_prob))

    if os.path.isfile(unet_path):
        self.unet.load_state_dict(torch.load(unet_path))
        print('%s is Successfully Loaded from %s' % (self.model_type, unet_path))
    else:
        lr = self.lr
        best_unet_score = 0.
        train_history = []
        valid_history = []

        for epoch in range(self.num_epochs):
            self.unet.train(True)
            epoch_loss = 0
            acc = 0.   # Accuracy
            SE = 0.    # Sensitivity (Recall)
            SP = 0.    # Specificity
            PC = 0.    # Precision
            F1 = 0.    # F1 Score
            JS = 0.    # Jaccard Similarity
            DC = 0.    # Dice Coefficient
            length = 0

            for i, (images, GT) in enumerate(self.train_loader):
                images = images.to(self.device)
                GT = GT.to(self.device)

                SR = self.unet(images)
                SR_probs = F.sigmoid(SR)
                SR_flat = SR_probs.view(SR_probs.size(0), -1)
                GT_flat = GT.view(GT.size(0), -1)
                loss = self.criterion(SR_flat, GT_flat)
                epoch_loss += loss.item()

                self.reset_grad()
                loss.backward()
                self.optimizer.step()

                acc += get_accuracy(SR, GT)
                SE += get_sensitivity(SR, GT)
                SP += get_specificity(SR, GT)
                PC += get_precision(SR, GT)
                F1 += get_F1(SR, GT)
                JS += get_JS(SR, GT)
                DC += get_DC(SR, GT)
                length += images.size(0)

            acc = acc / length
            SE = SE / length
            SP = SP / length
            PC = PC / length
            F1 = F1 / length
            JS = JS / length
            DC = DC / length

            print('Epoch [%d/%d], Loss: %.4f, \n[Training] Acc: %.4f, SE: %.4f, SP: %.4f, PC: %.4f, F1: %.4f, JS: %.4f, DC: %.4f' % (
                epoch + 1, self.num_epochs, epoch_loss,
                acc, SE, SP, PC, F1, JS, DC))
            train_history.append([acc, SE, SP, PC, F1, JS, DC])

            if (epoch + 1) > (self.num_epochs - self.num_epochs_decay):
                lr -= (self.lr / float(self.num_epochs_decay))
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
                print('Decay learning rate to lr: {}.'.format(lr))

            #===================================== Validation ====================================#
            self.unet.train(False)
            self.unet.eval()

            acc = 0.   # Accuracy
            SE = 0.    # Sensitivity (Recall)
            SP = 0.    # Specificity
            PC = 0.    # Precision
            F1 = 0.    # F1 Score
            JS = 0.    # Jaccard Similarity
            DC = 0.    # Dice Coefficient
            length = 0

            for i, (images, GT) in enumerate(self.valid_loader):
                images = images.to(self.device)
                GT = GT.to(self.device)
                SR = F.sigmoid(self.unet(images))
                acc += get_accuracy(SR, GT)
                SE += get_sensitivity(SR, GT)
                SP += get_specificity(SR, GT)
                PC += get_precision(SR, GT)
                F1 += get_F1(SR, GT)
                JS += get_JS(SR, GT)
                DC += get_DC(SR, GT)
                length += images.size(0)

            acc = acc / length
            SE = SE / length
            SP = SP / length
            PC = PC / length
            F1 = F1 / length
            JS = JS / length
            DC = DC / length
            unet_score = JS + DC

            print('[Validation] Acc: %.4f, SE: %.4f, SP: %.4f, PC: %.4f, F1: %.4f, JS: %.4f, DC: %.4f' %
                  (acc, SE, SP, PC, F1, JS, DC))
            valid_history.append([acc, SE, SP, PC, F1, JS, DC])

            '''
            torchvision.utils.save_image(images.data.cpu(), os.path.join(self.result_path, '%s_valid_%d_image.png'%(self.model_type,epoch+1)))
            torchvision.utils.save_image(SR.data.cpu(), os.path.join(self.result_path, '%s_valid_%d_SR.png'%(self.model_type,epoch+1)))
            torchvision.utils.save_image(GT.data.cpu(), os.path.join(self.result_path, '%s_valid_%d_GT.png'%(self.model_type,epoch+1)))
            '''

            if unet_score > best_unet_score:
                best_unet_score = unet_score
                best_epoch = epoch
                best_unet = self.unet.state_dict()
                print('Best %s model score : %.4f' % (self.model_type, best_unet_score))
                torch.save(best_unet, unet_path)

        train_history = pd.DataFrame(
            train_history, columns=['acc', 'SE', 'SP', 'PC', 'F1', 'JS', 'DC'])
        valid_history = pd.DataFrame(
            valid_history, columns=['acc', 'SE', 'SP', 'PC', 'F1', 'JS', 'DC'])
        train_history.to_csv(train_history_path)
        valid_history.to_csv(valid_history_path)

        #===================================== Test ====================================#
        del self.unet
        del best_unet
        self.build_model()
        self.unet.load_state_dict(torch.load(unet_path))
        self.unet.train(False)
        self.unet.eval()

        acc = 0.   # Accuracy
        SE = 0.    # Sensitivity (Recall)
        SP = 0.    # Specificity
        PC = 0.    # Precision
        F1 = 0.    # F1 Score
        JS = 0.    # Jaccard Similarity
        DC = 0.    # Dice Coefficient
        length = 0

        for i, (images, GT) in enumerate(self.valid_loader):
            images = images.to(self.device)
            GT = GT.to(self.device)
            SR = F.sigmoid(self.unet(images))
            acc += get_accuracy(SR, GT)
            SE += get_sensitivity(SR, GT)
            SP += get_specificity(SR, GT)
            PC += get_precision(SR, GT)
            F1 += get_F1(SR, GT)
            JS += get_JS(SR, GT)
            DC += get_DC(SR, GT)
            length += images.size(0)

        acc = acc / length
        SE = SE / length
        SP = SP / length
        PC = PC / length
        F1 = F1 / length
        JS = JS / length
        DC = DC / length
        unet_score = JS + DC

        f = open(os.path.join(self.result_path, 'result.csv'), 'a',
                 encoding='utf-8', newline='')
        wr = csv.writer(f)
        wr.writerow([
            self.model_type, acc, SE, SP, PC, F1, JS, DC, self.lr, best_epoch,
            self.num_epochs, self.num_epochs_decay, self.augmentation_prob
        ])
        f.close()
def __init__(self):
    # Do not shadow the pandas alias with the frame; column lists must be
    # the same length.
    df = pandas.DataFrame({'id': [1, 2], 'name': ['jack', 'nancy']})
    df.to_excel('report.xlsx')
_sorted.iloc[0]   # {'a': 3, 'b': 5} (top item)
_sorted.index[0]  # 2 (index of top item)

# filter by content (the filter function must return a scalar bool per group)
df = pd.DataFrame({'foo': [1,2,3,4,5,6], 'bar': [9,8,7,6,5,4]})
grouped = df.groupby('foo')
grouped.filter(lambda g: (g['foo'] > 3).all())

# filter by labels (not on contents)
df = pd.DataFrame([[1,2,3], [4,5,6]], index=['mouse','rabbit'],
                  columns=['one','two','three'])
df.filter(items=['one', 'three'])
df.filter(regex='e$', axis=1)
df.filter(like='bbi', axis=0)

# drop
df = pd.DataFrame([ [1,2], [3,4], [5,6], [7,8] ])
df = df.drop([2,3])
df  # [ [1,2], [3,4] ]
df = pd.DataFrame({'a': [1,2,3,4], 'b': [5,6,7,8], 'c': [9,10,11,12]})
df = df.drop(columns='c')
df  # { 'a': [1,2,3,4], 'b': [5,6,7,8] }

# to csv
pd.DataFrame([ [1,2], [3,4], [5,6] ]).to_csv(index=False, header=None)  # '1,2\r\n3,4\r\n5,6\r\n'
pd.DataFrame([ [1,2], [3,4], [5,6] ]).to_csv('myfile.csv', index=False, header=None)
pd.DataFrame([ [1,2], [3,4], [5,6] ]).to_csv('myfile.txt', sep='\t', index=False)

# count of csv rows
df = pd.DataFrame([ [1,2], [3,4], [5,6] ])
df.shape[0]  # 3
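# a small addition in the same cheat-sheet style (assumed round trip, not in
# the original): read back the headerless CSV written above
df = pd.read_csv('myfile.csv', header=None)
df.shape[0]  # 3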
ulli_total = []
company_temp = []
company = []
num = []
data = []

for c in range(0, len(li_code) - 1):
    ulli_total.append(li_code[c].get_text())
# print(ulli_total)

for a in range(0, len(ulli_total) - 1):
    company_temp.append(ulli_total[a].split('), '))
    company.append(company_temp[a][0] + ')')
    num.append(company_temp[a][1])
    data.append([company[a], num[a]])
print(data)

df = pd.DataFrame(data, columns=["company", "양"])
df.to_csv('crawling.csv', encoding="utf-8")

# continent =[]
# country=[]
# print(headline_array)
# num_headline=0
# while(1):
#     if headline_array[num_headline].get_text()=='Africa':
#         num_country=0
#         num_country=num_headline+1
#         while(1):
#             if headline_array[num_country].get_text()=='Algeria':
#                 continent.append(headline_array[num_headline].get_text())
#                 country.append(headline_array[num_country].get_text())
("enc", OrdinalEncoder(handle_unknown = ‘ignore’)) ]) categorical_pipeline = Pipeline([ ("imp", SimpleImputer(strategy= "most_frequent")), ("enc", OneHotEncoder(sparse=True, handle_unknown = ‘ignore’)) ]) pre_pipe = ColumnTransformer([ ("cat_pre", categorical_pipeline, categorical_features), ("ord_pre", ordinal_pipeline, ordinal_features), ("num_pre", numerical_pipeline, numerical_features) ]) model = sklearn. full_pipe = Pipeline([ ("pre", pre_pipe), ("model", model) ]) full_pipe.fit(train_x, train_y) score = full_pipe.score(test_x, test_y) y_hat = full_pipe.fit(test_x) submission = pd.DateFrame() submission.loc[:, "Id"] = df_test.loc[:, "Id"] submission.loc[:, "SalePrice"] = y_hat submission.write_csv("submission.csv")
# In[24]:

import numpy as np

a1 = pd.Series([1, 2, 3, 4, 5, np.nan])
print(a1)

# In[14]:

dates = pd.date_range("20190601", periods=5)
print(dates)

# In[15]:

a2 = pd.DataFrame(np.random.randn(5, 4), index=dates, columns=list('ABCD'))

# In[23]:

a2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20190601'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': "foo"
})
print(a2)

# In[27]:
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The data has four comma-separated fields: creation time, course name,
# number of learners, and study time; read it with pandas into a DataFrame.
courses = pd.read_table('courses.txt', sep=',', header=0)

# Use to_datetime to extract the creation time as the index of a new frame
# built from the old data; this leaves a redundant creation-time column,
# so drop it.
i = pd.to_datetime(courses['创建时间'])
courses_ts = pd.DataFrame(data=courses.values, columns=courses.columns, index=i)
courses_ts = courses_ts.drop('创建时间', axis=1)

# Downsample the data to weekly frequency and sum.
courses_ts_W = courses_ts.resample('W').sum()


# Plot with matplotlib.
def mat_figure():
    plt.plot_date(courses_ts_W.index, courses_ts_W['学习时间'], '-')
    plt.xlabel('Time Series')
    plt.ylabel('Study Time')
    plt.show()


# The matplotlib figure is noisy and does not show the trend clearly,
# so plot with seaborn instead.
def sea_figure():
    # Add an ordinal column to make the scatter plot easier to draw.
    courses_ts_W['id'] = range(0, len(courses_ts_W.index.values))
    # seaborn's regplot: pass x and y, the data source, the scatter
    # parameters, a 5th-order fit, and disable the confidence interval.
    sns.regplot('id', '学习时间', data=courses_ts_W,
                scatter_kws={'s': 10}, order=5, ci=None, truncate=True)
print('# ...GENERATING TIMESERIES ON THE DATASET... #')
shops_li = df.Shop_id.unique()
items_li = df.Item_id.unique()
# cols = ['Item_Category_Name', 'Item_Price', 'Item_Cnt_Day']

with open('data/unique_months.json') as json_file:
    unique_months_dict = json.load(json_file)

raw_data_final = []
with tqdm(total=12) as pbar_files:
    for element in range(1, 13):
        train_final = []
        with tqdm(total=len(shops_li)) as pbar_shops:
            for shop in shops_li:
                with tqdm(total=len(items_li)) as pbar_items:
                    for item in items_li:
                        key = '{};{}'.format(item, shop)
                        if key in unique_months_dict:
                            train_final = getTimeSeriesDataSet(
                                df, shop, item, element, train_final)
                        pbar_items.update(1)
                pbar_shops.update(1)
        pbar_files.update(1)
        print('# ...WRITING FILE OF SERIE {}... # '.format(element))
        df_final = pd.DataFrame(train_final)
        df_final.to_csv('train_final_{}_series.csv'.format(element),
                        sep=';', index=False)
def write_out_genome_coverage(ncbi_genomes_totals, genomic_accession_dict,
                              time_stamp, args):
    """Write out the coverage of the NCBI GenBank database by the local CAZyme database.

    :param ncbi_genomes_totals: dict {kingdom: number of NCBI GenBank genomes}
    :param genomic_accession_dict: dict
        {kingdom: {genus: {species: {accession: {proteins: set(), counts: int}}}}}
    :param time_stamp: str, date and time the script was invoked
    :param args: cmd-line args parser

    Return nothing
    """
    column_names = ['Kingdom', 'NCBI_genomes', 'CAZy_genomes', 'Coverage_percent']
    coverage_df = pd.DataFrame(columns=column_names)

    graph_columns = ['Kingdom', 'NCBI', 'CAZy']
    graph_df = pd.DataFrame(columns=graph_columns)

    for kingdom in KINGDOMS:
        ncbi = ncbi_genomes_totals[kingdom]
        cazy = 0
        genera = genomic_accession_dict[kingdom]
        for genus in genera:
            organisms = genera[genus]
            for species in organisms:
                species_genome_accessions = len(list(organisms[species].keys()))
                cazy += species_genome_accessions

        coverage = (cazy / ncbi) * 100
        row_data = [kingdom, ncbi, cazy, coverage]
        new_row = pd.DataFrame([row_data], columns=column_names)
        coverage_df = coverage_df.append(new_row)

        row_data = [kingdom, int(ncbi), int(cazy)]
        new_row = pd.DataFrame([row_data], columns=graph_columns)
        graph_df = graph_df.append(new_row)

    output_path = args.output_dir / f"cazy_genbank_genome_coverage_{time_stamp}.csv"
    coverage_df.to_csv(output_path)

    fig, ax = plt.subplots()

    # plot the CAZy bars
    ax.bar(
        graph_df['Kingdom'],
        graph_df['CAZy'],
        label='CAZy',
        color='orange',
    )
    # stack the NCBI bars on top of the CAZy bars
    ax.bar(
        graph_df['Kingdom'],
        graph_df['NCBI'],
        bottom=graph_df['CAZy'],
        label='NCBI',
        color='dodgerblue',
    )

    ax.set_xlabel('Kingdom')
    ax.set_ylabel('Number of genomes in the database')
    ax.set_title('GenBank genomes included in CAZy')
    ax.legend()

    output_path = args.output_dir / f"gbk_cazy_genomes_plot_{time_stamp}.png"
    fig.savefig(output_path, bbox_inches='tight', dpi=360)

    return