def metric_by_test_size(df: pd.DataFrame, test_sizes=range(1, 10), train_size: int = 5,
                        date=datetime.datetime.now().strftime("%m%Y")):
    """
    Fix the train size and sweep over test sizes.
    :param df: pd.DataFrame
    :param test_sizes: iterable of test sizes
    :param train_size: 5 -> 5 days (one week, no weekend)
    :param date: research date (note: the default is evaluated once, at import time)
    """
    # split dataset into train/test windows, one per test size
    time_series = df["date"].drop_duplicates().tolist()
    train_test = map(
        lambda test_size: split_time_series(
            str_d=time_series[0],
            mid_d=time_series[train_size],
            end_d=time_series[train_size + test_size],
            df=df),
        test_sizes)

    # scoring
    param = S3Manager(bucket_name="production-bobsim").load_dump(
        key="food_material_price_predict_model/research/tuned_params.pkl")
    scores = list(map(lambda x: scoring(x, param=param), train_test))
    ser = pd.Series(scores, index=test_sizes)

    # plot
    series_plot(ser=ser, kind="bar", x_label="test size", y_label="customized RMSE",
                title="train_size: {}".format(train_size))

    # save
    S3Manager(bucket_name="production-bobsim").save_plt_to_png(
        key="food_material_price_predict_model/research/{date}/image/metric_by_test_size_train{train}.png".format(
            date=date, train=train_size))
def metric_by_other_term(df: pd.DataFrame, train_size: int, test_size: int, n_days=range(10),
                         date=datetime.datetime.now().strftime("%m%Y")):
    """
    Measure the metric while shifting the train/test window forward day by day.
    :param df: dataset
    :param train_size: length of the train window
    :param test_size: length of the test window
    :param n_days: how many days to shift the window forward
    :param date: research date
    """
    # split dataset into train/test windows, one per day offset
    time_series = df["date"].drop_duplicates().tolist()
    train_test = map(
        lambda x: split_time_series(
            str_d=time_series[x],
            mid_d=time_series[x + train_size],
            end_d=time_series[x + train_size + test_size],
            df=df),
        n_days)

    # scoring
    param = S3Manager(bucket_name="production-bobsim").load_dump(
        key="food_material_price_predict_model/research/tuned_params.pkl")
    scores = list(map(lambda x: scoring(x, param=param), train_test))
    ser = pd.Series(scores, index=n_days)

    # plot
    series_plot(ser=ser, kind="bar", x_label="day", y_label="customized RMSE",
                title="train, test size: {}, {}".format(train_size, test_size), d=0.3)

    # save
    S3Manager(bucket_name="production-bobsim").save_plt_to_png(
        key="food_material_price_predict_model/research/{date}/image/metric_by_other_term_train{train}/test{test}.png".format(
            date=date, train=train_size, test=test_size))
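# --- usage sketch (illustrative, not part of the source) ---
# Drives the two research helpers above. build_origin_price is the loader used
# elsewhere in this repo; the window sizes chosen here are examples only.
def _research_example():
    df, _ = build_origin_price(bucket_name="production-bobsim", date="201908")

    # hold the train window at 5 days and sweep test sizes 1..9
    metric_by_test_size(df=df, test_sizes=range(1, 10), train_size=5)

    # hold both window sizes and slide the whole window forward day by day
    metric_by_other_term(df=df, train_size=5, test_size=2, n_days=range(10))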
def tuned_process(self, dataset):
    """
    Tuned ElasticNet for production.
    :param dataset: merged 3 datasets (raw material price, terrestrial weather, marine weather)
    :return: metric (customized RMSE)
    """
    train_x, train_y, test_x, test_y = dataset

    # init model & fit
    model = ElasticNetModel(
        bucket_name=self.bucket_name,
        x_train=train_x, y_train=train_y,
        params=S3Manager(bucket_name=self.bucket_name).load_dump(
            key="food_material_price_predict_model/research/tuned_params.pkl"))
    model.fit()

    # adjust intercept upward for a conservative prediction
    model.model.intercept_ = model.model.intercept_ + 150

    # predict & metric
    pred_y = model.predict(X=test_x)
    # r_test, r_pred = inverse_price(test_y), inverse_price(pred_y)
    metric = model.estimate_metric(scorer=customized_rmse, y=test_y, predictions=pred_y)

    # save
    # TODO: self.now -> date set term, e.g. 010420 - 120420
    model.save(prefix="food_material_price_predict_model/{term}".format(term=self.term))
    return metric
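# The +150 intercept shift above raises every prediction by a constant, since an
# ElasticNet prediction is X @ coef_ + intercept_. A minimal, self-contained
# illustration with plain scikit-learn (synthetic data, not the project pipeline):
import numpy as np
from sklearn.linear_model import ElasticNet

X = np.random.rand(50, 3)
y = X @ np.array([1.0, 2.0, 3.0]) + 10.0

m = ElasticNet(alpha=0.01).fit(X, y)
base = m.predict(X)

m.intercept_ += 150                            # same adjustment as tuned_process
assert np.allclose(m.predict(X), base + 150)   # every prediction moves up by exactly 150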
def main():
    """
    Standalone process to load and print crawled recipe JSON objects from AWS S3.
    """
    manager = S3Manager(bucket_name="production-bobsim")
    objs = manager.fetch_objects(key="crawled_recipe", conversion_type="json")
    print(objs)
def __init__(self, x_train, y_train, bucket_name, grid_params=None, score=mean_squared_error):
    if grid_params is None:
        grid_params = {
            "max_iter": [1, 5, 10],
            "alpha": [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
            "l1_ratio": np.arange(0.0, 1.0, 0.1)
        }

    self.x_train = x_train
    self.y_train = y_train
    self.scorer = score
    self.error = None  # pd.Series
    self.metric = None

    # s3
    self.s3_manager = S3Manager(bucket_name=bucket_name)

    # logger
    self.logger = init_logger()

    super().__init__(
        estimator=ElasticNet(),
        param_grid=grid_params,
        scoring=make_scorer(self.scorer, greater_is_better=False),
        # TimeSeriesSplit keeps each validation fold strictly after its training
        # fold, preserving temporal order; n_splits=2 is enough here
        cv=TimeSeriesSplit(n_splits=2).split(self.x_train)
    )
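# --- usage sketch (illustrative) ---
# The enclosing class is a GridSearchCV subclass; "ElasticNetSearcher" is a
# hypothetical stand-in for its real name, and train_x/train_y are placeholders.
searcher = ElasticNetSearcher(x_train=train_x, y_train=train_y,
                              bucket_name="production-bobsim")
searcher.fit(searcher.x_train, searcher.y_train)  # inherited GridSearchCV API
print(searcher.best_params_)                      # tuned ElasticNet hyperparameters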
def load_from_s3(bucket_name, key):
    # the with-block closes the temp file; no explicit close() needed
    with tempfile.TemporaryFile() as fp:
        S3Manager(bucket_name=bucket_name).s3_bucket.download_fileobj(
            Fileobj=fp, Key=key)
        fp.seek(0)
        transformer = load(fp)
    return transformer
def load(self):
    """
    Fetch the DataFrame list and return the first element.
    :return: pd.DataFrame
    """
    manager = S3Manager(bucket_name=self.bucket_name)
    df = manager.fetch_df_from_csv(key=self.load_key)

    # TODO: do not use an index to get the first element.
    return df[0]
def load(self):
    """
    Init S3Manager instance and fetch objects.
    :return: list of pd.DataFrame (origin)
    """
    manager = S3Manager(bucket_name=self.bucket_name)
    df_list = manager.fetch_df_from_csv(key=self.s3_key)

    self.logger.info("{num} files are loaded".format(num=len(df_list)))
    self.logger.info("load df from origin bucket")
    return df_list
def load(self):
    """
    Fetch the DataFrame, select and cast the configured columns, and translate the column names.
    :return: pd.DataFrame
    """
    manager = S3Manager(bucket_name=self.bucket_name)
    df = manager.fetch_df_from_csv(key=self.load_key)

    # TODO: do not use an index to get the first element.
    # filter by column, cast types, and translate column names
    return df[0][list(self.dtypes.keys())].astype(dtype=self.dtypes).rename(
        columns=self.translate, inplace=False)
def load(filename="2014-2020"):
    """
    Fetch the terrestrial-weather DataFrame list and return the first element.
    :return: pd.DataFrame
    """
    manager = S3Manager(bucket_name="production-bobsim")
    df = manager.fetch_df_from_csv(
        key="public_data/open_data_terrestrial_weather/origin/csv/{filename}.csv".format(
            filename=filename))

    # TODO: do not use an index to get the first element.
    return df[0]
def call_price(self):
    def func(x: int):
        a = {
            'EXAMIN_DE': "20200504",  # self.today
            "&EXAMIN_PRDLST_CODE": x
        }

        # build the query string by plain concatenation; the '&' separator is
        # embedded in the second key on purpose
        args_str = ""
        for k, v in a.items():
            args_str += '%s=%s' % (k, v)

        res = requests.get(
            'http://211.237.50.150:7080/openapi/7c785c42110451cba1eeb8b572111a4c48b98cba8d49c92fdb801607727df47c/json/Grid_20151128000000000315_1/1/5?{arg}'.format(
                arg=args_str))
        data = res.json()
        print(data)

        # if len(data['Grid_20151128000000000315_1']['row']) != 0:
        items = data["Grid_20151128000000000315_1"]["row"]
        return pd.DataFrame(items)

    df_list = list(map(func, self.code_list))
    print(df_list)

    def concat(x, y):
        # skip empty frames so reduce never concatenates onto an empty base
        if x.empty:
            return y
        elif y.empty:
            return x
        else:
            return pd.concat([x, y])

    full_df = reduce(concat, df_list)
    full_df.drop(["ROW_NUM"], axis=1, inplace=True)
    print(full_df)

    load_key = "public_data/open_data_raw_material_price/origin/csv/{filename}.csv".format(
        filename="20200504")  # self.today
    manager = S3Manager(bucket_name="production-bobsim")
    manager.save_df_to_csv(df=full_df, key=load_key)
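# The hand-rolled query string above embeds the '&' separator inside a dict key,
# which works but is brittle. A sketch of the same request built with
# urllib.parse.urlencode (endpoint and field names are taken from the code above;
# the helper name is illustrative):
from urllib.parse import urlencode
import requests

PRICE_API = ("http://211.237.50.150:7080/openapi/"
             "7c785c42110451cba1eeb8b572111a4c48b98cba8d49c92fdb801607727df47c/"
             "json/Grid_20151128000000000315_1/1/5")

def fetch_rows(code: int, date: str = "20200504"):
    # urlencode inserts the '&' separators and percent-escapes values for us
    query = urlencode({"EXAMIN_DE": date, "EXAMIN_PRDLST_CODE": code})
    res = requests.get("{base}?{query}".format(base=PRICE_API, query=query))
    return res.json()["Grid_20151128000000000315_1"]["row"]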
def __init__(self, bucket_name: str, x_train, y_train, params=None):
    # logger
    self.logger = init_logger()

    # s3
    self.s3_manager = S3Manager(bucket_name=bucket_name)

    if params is None:
        self.model = ElasticNet()
    else:
        self.model = ElasticNet(**params)

    self.x_train, self.y_train = x_train, y_train
    self.error = None
    self.metric = None
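# The **params unpacking above expects the dumped dict's keys to match
# ElasticNet's constructor arguments. A minimal sketch of that shape
# (values are illustrative, not the actual tuned ones):
from sklearn.linear_model import ElasticNet

params = {"alpha": 0.01, "l1_ratio": 0.5, "max_iter": 10}
model = ElasticNet(**params)  # same as ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=10)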
def __init__(self, base_url, bucket_name, key, head=False):
    self.logger = init_logger()

    self.bucket_name = bucket_name
    self.s3_manager = S3Manager(bucket_name=self.bucket_name)
    self.prefix = key

    self.chrome_path = "C:/chromedriver"
    options = webdriver.ChromeOptions()
    if not head:
        options.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=self.chrome_path, chrome_options=options)

    self.base_url = base_url
def main():
    """
    Save the list of non-sparse item names (std_list).
    :return: exit code
    """
    # get standard list
    bucket_name = "production-bobsim"
    df, key = build_origin_price(bucket_name=bucket_name, date="201908")
    std_list = get_std_list(column=df["standard_item_name"], number=48)
    # print(std_list)

    # save standard list
    s3_manager = S3Manager(bucket_name=bucket_name)
    s3_manager.save_dump(
        x=std_list,
        key="food_material_price_predict_model/constants/std_list.pkl")
    return 0
def __init__(self, bucket_name: str, date: str):
    self.logger = init_logger()
    self.date = date

    # s3
    # TODO: bucket_name -> parameterized
    self.s3_manager = S3Manager(bucket_name=bucket_name)
    self.load_key = "public_data/open_data_raw_material_price/origin/csv/{filename}.csv".format(
        filename=self.date)
    self.save_key = "public_data/open_data_raw_material_price/process/csv/{filename}.csv".format(
        filename=self.date)

    self.dtypes = dtype["raw_material_price"]
    self.translate = translation["raw_material_price"]

    # load filtered df
    self.input_df = self.load()
def get_recipes(prefix, source):
    data = S3Manager("production-bobsim").fetch_dict_from_json(
        key="crawled_{p}/{s}".format(p=prefix, s=source))
    if data is None:
        return 'there is no data'
    return jsonify(data)
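# --- wiring sketch (illustrative) ---
# jsonify implies Flask; the route pattern below is an assumption, since the
# source does not show how get_recipes is registered.
from flask import Flask

app = Flask(__name__)

@app.route("/recipes/<prefix>/<source>")
def recipes(prefix, source):
    return get_recipes(prefix=prefix, source=source)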
def save_to_s3(transformer, bucket_name, key):
    # the with-block closes the temp file; no explicit close() needed
    with tempfile.TemporaryFile() as fp:
        dump(transformer, fp)
        fp.seek(0)
        S3Manager(bucket_name=bucket_name).save_object(body=fp.read(), key=key)
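# save_to_s3 and load_from_s3 round-trip a fitted transformer through S3. A usage
# sketch (the dump/load backend, joblib or pickle, is not shown in the source and
# is an assumption here, as is the key):
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(np.random.rand(10, 3))
save_to_s3(scaler, bucket_name="production-bobsim", key="transformers/scaler.pkl")
restored = load_from_s3(bucket_name="production-bobsim", key="transformers/scaler.pkl")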
def inverse_price(self, price):
    manager = S3Manager(bucket_name=self.bucket_name)
    mean, std = manager.load_dump(
        key="food_material_price_predict_model/price_(mean,std)_{date}.pkl".format(date=self.date))
    return price * std + mean
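# inverse_price undoes the standardization z = (x - mean) / std. Worked example
# (numbers illustrative): with mean = 1000 and std = 200, a standardized
# prediction of 0.5 maps back to 0.5 * 200 + 1000 = 1100.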
def save_coef(self, bucket_name, key):
    S3Manager(bucket_name=bucket_name).save_df_to_csv(df=self.coef_df, key=key)
def save_model(self, bucket_name, key):
    manager = S3Manager(bucket_name=bucket_name)
    manager.save_dump(self.model, key=key)
def save(self, df: pd.DataFrame):
    manager = S3Manager(bucket_name=self.bucket_name)
    manager.save_df_to_csv(df=df, key=self.save_key)