def main():
    opt = parse_args()

    if opt.shuffle > 0:
        raise AssertionError("-shuffle is not implemented, please make sure \
                             you shuffle your data before pre-processing.")

    init_logger(opt.log_file)
    logger.info("Input args: %r", opt)
    logger.info("Extracting features...")

    logger.info("Building `Fields` object...")
    fields = get_fields()
    task1_fields = get_task_fields()
    task2_fields = get_task2_fields()

    logger.info("Building & saving task training data...")
    train_dataset_files = build_save_dataset('train', 'task', fields, opt)

    logger.info("Building & saving task2 training data...")
    train_dataset_files2 = build_save_dataset('train', 'task2', fields, opt)

    logger.info("Building & saving task validation data...")
    build_save_dataset('valid', 'task', fields, opt)

    logger.info("Building & saving task2 validation data...")
    build_save_dataset('valid', 'task2', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files + train_dataset_files2, fields, opt)

def main(opt, device_id):
    opt = training_opt_postprocessing(opt, device_id)
    init_logger(opt.log_file)

    # Load checkpoint if we resume from a previous training.
    if opt.train_from:
        logger.info('Loading checkpoint from %s' % opt.train_from)
        checkpoint = torch.load(opt.train_from,
                                map_location=lambda storage, loc: storage)

        # Load default opt values, then overwrite them with the opts from
        # the checkpoint. This is useful when re-training a model after
        # adding a new option (not set in the checkpoint).
        dummy_parser = configargparse.ArgumentParser()
        opts.model_opts(dummy_parser)
        default_opt = dummy_parser.parse_known_args([])[0]

        model_opt = default_opt
        model_opt.__dict__.update(checkpoint['opt'].__dict__)
    else:
        checkpoint = None
        model_opt = opt

    # Load fields generated during the preprocess phase.
    fields = load_fields(opt, checkpoint)

    # Build model.
    model = build_model(model_opt, opt, fields, checkpoint)
    n_params, enc, dec = _tally_parameters(model)
    logger.info('encoder: %d' % enc)
    logger.info('decoder: %d' % dec)
    logger.info('* number of parameters: %d' % n_params)
    _check_save_model_path(opt)

    # Build optimizer.
    optim = build_optim(model, opt, checkpoint)

    # Build model saver.
    model_saver = build_model_saver(model_opt, opt, model, fields, optim)

    trainer = build_trainer(opt, device_id, model, fields, optim,
                            model_saver=model_saver)

    def train_iter_fct():
        return build_dataset_iter(load_dataset("train", opt), fields, opt)

    def valid_iter_fct():
        return build_dataset_iter(load_dataset("valid", opt), fields, opt,
                                  is_train=False)

    # Do training.
    if len(opt.gpu_ranks):
        logger.info('Starting training on GPU: %s' % opt.gpu_ranks)
    else:
        logger.info('Starting training on CPU, could be very slow')
    trainer.train(train_iter_fct, valid_iter_fct, opt.train_steps,
                  opt.valid_steps)

    if opt.tensorboard:
        trainer.report_manager.tensorboard_writer.close()

def init(mode):
    """ NASA Bot config and logging init """
    if os.path.exists(f".env.{mode}"):
        init_root_cfg(f".env.{mode}")
        logger_cfg = get_logger_cfg()
        init_logger(log_lvl=logger_cfg['log_lvl'],
                    log_dir=logger_cfg['log_dir'])
    else:
        sys.exit('No .env file present')

def series_plot(ser: pd.Series, kind="bar", y_label="y", x_label="x",
                title="Untitled", d=0.1):
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    ser_max, ser_min = ser.max(), ser.min()
    init_logger().info("min/max: {min}/{max}".format(min=ser_min, max=ser_max))

    delta = (ser_max - ser_min) * d
    ser.plot(kind=kind, ylim=(ser_min - delta, ser_max + delta), title=title)
    plt.show()

def __init__(self, df=None, strategy: dict = None):
    """
    TODO: overwrite to self.strategy
    :param strategy: dictionary { "method": [] list of columns }
    """
    if df is not None and strategy is not None:
        self.logger = init_logger()

        self.input_df = df
        self.strategy = strategy

        # constant
        self.transform_method = {
            "log": FunctionTransformer(np.log1p),
            "standard": StandardScaler(),
            "one_hot_encoding": OneHotEncoder(sparse=False),
            "none": FunctionTransformer(lambda x: x)
        }

        # self.column_transformer = ColumnTransformer(transformers=self.make_transformers())
        self.transformed = None

        super().__init__(transformers=self.make_transformers())
    else:
        super().__init__()

def __init__(self, x_train, y_train, bucket_name,
             grid_params=None, score=mean_squared_error):
    if grid_params is None:
        grid_params = {
            "max_iter": [1, 5, 10],
            "alpha": [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
            "l1_ratio": np.arange(0.0, 1.0, 0.1)
        }

    self.x_train = x_train
    self.y_train = y_train

    self.scorer = score
    self.error = None  # pd.Series
    self.metric = None

    # s3
    self.s3_manager = S3Manager(bucket_name=bucket_name)

    # logger
    self.logger = init_logger()

    super().__init__(
        estimator=ElasticNet(),
        param_grid=grid_params,
        scoring=make_scorer(self.scorer, greater_is_better=False),
        # we need to preserve the before/after relationship, so n_splits=2
        cv=TimeSeriesSplit(n_splits=2).split(self.x_train)
    )

def __init__(self, bucket_name: str, logger_name: str, date: str):
    self.logger = init_logger()

    self.date = date
    # TODO: now -> term of dataset
    self.term = datetime.datetime.now().strftime("%m%Y")

    # s3
    self.bucket_name = bucket_name

def __init__(self, args, device_id):
    """
    :param args: parser.parse_args()
    :param device_id: 0 or -1
    """
    self.args = args
    self.device_id = device_id
    self.model_flags = [
        'hidden_size', 'ff_size', 'heads', 'inter_layers', 'encoder',
        'ff_actv', 'use_interval', 'rnn_size'
    ]

    self.device = "cpu" if self.args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % self.device_id)
    logger.info('Device %s' % self.device)

    torch.manual_seed(self.args.seed)
    random.seed(self.args.seed)
    if self.device_id >= 0:
        torch.cuda.set_device(self.device_id)

    init_logger(args.log_file)

    try:
        self.step = int(self.args.test_from.split('.')[-2].split('_')[-1])
    except IndexError:
        self.step = 0

    logger.info('Loading checkpoint from %s' % self.args.test_from)
    checkpoint = torch.load(self.args.test_from,
                            map_location=lambda storage, loc: storage)

    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in self.model_flags:
            setattr(self.args, k, opt[k])

    config = BertConfig.from_json_file(self.args.bert_config_path)
    self.model = model_builder.Summarizer(self.args, self.device,
                                          load_pretrained_bert=False,
                                          bert_config=config)
    self.model.load_cp(checkpoint)
    self.model.eval()

def __init__(self, bucket_name: str, date: str): self.logger = init_logger() # s3 self.bucket_name = bucket_name self.load_key = "public_data/open_data_raw_material_price/process/csv/{filename}.csv".format( filename=date) # TODO: not loaded here. extractor just do preprocess data self.input_df = self.load()
def __init__(self, bucket_name: str, date: str): self.logger = init_logger() # s3 self.bucket_name = bucket_name self.load_key = "public_data/open_data_terrestrial_weather/process/csv/{filename}.csv".format( filename=date) self.input_df = self.load() self.categorical_features = []
def __init__(self):
    self.logger = init_logger()

    # load the prepared data and split it into train & test sets
    prepared_data = PriceDataPipeline().process()
    X = prepared_data.drop("당일조사가격", axis=1, inplace=False)
    y = prepared_data["당일조사가격"]

    self.logger.debug(
        X.groupby("품목명").count().sort_values(by=["평균기온(°C)"]))

    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        X, y, test_size=0.2, stratify=X["품목명"])

def main(): opt = parse_args() if (opt.shuffle > 0): raise AssertionError("-shuffle is not implemented, please make sure \ you shuffle your data before pre-processing.") init_logger(opt.log_file) logger.info("Input args: %r", opt) logger.info("Extracting features...") logger.info("Building 'Fields' object...") fields = get_fields() logger.info("Building & saving training data...") train_dataset_files = build_save_dataset('train', fields, opt) # 返回生成的文件列表 logger.info("Building & saving validation data...") build_save_dataset('valid', fields, opt) logger.info("Building & saving vocabulary...") build_save_vocab(train_dataset_files, fields, opt) # only用train集创建vocabulary
def __init__(self, strategy=None, df=None):
    self.logger = init_logger()

    self.input_df = df
    self.strategy = strategy

    self.fillna_method = {
        "drop": self.fillna_with_drop,
        "zero": self.fillna_with_zero,
        "linear": self.fillna_with_linear,
    }

def __init__(self, bucket_name):
    """
    TODO: Add capability to process other formats (e.g. json, text, avro, parquet, etc.)
    :param bucket_name: AWS S3 bucket name
    """
    self.logger = init_logger()

    self.bucket_name = bucket_name
    self.s3 = boto3.resource('s3')
    self.s3_bucket = self.s3.Bucket(bucket_name)

def draw_hist(s, h_type: str = "dist", name: str = None):
    h_method = {
        "dist": sns.distplot,
        "count": sns.countplot,
    }

    try:
        method = h_method[h_type]
    except KeyError:
        # TODO: handle exception
        init_logger().critical(
            "histogram type '{h_type}' is not supported".format(h_type=h_type))
        sys.exit()

    if isinstance(s, pd.Series):
        plt.title('{name} histogram'.format(name=s.name))
        method(s)
        plt.show()
    else:
        # for jupyter notebook
        plt.title('{name} histogram'.format(name=name))
        return list(map(lambda series: method(series), s))

def __init__(self, args):
    self.logger = init_logger()

    self.type_map = {
        "price": "public_price",
        "terrestrial_weather": "public_terrestrial_weather",
        "marine_weather": "public_marine_weather"
    }

    self.args = args
    self.types = None
    self.df_list = None

def __init__(self, args, device_id):
    """
    :param args: parser.parse_args()
    :param device_id: 0 or -1
    """
    self.args = args
    self.device_id = device_id
    self.model_flags = [
        'hidden_size', 'ff_size', 'heads', 'inter_layers', 'encoder',
        'ff_actv', 'use_interval', 'rnn_size'
    ]

    self.device = "cpu" if self.args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % self.device_id)
    logger.info('Device %s' % self.device)

    torch.manual_seed(self.args.seed)
    random.seed(self.args.seed)
    if self.device_id >= 0:
        torch.cuda.set_device(self.device_id)

    init_logger(args.log_file)

def main():
    opt = parse_args()  # get the parsed command-line arguments

    if opt.shuffle > 0:
        raise AssertionError("-shuffle is not implemented, please make sure \
                             you shuffle your data before pre-processing.")

    init_logger(opt.log_file)
    logger.info("Input args: %r", opt)
    logger.info("Extracting features...")

    logger.info("Building `Fields` object...")
    # get the dict of torchtext.data fields (src, target & indices) without data
    fields = get_fields()
    # x_train, x_valid, x_test, y_train_emo, y_valid_emo, y_test_emo = data_loader.test_mosei_emotion_data()

    logger.info("Building & saving training data...")
    # shard the source and return the shard file paths
    train_dataset_files = build_save_dataset('train', fields, opt)

    logger.info("Building & saving validation data...")
    # shard the source and return the shard file paths
    build_save_dataset('valid', fields, opt)

    logger.info("Building & saving vocabulary...")
    build_save_vocab(train_dataset_files, fields, opt)

def main(): opt = parse_args() if (opt.shuffle > 0): raise AssertionError("-shuffle is not implemented, please make sure \ you shuffle your data before pre-processing.") print(opt) # 全部日志写入file以及console init_logger(opt.log_file) logger.info("Extracting features...") logger.info("Building `Fields` object...") fields = get_fields() logger.info("Building & saving training data...") train_dataset_files = build_save_dataset('train', fields, opt) logger.info("Building & saving validation data...") build_save_dataset('valid', fields, opt) logger.info("Building & saving vocabulary...") build_save_vocab(train_dataset_files, fields, opt)
def multi_card_run(self):
    """ Spawns 1 process per GPU """
    init_logger()

    nb_gpu = self.args.world_size
    mp = torch.multiprocessing.get_context('spawn')

    # Create a thread to listen for errors in the child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)

    # Train with multiprocessing.
    process = []
    for i in range(nb_gpu):
        self.device_id = i
        process.append(
            mp.Process(target=self.multi_card_train,
                       args=(self.args, self.device_id, error_queue),
                       daemon=True))
        process[i].start()
        logger.info(" Starting process pid: %d " % process[i].pid)
        error_handler.add_child(process[i].pid)

    for p in process:
        p.join()

def __init__(self, base_url, bucket_name, key, head=False):
    self.logger = init_logger()

    self.bucket_name = bucket_name
    self.s3_manager = S3Manager(bucket_name=self.bucket_name)
    self.prefix = key

    self.chrome_path = "C:/chromedriver"
    options = webdriver.ChromeOptions()
    if head is False:
        options.add_argument('headless')
    self.driver = webdriver.Chrome(executable_path=self.chrome_path,
                                   chrome_options=options)

    self.base_url = base_url

def main(_):
    begin = time.time()
    tf.gfile.MakeDirs(FLAGS.model_dir)

    # redirects tf logs to file
    log_file = logging.init_logger(FLAGS.model_dir, FLAGS.do_debug)
    config.display_args(FLAGS)

    if FLAGS.model == "bert":
        run_bert_classifier(log_file)
    else:
        E = DRSCExperiment(FLAGS)
        E.run()

    tf.logging.info("Execution Time: {:.2f}s".format(time.time() - begin))

def __init__(self, bucket_name: str, x_train, y_train, params=None):
    # logger
    self.logger = init_logger()

    # s3
    self.s3_manager = S3Manager(bucket_name=bucket_name)

    if params is None:
        self.model = ElasticNet()
    else:
        self.model = ElasticNet(**params)

    self.x_train, self.y_train = x_train, y_train

    self.error = None
    self.metric = None

def __init__(self, key):
    self.key = key
    self.logger = init_logger()

    # s3
    self.bucket_name = "production-bobsim"
    self.s3_key = "public_data/{dir}/origin".format(dir=self.key)

    # valid check
    self.dtypes = dtype[key]
    self.columns = list(self.dtypes.keys())

    # rdb
    self.schema_name = "public_data"
    self.table_name = self.key

    # return
    self.df = None

def __init__(self, bucket_name: str, date: str):
    self.logger = init_logger()
    self.date = date

    # s3
    # TODO: bucket_name -> parameterized
    self.s3_manager = S3Manager(bucket_name=bucket_name)
    self.load_key = "public_data/open_data_raw_material_price/origin/csv/{filename}.csv".format(
        filename=self.date)
    self.save_key = "public_data/open_data_raw_material_price/process/csv/{filename}.csv".format(
        filename=self.date)

    self.dtypes = dtype["raw_material_price"]
    self.translate = translation["raw_material_price"]

    # load filtered df
    self.input_df = self.load()

def __init__(self, bucket_name: str, date: str):
    self.logger = init_logger()

    # TODO: how to handle datetime?
    self.term = datetime.strptime(date, "%Y%m")

    # s3
    self.bucket_name = bucket_name
    self.file_name = "2014-2020.csv"
    self.load_key = "public_data/open_data_marine_weather/origin/csv/{filename}".format(
        filename=self.file_name)
    self.save_key = "public_data/open_data_marine_weather/process/csv/{filename}.csv".format(
        filename=date)

    # type
    self.dtypes = dtype["marine_weather"]
    self.translate = translation["marine_weather"]

    # fillna
    self.columns_with_linear = [
        "m_wind_spd_avg", "m_atm_press_avg", "m_rel_hmd_avg", "m_temper_avg",
        "m_water_temper_avg", "m_max_wave_h_avg", "m_sign_wave_h_avg",
        "m_sign_wave_h_high", "m_max_wave_h_high"
    ]
    self.columns_with_zero = ['m_wave_p_avg', 'm_wave_p_high']
    """
    self.columns_with_linear = [
        "m_atm_press_avg", "m_rel_hmd_avg", "m_temper_avg",
        "m_water_temper_avg", "m_max_wave_h_avg", "m_max_wave_h_high"
    ]
    """
    # self.columns_with_zero = ['m_wave_p_avg']
    self.columns_with_drop = ['date']

    # load filtered df and take certain term
    df = self.load()
    # TODO: make function
    date_picker = (df['date'].dt.year == self.term.year) & (
        df['date'].dt.month == self.term.month)
    self.input_df = df[date_picker]

    # tgt_iter = None
    translator.translate(src_data_iter=src_iter,
                         tgt_data_iter=tgt_iter,
                         batch_size=opt.batch_size,
                         out_file=out_file)
    out_file.close()


if __name__ == "__main__":
    parser = configargparse.ArgumentParser(
        description='translate.py',
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter)
    opts.config_opts(parser)
    opts.translate_opts(parser)
    opt = parser.parse_args()

    logger = init_logger(opt.log_file)
    logger.info("Input args: %r", opt)

    path = 'rein_model/rein_model_step'
    for i in range(0, 25000, 10):
        current_path = path + '_' + str(i) + '.pt'
        if os.path.exists(current_path):
            model_path = current_path
            opt.output = 'rein_data/rein.tran' + '_' + str(i)
            main(opt, model_path)
        else:
            continue

#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# import argparse
from utils.args import print_arguments
from utils.logging import init_logger
from utils.check import check_gpu
from networks.graphsum.run_graphsum import main as run_graphsum
from run_args import parser as run_parser


if __name__ == "__main__":
    args = run_parser.parse_args()
    print_arguments(args)
    init_logger(args.log_file)
    check_gpu(args.use_cuda)

    if args.model_name == 'graphsum':
        run_graphsum(args)
    else:
        raise ValueError("Model %s is not supported currently!" % args.model_name)

def main():
    logger = init_logger()

    app = Flask(__name__)
    app.config['JSON_AS_ASCII'] = False
    app.config['JSON_SORT_KEYS'] = False

    # TODO: classify crawl recipe API service
    @app.route('/', methods=['GET'])
    def index():
        return "<h2>Crawling Service</h2>\
                <h3>Recipe</h3>\
                <strong>Mangae</strong><br>\
                [GET] : /crawl_recipe/M?str_num=6932924&end_num=6932926<br>\
                [GET] : /recipe/M<br><br>\
                <strong>Haemuk</strong><br>\
                [GET] : /crawl_recipe/H?str_num=5004&end_num=5005<br>\
                [GET] : /recipe/H<br><br>\
                <h3>Item</h3>\
                <strong>Emart</strong><br>\
                [GET] : /crawl_item/E<br>\
                [GET] : /item/E<br><br>\
                <strong>Haemuk</strong><br>\
                [GET] : /crawl_item/H<br>\
                [GET] : /item/H<br><br>"

    @app.route('/crawl_recipe/<source>', methods=['GET'])
    def crawl_recipe(source):
        """
        :return: jsonified recipes
        """
        args = request.args
        try:
            str_num, end_num = args["str_num"], args["end_num"]
        except KeyError:
            logger.warning("There is no parameter, 'str_num' or 'end_num'")
            str_num, end_num = 6934386, 6934390

        logger.info("let's crawl {str} ~ {end} {source} recipes".format(
            str=str_num, end=end_num, source=source))

        bucket_name = "production-bobsim"
        key = "crawled_recipe/{s}".format(s=source)
        candidate_num = range(int(str_num), int(end_num))
        field = ['title', 'items', "duration", "tags"]

        if source == "M":
            result = MangaeRecipeCrawler(
                base_url="https://www.10000recipe.com/recipe",
                candidate_num=candidate_num,
                field=field,
                bucket_name=bucket_name,
                key=key).process()
        elif source == "H":
            result = HaemukRecipeCrawler(
                base_url="https://www.haemukja.com/recipes",
                candidate_num=candidate_num,
                field=field,
                bucket_name=bucket_name,
                key=key).process()
        else:
            raise NotImplementedError

        return jsonify(result)

    @app.route('/crawl_item/<source>', methods=['GET'])
    def crawl_item(source):
        """
        :return: jsonified item categories
        """
        logger.info("let's crawl {source} item categories".format(
            source=source))

        bucket_name = "production-bobsim"
        key = "crawled_item/{s}".format(s=source)
        head = True

        if source == "H":
            result = HaemukItemCrawler(
                base_url="https://www.haemukja.com/refrigerator",
                bucket_name=bucket_name,
                key=key,
                head=head).process()
        elif source == "E":
            result = HaemukItemCrawler(
                base_url="http://emart.ssg.com/",
                bucket_name=bucket_name,
                key=key,
                head=head).process()
        else:
            raise NotImplementedError

        return jsonify(result)

    @app.route('/<prefix>/<source>', methods=['GET'])
    def get_recipes(prefix, source):
        data = S3Manager("production-bobsim").fetch_dict_from_json(
            key="crawled_{p}/{s}".format(p=prefix, s=source))
        if data is None:
            return 'there is no data'
        return jsonify(data)

    app.run(host='0.0.0.0', port=9000, debug=True)

""" Adopted from AllenNLP: https://github.com/allenai/allennlp/tree/v0.6.1/allennlp/common Functions and exceptions for checking that AllenNLP and its models are configured correctly. """ from torch import cuda from utils import logging logger = logging.init_logger() # pylint: disable=invalid-name class ConfigurationError(Exception): """ The exception raised by any AllenNLP object when it's misconfigured (e.g. missing properties, invalid properties, unknown properties). """ def __init__(self, message): super(ConfigurationError, self).__init__() self.message = message def __str__(self): return repr(self.message) def log_pytorch_version_info(): import torch