def load(fname, directory=INIT_DIR, data_dir=None, output_name=None, output_dir=None):
    """
    Load trained model from disk
    """
    if output_dir is None:
        output_dir = directory
    if output_name is None:
        output_name = fname
    while os.path.exists(os.path.join(output_dir, output_name) + '.pkl'):
        output_name += '_ctd'
    setup, aux_info = TrainingSetup.load(fname, directory)
    interface = DataInterface(setup, (),
                              **sub_dict(aux_info, ['particle', 'neg_samples']))
    if data_dir is None:
        stem = '-'.join(fname.split('-', maxsplit=2)[:2])
        data_dir = os.path.join(DATA_DIR, '{}-nodes'.format(stem))
    trainer = Trainer(interface,
                      data_dir=data_dir,
                      output_name=os.path.join(output_dir, output_name),
                      **sub_dict(aux_info, ['processes', 'epochs', 'minibatch',
                                            'ent_burnin', 'pred_burnin']))
    trainer.completed_files.extend(aux_info['completed_files'])
    return trainer

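# Every snippet in this section filters a dict down to a fixed set of keys via a
# sub_dict helper (utils.sub_dict in the spider snippets). Its definition is not
# included here; a minimal sketch of the assumed behaviour would be:
def sub_dict(somedict, keys):
    """Return a new dict containing only the given keys of somedict (assumed behaviour)."""
    return {k: somedict[k] for k in keys}
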
def _insert(item, p_date):
    keys = ('title', 'author', 'content_url', 'digest', 'cover', 'source_url')
    sub_data = utils.sub_dict(item, keys)
    post = Post(**sub_data)
    p_date = datetime.fromtimestamp(p_date)
    post["p_date"] = p_date
    logger.info('save data %s ' % post.title)
    try:
        post.save()
    except Exception as e:
        logger.error("Save failed data=%s" % post.to_json(), exc_info=True)

def _insert(item):
    keys = ('companyFullName', 'jobNature', 'positionName', 'salary', 'financeStage',
            'district', 'education', 'companySize', 'companyLabelList', 'skillLables',
            'positionId', 'secondType', 'firstType', 'thirdType', 'createTime', 'city')
    sub_data = utils.sub_dict(item, keys)
    # print(sub_data)
    post = Post(**sub_data)
    print('save data %s ' % post.companyFullName)
    try:
        post.save()
    except Exception as e:
        # print() does not accept logging's exc_info argument; include the error explicitly
        print("Save failed data=%s error=%s" % (post.to_json(), e))

def load(fname, directory=INIT_DIR, data_dir=None, output_name=None, output_dir=None):
    """
    Load trained model from disk
    """
    if output_dir is None:
        output_dir = directory
    if output_name is None:
        output_name = fname
    while os.path.exists(os.path.join(output_dir, output_name) + '.pkl'):
        output_name += '_ctd'
    setup, aux_info = TrainingSetup.load(fname, directory)
    interface = DataInterface(setup,
                              **sub_dict(aux_info, ['particle', 'neg_samples']))
    if data_dir is None:
        prefix, thresh = fname.split('-', maxsplit=2)[:2]
        data_dir = os.path.join(DATA_DIR, '{}-{}-nodes'.format(prefix, thresh))
    trainer = Trainer(interface,
                      data_dir=data_dir,
                      output_name=os.path.join(output_dir, output_name),
                      **sub_dict(aux_info, ['processes', 'epochs', 'minibatch',
                                            'ent_burnin', 'pred_burnin']))
    trainer.completed_files.extend(aux_info['completed_files'])
    return trainer

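# Usage sketch for load() above (hypothetical, not from the source): resume training
# from a model previously saved as core-800-1.pkl / core-800-1.aux.pkl under INIT_DIR.
# The continued run is written under a '_ctd' suffix so the original files are kept.
trainer = load('core-800-1')
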
def _insert(item, p_date):
    keys = ('title', 'author', 'content_url', 'digest', 'cover', 'source_url')
    # Filter out the unused fields
    sub_data = utils.sub_dict(item, keys)
    post = Post(**sub_data)
    # Convert the timestamp to a datetime
    # Reference: http://www.wklken.me/posts/2015/03/03/python-base-datetime.html
    p_date = datetime.fromtimestamp(p_date)
    post["p_date"] = p_date
    # Log the record being saved
    logger.info('save data %s' % post.title)
    try:
        post.save()
    except Exception as e:
        logger.error("Save failed data=%s" % post.to_json(), exc_info=True)

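# The _insert and save snippets here build a Post document and call post.save() and
# post.to_json(), which matches the mongoengine Document API. The Post class is not
# shown in this section; a minimal sketch for the article variant (field names taken
# from the keys tuple above, everything else assumed) might look like:
from mongoengine import Document, StringField, DateTimeField

class Post(Document):
    title = StringField()
    author = StringField()
    content_url = StringField()
    digest = StringField()
    cover = StringField()
    source_url = StringField()
    p_date = DateTimeField()
# The job-listing variant of _insert above would need its own Post with the
# corresponding fields (companyFullName, positionName, salary, ...), defined the same way.
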
def save(msg_list):
    """
    msg_list is a string like "{\"list\":[{\"comm_msg_info\":{\"id....
    The escaped characters still need to be unescaped before json can turn it into a dict
    """
    # TODO handle multi-article pushes (multi_app_msg_item_list)
    msg_list = msg_list.replace("&quot;", "\"").replace("\\/", "/")
    data = json.loads(msg_list)
    posts = data.get("list")
    for item in posts:
        msg_info = item.get("app_msg_ext_info")
        if msg_info:
            keys = ('title', 'author', 'content_url', 'digest', 'cover', 'source_url')
            sub_data = utils.sub_dict(item.get("app_msg_ext_info"), keys)
            post = Post(**sub_data)
            logger.info('save data %s ' % post.title)
            try:
                post.save()
            except Exception as e:
                logger.error("Save failed data=%s" % post.to_json(), exc_info=True)
        else:
            logger.warning(u"This message is not an article push, data=%s"
                           % json.dumps(item.get("comm_msg_info")))

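# Hypothetical illustration (not from the source) of the input save() expects: the
# msgList blob scraped from a WeChat history page, with quotes HTML-escaped as &quot;.
# This particular item has no app_msg_ext_info, so save() only logs a warning for it.
raw = '{&quot;list&quot;: [{&quot;comm_msg_info&quot;: {&quot;id&quot;: 1000000001}}]}'
save(raw)
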
def setup_trainer(**kw):
    """
    Set up a semantic function model, ready for training
    """
    # Set input and output
    DATA = os.path.join(DATA_DIR, 'core-{}-nodes'.format(kw['thresh']))
    output_template = os.path.join(OUT_DIR, 'core-{}-{}')
    if kw['suffix'] is None:
        kw['suffix'] = 1
        while os.path.exists(output_template.format(kw['thresh'], kw['suffix']) + '.pkl'):
            kw['suffix'] += 1
    OUTPUT = output_template.format(kw['thresh'], kw['suffix'])
    # Save under OUTPUT.pkl and OUTPUT.aux.pkl
    # Check the output path is clear
    if os.path.exists(OUTPUT + '.pkl'):
        raise Exception('File already exists')
    # Load vocab for model
    with open(os.path.join(AUX_DIR, VOCAB_FILE), 'rb') as f:
        preds = pickle.load(f)
    with open(os.path.join(AUX_DIR, FREQ_FILE), 'rb') as f:
        pred_freq = pickle.load(f)
    links = ['ARG1', 'ARG2']
    # Ignore rare preds (e.g. if using core-100)
    for i in range(len(pred_freq)):
        if pred_freq[i] < kw['thresh']:
            pred_freq[i] = 0
    # Set random seed, if specified
    if kw['seed']:
        np.random.seed(kw['seed'])
    # Set up model
    model_kwargs = sub_dict(kw, ["dims", "card", "init_bias", "init_card", "init_range",
                                 "init_ent_bias", "init_link_str", "init_verb_prop",
                                 "init_pat_prop", "init_ag_prop", "freq_alpha"])
    if kw['model'] == 'independent':
        model_class = SemFuncModel_IndependentPreds
    elif kw['model'] == 'factorised':
        model_class = SemFuncModel_FactorisedPreds
        model_kwargs.update(sub_dict(kw, ["embed_dims"]))
    else:
        raise Exception('model class not recognised')
    model = model_class(preds, links, pred_freq, verbose=False, **model_kwargs)
    # Set up training hyperparameters
    setup_kwargs = sub_dict(kw, ["rate", "rate_ratio", "l2", "l2_ratio", "l2_ent",
                                 "l1", "l1_ratio", "l1_ent", "ent_steps", "pred_steps"])
    if kw['setup'] == 'adagrad':
        setup_class = AdaGradTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["ada_decay"]))
    elif kw['setup'] == 'adam':
        setup_class = AdamTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["mean_decay", "var_decay"]))
    else:
        raise Exception('setup class not recognised')
    setup = setup_class(model, **setup_kwargs)
    # Set up training (without data)
    particle = create_particle(3, 2, 5)
    interface = DataInterface(setup, (), particle, neg_samples=kw['neg_samples'])
    trainer_kwargs = sub_dict(kw, ["processes", "epochs", "minibatch",
                                   "ent_burnin", "pred_burnin"])
    trainer = Trainer(interface, data_dir=DATA, output_name=OUTPUT, **trainer_kwargs)
    return trainer

def setup_trainer(**kw):
    """
    Set up a semantic function model, ready for training
    """
    # Set input and output filepaths
    # Naming convention is <dataset>-<threshold>-<name>
    if kw['multipred']:
        prefix = 'multicore'
    else:
        prefix = 'core'
    thresh = kw['thresh']
    suffix = kw['suffix']
    # Directory for the data
    DATA = os.path.join(DATA_DIR, '{}-{}-nodes'.format(prefix, thresh))
    output_template = os.path.join(OUT_DIR, '{}-{}-{}')
    # If no suffix is given, use the smallest integer for which no file exists
    if suffix is None:
        suffix = 1
        while os.path.exists(output_template.format(prefix, thresh, suffix) + '.pkl'):
            suffix += 1
    # Save under OUTPUT.pkl and OUTPUT.aux.pkl
    OUTPUT = output_template.format(prefix, thresh, suffix)
    # Check the output path is clear, unless overwriting is allowed
    if not kw['overwrite'] and os.path.exists(OUTPUT + '.pkl'):
        raise Exception("File already exists - did you mean to use '-overwrite'?")
    # Load vocab for model
    with open(os.path.join(AUX_DIR, '{}-{}-vocab.pkl'.format(prefix, thresh)), 'rb') as f:
        preds = pickle.load(f)
    with open(os.path.join(AUX_DIR, '{}-{}-freq.pkl'.format(prefix, thresh)), 'rb') as f:
        pred_freq = pickle.load(f)
    links = ['ARG1', 'ARG2']
    # Set random seed, if specified
    if kw['seed']:
        np.random.seed(kw['seed'])

    # Set up model
    # Get hyperparameters
    model_kwargs = sub_dict(kw, ["dims", "card", "init_bias", "init_card", "init_range",
                                 "init_ent_bias", "init_link_str", "init_verb_prop",
                                 "init_pat_prop", "init_ag_prop", "freq_alpha"])
    # Choose model class
    if kw['model'] == 'independent':
        if kw['multipred']:
            model_class = SemFuncModel_MultiIndependentPreds
        else:
            model_class = SemFuncModel_IndependentPreds
    elif kw['model'] == 'factorised':
        raise ValueError('factorised pred model is deprecated')
        #model_class = SemFuncModel_FactorisedPreds
        #model_kwargs.update(sub_dict(kw, ["embed_dims"]))
    else:
        raise ValueError('model class not recognised')
    # Initialise model
    model = model_class(preds, links, pred_freq, verbose=False, **model_kwargs)

    # Set up gradient descent algorithm
    # Get hyperparameters
    setup_kwargs = sub_dict(kw, ["rate", "rate_ratio", "l2", "l2_ratio", "l2_ent",
                                 "l1", "l1_ratio", "l1_ent", "ent_steps", "pred_steps"])
    # Choose training setup class
    if kw['setup'] == 'adagrad':
        setup_class = AdaGradTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["ada_decay"]))
    elif kw['setup'] == 'adam':
        setup_class = AdamTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["mean_decay", "var_decay"]))
    else:
        raise ValueError('setup class not recognised')
    # Initialise training setup
    setup = setup_class(model, **setup_kwargs)

    # Set up trainer (without data)
    # Initialise particle
    particle = create_particle(*kw['particle'])
    # Initialise data interface
    interface = DataInterface(setup, particle, neg_samples=kw['neg_samples'])
    # Get hyperparameters
    trainer_kwargs = sub_dict(kw, ["processes", "epochs", "minibatch",
                                   "ent_burnin", "pred_burnin"])
    # Initialise trainer
    trainer = Trainer(interface, data_dir=DATA, output_name=OUTPUT, **trainer_kwargs)
    return trainer

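# Illustration of the output-naming logic in setup_trainer() above, with hypothetical
# values: for prefix='core' and thresh=800, suffix=None triggers a search for the
# smallest free integer suffix, so if core-800-1.pkl and core-800-2.pkl already exist
# under OUT_DIR, the new model is saved as core-800-3.pkl and core-800-3.aux.pkl.
# Passing an explicit suffix skips the search and raises (unless 'overwrite' is set)
# if the target file already exists.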