Example #1
def load(fname, directory=INIT_DIR, data_dir=None, output_name=None, output_dir=None):
    """
    Load trained model from disk
    """
    if output_dir is None:
        output_dir = directory
    if output_name is None:
        output_name = fname
        while os.path.exists(os.path.join(output_dir, output_name) + '.pkl'):
            output_name += '_ctd'

    setup, aux_info = TrainingSetup.load(fname, directory)

    interface = DataInterface(setup, (),
                              **sub_dict(aux_info, ['particle',
                                                    'neg_samples']))

    if data_dir is None:
        stem = '-'.join(fname.split('-', maxsplit=2)[:2])
        data_dir = os.path.join(DATA_DIR, '{}-nodes'.format(stem))

    trainer = Trainer(interface,
                      data_dir=data_dir,
                      output_name=os.path.join(output_dir, output_name),
                      **sub_dict(aux_info, ['processes',
                                            'epochs',
                                            'minibatch',
                                            'ent_burnin',
                                            'pred_burnin']))

    trainer.completed_files.extend(aux_info['completed_files'])

    return trainer
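All of these examples lean on the helper sub_dict, whose definition the listing never shows. Judging by the call sites, it projects a dict onto a subset of its keys; a minimal sketch consistent with that usage (not the project's actual implementation):

def sub_dict(d, keys):
    """Return a new dict containing only the given keys of d."""
    return {k: d[k] for k in keys}

For instance, the call in the snippet above would behave like this (values are illustrative):

aux_info = {'particle': 'p0', 'neg_samples': 5, 'epochs': 10}
sub_dict(aux_info, ['particle', 'neg_samples'])
# -> {'particle': 'p0', 'neg_samples': 5}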
Example #2
def _insert(item, p_date):
    keys = ('title', 'author', 'content_url', 'digest', 'cover', 'source_url')
    sub_data = utils.sub_dict(item, keys)
    post = Post(**sub_data)
    p_date = datetime.fromtimestamp(p_date)
    post["p_date"] = p_date
    logger.info('save data %s' % post.title)
    try:
        post.save()
    except Exception:
        logger.error("Save failed, data=%s" % post.to_json(), exc_info=True)
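The Post class is not defined on this page either. Given post.save(), post.to_json(), and item-style assignment post["p_date"] = ..., it looks like a MongoEngine document; a hypothetical sketch under that assumption, with field names inferred from the keys above:

from mongoengine import DynamicDocument, StringField, DateTimeField, connect

class Post(DynamicDocument):
    title = StringField()
    author = StringField()
    content_url = StringField()
    digest = StringField()
    cover = StringField()
    source_url = StringField()
    p_date = DateTimeField()

connect('wechat')  # assumed database name; save() needs an open connection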
Example #3
def _insert(item):
    keys = ('companyFullName', 'jobNature', 'positionName', 'salary',
            'financeStage', 'district', 'education', 'companySize',
            'companyLabelList', 'skillLables', 'positionId', 'secondType',
            'firstType', 'thirdType', 'createTime', 'city')
    sub_data = utils.sub_dict(item, keys)
    # print(sub_data)
    post = Post(**sub_data)
    print('save data %s ' % post.companyFullName)
    try:
        post.save()
    except Exception as e:
        # print() does not accept exc_info; include the exception in the message
        print("Save failed, data=%s, error=%s" % (post.to_json(), e))
Example #4
    def load(fname,
             directory=INIT_DIR,
             data_dir=None,
             output_name=None,
             output_dir=None):
        """
        Load trained model from disk
        """
        if output_dir is None:
            output_dir = directory
        if output_name is None:
            output_name = fname
            while os.path.exists(
                    os.path.join(output_dir, output_name) + '.pkl'):
                output_name += '_ctd'

        setup, aux_info = TrainingSetup.load(fname, directory)

        interface = DataInterface(
            setup, **sub_dict(aux_info, ['particle', 'neg_samples']))

        if data_dir is None:
            prefix, thresh = fname.split('-', maxsplit=2)[:2]
            data_dir = os.path.join(DATA_DIR,
                                    '{}-{}-nodes'.format(prefix, thresh))

        trainer = Trainer(
            interface,
            data_dir=data_dir,
            output_name=os.path.join(output_dir, output_name),
            **sub_dict(aux_info, [
                'processes', 'epochs', 'minibatch', 'ent_burnin', 'pred_burnin'
            ]))

        trainer.completed_files.extend(aux_info['completed_files'])

        return trainer
Example #5
def _insert(item, p_date):
    keys = ('title', 'author', 'content_url', 'digest', 'cover',
            'source_url')
    # Filter out the fields we don't need
    sub_data = utils.sub_dict(item, keys)
    post = Post(**sub_data)
    # Convert the Unix timestamp to a datetime
    # Reference: http://www.wklken.me/posts/2015/03/03/python-base-datetime.html
    p_date = datetime.fromtimestamp(p_date)
    post["p_date"] = p_date
    # Log progress
    logger.info('save data %s' % post.title)
    try:
        post.save()
    except Exception:
        logger.error("Save failed, data=%s" % post.to_json(), exc_info=True)
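The date conversion is plain standard library: datetime.fromtimestamp turns a Unix timestamp in seconds into a datetime in local time. A quick self-contained check:

from datetime import datetime

ts = 1425312000  # seconds since the Unix epoch
print(datetime.fromtimestamp(ts))  # local time, e.g. 2015-03-02 16:00:00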
Example #6
def save(msg_list):
    """
    msg_list is a string of the form
    "{\"list\":[{\"comm_msg_info\":{\"id....
    It has to be unescaped first, then parsed into a dict with json.
    """
    # TODO handle multi-article pushes (multi_app_msg_item_list)
    msg_list = msg_list.replace("&quot;", "\"").replace("\\/", "/")
    data = json.loads(msg_list)
    posts = data.get("list")
    for item in posts:
        msg_info = item.get("app_msg_ext_info")
        if msg_info:
            keys = ('title', 'author', 'content_url', 'digest', 'cover', 'source_url')
            sub_data = utils.sub_dict(msg_info, keys)
            post = Post(**sub_data)
            logger.info('save data %s' % post.title)
            try:
                post.save()
            except Exception:
                logger.error("Save failed, data=%s" % post.to_json(), exc_info=True)
        else:
            logger.warning(u"Not an article push, data=%s" % json.dumps(item.get("comm_msg_info")))
Example #7
def setup_trainer(**kw):
    """
    Setup a semantic function model, ready for training
    """
    # Set input and output
    DATA = os.path.join(DATA_DIR, 'core-{}-nodes'.format(kw['thresh']))
    
    output_template = os.path.join(OUT_DIR, 'core-{}-{}')
    
    if kw['suffix'] is None:
        kw['suffix'] = 1
        while os.path.exists(output_template.format(kw['thresh'], kw['suffix'])+'.pkl'):
            kw['suffix'] += 1
    
    OUTPUT = output_template.format(kw['thresh'], kw['suffix'])
    # Save under OUTPUT.pkl and OUTPUT.aux.pkl
    
    # Check the output path is clear
    if os.path.exists(OUTPUT+'.pkl'):
        raise Exception('File already exists')
    
    # Load vocab for model
    with open(os.path.join(AUX_DIR, VOCAB_FILE), 'rb') as f:
        preds = pickle.load(f)
    with open(os.path.join(AUX_DIR, FREQ_FILE), 'rb') as f:
        pred_freq = pickle.load(f)
    links = ['ARG1', 'ARG2']
    
    # Ignore rare preds (e.g. if using core-100)
    for i in range(len(pred_freq)):
        if pred_freq[i] < kw['thresh']:
            pred_freq[i] = 0
    
    # Set random seed, if specified
    if kw['seed']:
        np.random.seed(kw['seed'])
    
    # Set up model
    model_kwargs = sub_dict(kw, ["dims",
                                 "card",
                                 "init_bias",
                                 "init_card",
                                 "init_range",
                                 "init_ent_bias",
                                 "init_link_str",
                                 "init_verb_prop",
                                 "init_pat_prop",
                                 "init_ag_prop",
                                 "freq_alpha"])
    if kw['model'] == 'independent':
        model_class = SemFuncModel_IndependentPreds
    elif kw['model'] == 'factorised':
        model_class = SemFuncModel_FactorisedPreds
        model_kwargs.update(sub_dict(kw, ["embed_dims"]))
    else:
        raise Exception('model class not recognised')
    model = model_class(preds, links, pred_freq, verbose=False, **model_kwargs)
    
    # Set up training hyperparameters
    setup_kwargs = sub_dict(kw, ["rate",
                                 "rate_ratio",
                                 "l2",
                                 "l2_ratio",
                                 "l2_ent",
                                 "l1",
                                 "l1_ratio",
                                 "l1_ent",
                                 "ent_steps",
                                 "pred_steps"])
    if kw['setup'] == 'adagrad':
        setup_class = AdaGradTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["ada_decay"]))
    elif kw['setup'] == 'adam':
        setup_class = AdamTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["mean_decay",
                                          "var_decay"]))
    else:
        raise Exception('setup class not recognised')
    setup = setup_class(model, **setup_kwargs)
    
    # Set up training (without data)
    particle = create_particle(3, 2, 5)
    interface = DataInterface(setup, (),
                              particle,
                              neg_samples=kw['neg_samples'])
    
    trainer_kwargs = sub_dict(kw, ["processes",
                                   "epochs",
                                   "minibatch",
                                   "ent_burnin",
                                   "pred_burnin"])
    
    trainer = Trainer(interface,
                      data_dir=DATA,
                      output_name=OUTPUT,
                      **trainer_kwargs)
    
    return trainer
Example #8
def setup_trainer(**kw):
    """
    Setup a semantic function model, ready for training
    """
    # Set input and output filepaths
    # Naming convention is <dataset>-<threshold>-<name>

    if kw['multipred']:
        prefix = 'multicore'
    else:
        prefix = 'core'
    thresh = kw['thresh']
    suffix = kw['suffix']

    # Directory for the data
    DATA = os.path.join(DATA_DIR, '{}-{}-nodes'.format(prefix, thresh))

    output_template = os.path.join(OUT_DIR, '{}-{}-{}')

    # If no suffix is given, use the smallest integer for which no file exists
    if suffix is None:
        suffix = 1
        while os.path.exists(
                output_template.format(prefix, thresh, suffix) + '.pkl'):
            suffix += 1

    # Save under OUTPUT.pkl and OUTPUT.aux.pkl
    OUTPUT = output_template.format(prefix, thresh, suffix)

    # Check the output path is clear, unless overwriting is allowed
    if not kw['overwrite'] and os.path.exists(OUTPUT + '.pkl'):
        raise Exception(
            "File already exists - did you mean to use '-overwrite'?")

    # Load vocab for model
    with open(os.path.join(AUX_DIR, '{}-{}-vocab.pkl'.format(prefix, thresh)),
              'rb') as f:
        preds = pickle.load(f)
    with open(os.path.join(AUX_DIR, '{}-{}-freq.pkl'.format(prefix, thresh)),
              'rb') as f:
        pred_freq = pickle.load(f)
    links = ['ARG1', 'ARG2']

    # Set random seed, if specified
    if kw['seed']:
        np.random.seed(kw['seed'])

    # Set up model

    # Get hyperparameters
    model_kwargs = sub_dict(kw, [
        "dims", "card", "init_bias", "init_card", "init_range",
        "init_ent_bias", "init_link_str", "init_verb_prop", "init_pat_prop",
        "init_ag_prop", "freq_alpha"
    ])
    # Choose model class
    if kw['model'] == 'independent':
        if kw['multipred']:
            model_class = SemFuncModel_MultiIndependentPreds
        else:
            model_class = SemFuncModel_IndependentPreds
    elif kw['model'] == 'factorised':
        raise ValueError('factorised pred model is deprecated')
        #model_class = SemFuncModel_FactorisedPreds
        #model_kwargs.update(sub_dict(kw, ["embed_dims"]))
    else:
        raise ValueError('model class not recognised')
    # Initialise model
    model = model_class(preds, links, pred_freq, verbose=False, **model_kwargs)

    # Set up gradient descent algorithm

    # Get hyperparameters
    setup_kwargs = sub_dict(kw, [
        "rate", "rate_ratio", "l2", "l2_ratio", "l2_ent", "l1", "l1_ratio",
        "l1_ent", "ent_steps", "pred_steps"
    ])
    # Choose training setup class
    if kw['setup'] == 'adagrad':
        setup_class = AdaGradTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["ada_decay"]))
    elif kw['setup'] == 'adam':
        setup_class = AdamTrainingSetup
        setup_kwargs.update(sub_dict(kw, ["mean_decay", "var_decay"]))
    else:
        raise ValueError('setup class not recognised')
    # Initialise training setup
    setup = setup_class(model, **setup_kwargs)

    # Set up trainer (without data)

    # Initialise particle
    particle = create_particle(*kw['particle'])
    # Initialise data interface
    interface = DataInterface(setup, particle, neg_samples=kw['neg_samples'])
    # Get hyperparameters
    trainer_kwargs = sub_dict(
        kw, ["processes", "epochs", "minibatch", "ent_burnin", "pred_burnin"])
    # Initialise trainer
    trainer = Trainer(interface,
                      data_dir=DATA,
                      output_name=OUTPUT,
                      **trainer_kwargs)

    return trainer