Example #1
def mdr_falt_business(qg_info, name):
    qg_set = []
    sanlei = []
    IsMatchingFault = None
    IsADRorAccident_tag = None
    qg_setdata = None  # avoid an UnboundLocalError at the return if qg_info is empty
    for SuperFaultName in qg_info:
        # NOTE: `name` is interpolated directly into the SQL string, which breaks
        # (and is injectable) if the name contains a quote; see the parameterized
        # sketch after this example.
        qg_sql = (
            "SELECT SuperClassName,subname FROM `mdr_deviceinstrument` where Name='%s' limit 1" % name
        )
        rows_qg = mdrsql.mdr_select(qg_sql)
        if rows_qg:
            IsMatchingFault = u'是'
            for row_qg_data in rows_qg:
                s_name_1 = row_qg_data[0]
                s_name_2 = row_qg_data[1]
                compose_data = s_name_1+':'+s_name_2+':'+name
            accident_sql = (
                "replace into mdr_fault(SuperClassName,SubName,Name,NonStandardName,IsDeviceMatching,RealFaultName) "
                "values(%s,%s,%s,%s,%s,%s)"
            )
            accident_data = (s_name_1, s_name_2, name, None, u'是', SuperFaultName)
            mdrsql.mdr_insert_alone(accident_sql, accident_data)
            
            qg_set.append(SuperFaultName)
            qg_setdata = utils.data_set(qg_set)
            UnMatchFault = ""
            IsADRorAccident_tag = "2"
            sanlei.append(s_name_1)
            sanlei.append(s_name_2)
            sanlei.append(name)
            
        else:
            accident_sql = (
                "replace into mdr_fault(SuperClassName,SubName,Name,NonStandardName,IsDeviceMatching,RealFaultName) "
                "values(%s,%s,%s,%s,%s,%s)"
            )
            accident_data = (None, None, None, name, u'否', SuperFaultName)
            mdrsql.mdr_insert_alone(accident_sql, accident_data)
            IsMatchingFault = u'否'  # no device match; mirrors the u'否' written to mdr_fault above
            UnMatchFault = utils.str_to_unicode(SuperFaultName)
            qg_set.append(UnMatchFault)
            qg_setdata = utils.data_set(qg_set)
            sanlei.append(name)
            
        # qg_data = (
        #     data["BianMa"], data["ProvinceName"], data["District"], data["County"],
        #     data["ReportUnitName"], data["ReportUnitAddress"], data["ReportUnitTel"], data["Postalcode"],
        #     data["UnitType"], data["HappenDate"], data["KnowDate"], data["ReportDate"],
        #     data["ReportDate"], data["StateReportDate"], data["State"],StandardFault,
        #     StandardFault, IsMatchingFault, SuperFaultName, UnMatchFault)
        # qg_sql = (
        #         "replace into mdr_faultbusiness(BianMa,ProvinceName,District,County,ReportUnitName,ReportUnitAddress,ReportUnitTel,Postalcode,UnitType ,HappenDate,KnowDate,ReportDate,AcceptDate,StateReportDate,State,StandardFault,Name,IsMatchingFault,SuperFaultName,UnMatchFault)"
        #         "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        # )
        # mdrsql.mdr_insert_alone(qg_sql, qg_data)

    return (IsMatchingFault, IsADRorAccident_tag, qg_setdata, sanlei)
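A note on example #1: the device lookup builds its SELECT by string formatting, as flagged in the comment above. Below is a minimal sketch of the same lookup as a parameterized query over a plain DB-API 2.0 connection; the conn object is hypothetical, and the project's own mdrsql.mdr_select wrapper may not accept parameters this way.

# Hedged sketch: parameterized device lookup over a DB-API 2.0 connection (hypothetical `conn`).
def lookup_device(conn, name):
    sql = (
        "SELECT SuperClassName, subname FROM mdr_deviceinstrument "
        "WHERE Name = %s LIMIT 1"
    )
    with conn.cursor() as cur:
        cur.execute(sql, (name,))  # the driver escapes `name`; no string formatting
        return cur.fetchone()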
Example #2
def read_input_data(data_dir):
    train_url = os.path.join(data_dir, 'out_x20.txt')
    train_url_y = os.path.join(data_dir, 'out_y20.txt')
    test_url = os.path.join(data_dir, 'out_x_test20.txt')
    test_url_y = os.path.join(data_dir, 'out_y_test20.txt')
    train_set_y = utils.data_set_y(train_url_y)
    test_set_y = utils.data_set_y(test_url_y)
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)

    train_set_y.extend(test_set_y)
    train_set.extend(test_set)
    train_count.extend(test_count)

    return train_set_y, train_set, train_count
Example #3
def prepare_data(bs, rate=0.95, sample=False):
    """Prepare the train-set and evaluation-set for the model.

    Parameters:
    ------------
    bs: batch_size
    rate: fraction of the labeled data used as the training set

    Returns:
    --------
    train_set, eval_set: DataLoader"""
    import numpy as np
    text_a, text_b, label = load_data("../source/train.txt")
    if sample:
        corpus_a, corpus_b, corpus_label = sample_corpus(
            "../source/corpus.txt")
        text_a.extend(corpus_a)
        text_b.extend(corpus_b)
        label.extend(corpus_label)
    data = data_set(text_a, text_b, label)
    nSamples = int(len(data) * rate)
    selected = np.random.choice(len(data), nSamples, replace=False)
    ind_train = np.zeros(len(data), dtype=bool)  # np.bool was removed in NumPy 1.24; use the builtin
    ind_train[selected] = True
    ind_eval = np.ones(len(data), dtype=bool)
    ind_eval[selected] = False

    train_set = DataLoader(TensorDataset(*data[ind_train]), bs, shuffle=True)
    eval_set = DataLoader(TensorDataset(*data[ind_eval]),
                          2 * bs,
                          shuffle=False)
    return train_set, eval_set
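A quick usage sketch for prepare_data above; the batch size is illustrative, and each batch is simply the tuple of tensors that data_set packed into the TensorDataset. Because selected is drawn without replacement, the two boolean masks are exact complements, so every sample lands in exactly one loader.

# Hedged usage sketch (batch size chosen arbitrarily; paths come from prepare_data's defaults).
train_set, eval_set = prepare_data(bs=32, rate=0.95)
for batch in train_set:      # tuple of tensors, e.g. (text_a, text_b, label)
    pass                     # a training step would go here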
Example #4
def test(sess, model, test_url, batch_size):
    test_set, test_count, _ = utils.data_set(test_url)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
            test_set, test_count, idx_batch, FLAGS.vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask}
        loss, kld = sess.run([model.objective, model.kld], input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum / len(test_batches)
    print('| Epoch test: {:d} |'.format(1),
          '| Perplexity: {:.9f}'.format(print_ppx),
          '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
          '| KLD: {:.5}'.format(print_kld))
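The reported numbers are perplexities: the corpus figure is exp(total loss in nats / total word count), while the per-document figure exponentiates the mean of per-document loss-to-length ratios. A tiny worked example of the corpus formula with made-up numbers:

# Worked example of the corpus perplexity computed above (numbers are invented).
import numpy as np

loss_sum = 1200.0                       # summed negative log-likelihood, in nats
word_count = 300                        # total words in the evaluated set
print(np.exp(loss_sum / word_count))    # exp(4.0) ≈ 54.598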
Example #5
    def predict(
        self,
        file="../XuChuanyi_NJU_predict.txt",
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        self.eval()
        text_a, text_b = load_data("../source/test.txt")
        text_a.append(torch.zeros(32, dtype=torch.int))
        text_b.append(torch.zeros(32, dtype=torch.int))
        test_set = data_set(text_a, text_b)

        text_a, text_b, *_ = test_set[:-1]
        text_a.squeeze_(-1)
        text_b.squeeze_(-1)
        text_a.unsqueeze_(1)
        text_b.unsqueeze_(1)
        x = torch.cat((text_a, text_b), dim=1).to(dtype=torch.float32,
                                                  device=device)
        with torch.no_grad():
            pred = self.forward(x)
        with open(file, 'w') as obj:
            for label in pred:
                label = 1 if label.item() > 0.5 else 0
                label = str(label) + '\n'
                obj.write(label)
        return
Example #6
def get_sh_info(sh_info, data):
    sh_set = []
    is_adr = None
    clinicdetail_Name = None
    clinicdetail_SubID = None
    clinicsub_ID = None
    clinicsub_Name = None
    clinic_ID = None
    clinic_NAME = None
    IsADRorAccident_tag = ""
    # initialize so the insert below does not raise UnboundLocalError when sh_info is empty
    adr_data_list = None
    _un_sh_info = ""
    
    for sh_info_item in sh_info:
        sh_query_sql = "SELECT clinicdetail.SubID, clinicdetail.Name,clinicsub.ID,clinicsub.Name,clinic.ID,clinic.NAME FROM clinicdetail, clinicsub,clinic WHERE clinicdetail.SubID=clinicsub.ID and clinicsub.PID=clinic.ID and clinicdetail.Name= '%s'  limit 1" %(sh_info_item)
        rows_sh = mdrsql.mdr_select(sh_query_sql)
        if rows_sh:
            is_adr = u'是'
            for row_sh_data in rows_sh:
                #sh_s_name = row_sh_data[0]
                clinicdetail_Name = row_sh_data[1]
                clinicdetail_SubID = row_sh_data[0]
                clinicsub_ID = row_sh_data[2]
                clinicsub_Name = row_sh_data[3]
                clinic_ID = row_sh_data[4]
                clinic_NAME = row_sh_data[5]

            sh_set.append(sh_info_item)
            adr_data_list = sh_info_item
            IsADRorAccident_tag = "1"
            _un_sh_info = ""
        else:
            is_adr = u'否'
            _un_sh_info = u"[非标准:" + sh_info_item + u"]"
            sh_set.append(_un_sh_info)
            adr_data_list = utils.data_set(sh_set)
            #sh_s_name = ""
            clinicdetail_Name = ""
            clinicdetail_SubID = ""
            clinicsub_ID = ""
            clinicsub_Name = ""
            clinic_ID = ""
            clinic_NAME = ""
            
    sh_data = [
        data["BianMa"], data["ProvinceName"], data["District"], data["County"],
        data["ReportUnitName"], data["ReportUnitAddress"], data["ReportUnitTel"], data["Postalcode"], 
        data["UnitType"], data["HappenDate"], data["KnowDate"], data["ReportDate"], 
        data["ReportDate"], data["StateReportDate"], data["State"], is_adr, 
        clinicdetail_Name, clinicdetail_Name, clinicdetail_SubID, clinicsub_ID, 
        clinicsub_Name, clinic_ID, clinic_NAME, _un_sh_info]

    sh_sql = (
        "replace into mdr_adrbusiness(BianMa,ProvinceName,District,County,ReportUnitName,ReportUnitAddress,ReportUnitTel,Postalcode,UnitType ,HappenDate,KnowDate,ReportDate,AcceptDate,StateReportDate,State,IsMatchingADR,ADRStandardID,Name,SID1,SubID,SubName,PID,PName,UnMatchADR)"
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    mdrsql.mdr_insert_alone(sh_sql, sh_data)
    
    return (is_adr, adr_data_list, IsADRorAccident_tag)
Example #7
def get_icd_info(data, icd_ok):
    icd_standname = ""
    icd_set = []
    IsMatchingAffect = None
    UnMatchAffect = None
    icd_setdata = None
    _un_tag_icd_1 = utils.str_to_unicode("[非标准:")
    _un_tag_icd_2 = utils.str_to_unicode("]")
    for item in icd_ok:
        #_item = item.strip(_trim_tag).strip()
        icd_sql = (
            "SELECT StandardIcdName,icd_a_name,icd_b_name,icd_c_name,PathName FROM `mdr_icd` where StandardIcdName='%s'  limit 1" %(item)
        )

        rows_icd = mdrsql.mdr_select(icd_sql)
        if rows_icd:
            IsMatchingAffect = u'是'
            UnMatchAffect = ""
            for icd_ok_info in rows_icd:
                icd_a_name = icd_ok_info[0]
                icd_b_name = icd_ok_info[1]
                AffectStandardName = icd_ok_info[2]
                PathName = icd_ok_info[3]

            icd_set.append(item)
            icd_setdata = item
            icd_standname = AffectStandardName
        else:
            IsMatchingAffect = u'否'
            un_icd_info = _un_tag_icd_1+item+_un_tag_icd_2
            _un_icd_info = utils.str_to_unicode(un_icd_info)
            UnMatchAffect = _un_icd_info
            icd_a_name = ""
            icd_b_name = ""
            AffectStandardName = ""
            PathName = ""
            icd_standname = AffectStandardName

            icd_set.append(UnMatchAffect)
            icd_setdata = utils.data_set(icd_set)
    
    icd_sql = (
        "replace into mdr_icdbusiness(BianMa,ProvinceName,District,County,ReportUnitName,ReportUnitAddress,ReportUnitTel,Postalcode,UnitType ,HappenDate,KnowDate,ReportDate,AcceptDate,StateReportDate,State,IsMatchingAffect,AffectStandardName,icd_a_name,icd_b_name,icd_c_name,PathName,UnMatchAffect)"
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    
    icd_data = [
        data["BianMa"], data["ProvinceName"], data["District"], data["County"],
        data["ReportUnitName"], data["ReportUnitAddress"], data["ReportUnitTel"], data["Postalcode"], 
        data["UnitType"], data["HappenDate"], data["KnowDate"], data["ReportDate"], 
        data["ReportDate"], data["StateReportDate"], data["State"],IsMatchingAffect, 
        AffectStandardName, icd_a_name, icd_b_name, AffectStandardName, 
        PathName, UnMatchAffect]
    mdrsql.mdr_insert_alone(icd_sql, icd_data)
    
    return (IsMatchingAffect, UnMatchAffect, icd_setdata)
Example #8
def train(nvdm, train_url, optimizer, batch_size=64, training_epochs=1000):
    train_set, train_count = utils.data_set(train_url)
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size)
        loss_sum = 0.0
        for idx_batch in train_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                train_set, train_count, idx_batch, 2000)
            data_batch = torch.FloatTensor(data_batch)
            mask = torch.FloatTensor(mask)
            loss = nvdm(data_batch, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()

        print(loss_sum / len(train_batches))
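Most of the NVDM-style loops on this page share one data contract: utils.data_set returns a bag-of-words per document plus its word count, and utils.fetch_data expands a batch of those bags into a dense batch_size x vocab_size matrix, the per-document counts, and a 0/1 mask for padded slots. A minimal sketch of that contract, assuming the common word_id:count (.feat) line format; the real utils module may differ in details.

# Hedged sketch of the data_set / fetch_data contract assumed by these training loops.
import numpy as np

def data_set(url):
    docs, counts = [], []
    with open(url) as f:
        for line in f:
            items = line.split()[1:]   # first field is assumed to be a label
            doc = {int(i): float(c) for i, c in (it.split(':') for it in items)}
            docs.append(doc)
            counts.append(sum(doc.values()))
    return docs, counts

def fetch_data(docs, counts, idx_batch, vocab_size):
    data_batch = np.zeros((len(idx_batch), vocab_size), dtype=np.float32)
    count_batch = []
    mask = np.zeros(len(idx_batch), dtype=np.float32)
    for row, doc_id in enumerate(idx_batch):
        if doc_id == -1:               # assumed padding index for a short final batch
            count_batch.append(0)
            continue
        for word_id, freq in docs[doc_id].items():
            data_batch[row, word_id] = freq
        count_batch.append(counts[doc_id])
        mask[row] = 1.0
    return data_batch, count_batch, mask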
Example #9
File: nvdm.py  Project: shshnk94/nvdm
def main(argv=None):

    if FLAGS.non_linearity == 'tanh':
      non_linearity = tf.nn.tanh
    elif FLAGS.non_linearity == 'sigmoid':
      non_linearity = tf.nn.sigmoid
    else:
      non_linearity = tf.nn.relu
    

    nvdm = NVDM(vocab_size=FLAGS.vocab_size,
                n_hidden=FLAGS.n_hidden,
                n_topic=FLAGS.n_topic, 
                n_sample=FLAGS.n_sample,
                learning_rate=FLAGS.learning_rate, 
                batch_size=FLAGS.batch_size,
                non_linearity=non_linearity)
    
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    sess = tf.Session(config=config)

    init = tf.initialize_all_variables()
    sess.run(init)

    train_url = os.path.join(FLAGS.data_dir, 'train.feat')
    
    if not FLAGS.test:
      train(sess, nvdm, train_url, FLAGS.batch_size, FLAGS.epochs)
    
    else:
      #Test

      saver = tf.train.Saver()
      saver.restore(sess, os.path.join(ckpt, 'model.ckpt'))
      print("Model restored.")
      
      #Training data
      train_set, train_count = utils.data_set(train_url)
      evaluate(nvdm, train_set, train_count, sess, 'test') 
Example #10
    net['pool4'] = PoolLayer(net['conv4_2'], pool_size=2, stride=2)

    net['fc3'] = DenseLayer(net['pool4'], num_units=256)
    net['fc4'] = DenseLayer(net['fc3'], num_units=30)
    net['prob'] = NonlinearityLayer(net['fc4'], nonlinearity=identity)
    return net


if __name__ == "__main__":
    # path to train and testing data
    PATH_train = "../data/training.csv"
    PATH_test = "../data/test.csv"
    # load data
    print 'loading data'
    data = data_set(path_train=PATH_train, path_test=PATH_test)
    #  drop the missing values
    print 'drop missing values'
    data.drop_missing_values()
    # center data VGG style
    print 'center alexnet'
    data.center_alexnet()
    # generate test validation split
    train_set_x, valid_set_x, train_set_y, valid_set_y = train_test_split(
        data.X, data.y, test_size=0.2, random_state=42)
    # change type and load to GPU
    print 'load data to gpu'
    train_set_x = train_set_x.reshape(-1, 1, 96,
                                      96).astype(theano.config.floatX)
    valid_set_x = valid_set_x.reshape(-1, 1, 96,
                                      96).astype(theano.config.floatX)
Example #11
def train(sess,
          model,
          train_url,
          test_url,
          dev_url,
          model_url,
          batch_size,
          saver,
          training_epochs=400,
          alternate_epochs=1):
    """train nvctm model."""
    train_set, train_count = utils.data_set(train_url)
    dev_set, dev_count = utils.data_set(dev_url)
    test_set, test_count = utils.data_set(test_url)

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    train_theta = []
    train_beta = []
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                res_sum = 0
                log_sum = 0
                mean_sum = 0
                var_sum = 0
                m = None
                Um = None
                enc = None

                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld, mean, Umean, enc, rec_loss, log_s, mean_s,
                        vk_show, theta, beta, lp, v) = sess.run((optim, [
                            model.objective, model.kld, model.mean, model.U,
                            model.vk, model.recons_loss, model.log_squre,
                            model.mean_squre, model.vk_show, model.theta,
                            model.beta, model.log_prob, model.variance
                        ]), input_feed)
                    m = mean
                    Um = Umean
                    # print('*********************vk show', vk_show)
                    # print('Umean', Umean[0])
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    res_sum += np.sum(rec_loss)
                    log_sum += np.sum(log_s)
                    mean_sum += np.sum(mean_s)
                    var_sum += np.sum(v) / np.sum(mask)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)

                    if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1:
                        train_theta.extend(theta)
                        train_beta.extend(beta)

                print_ppx = np.exp(loss_sum / word_count)
                # print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print_res = res_sum / len(train_batches)
                print_log = log_sum / len(train_batches)
                print_mean = mean_sum / len(train_batches)
                print_var = var_sum / len(train_batches)

                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity per word
                    # '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld),
                    '| stddev {:.5}'.format(print_var),
                    '| res_loss: {:5}'.format(print_res),
                    '| log_loss: {:5}'.format(print_log),
                    '| mean_loss: {:5}'.format(print_mean))

                with codecs.open('./nvctm_train_theta', 'wb') as fp:
                    pickle.dump(np.array(train_theta), fp)
                fp.close()

                if (epoch + 1
                    ) % 50 == 0 and switch == 1 and i == alternate_epochs - 1:
                    with codecs.open('./nvctm_train_beta', 'wb') as fp:
                        pickle.dump(beta, fp)
                    fp.close()
                    npmi.print_coherence('nvctm',
                                         FLAGS.data_dir + '/train.feat',
                                         FLAGS.vocab_size)

        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        var_sum = 0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld, v = sess.run(
                [model.objective, model.kld, model.variance], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            var_sum += np.sum(v) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_var = var_sum / len(train_batches)
        # print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('\n| Epoch dev: {:d}'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| stddev {:.5}'.format(print_var),
              '| KLD: {:.5}'.format(print_kld))

        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            var_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld, v = sess.run(
                    [model.objective, model.kld, model.variance], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                var_sum += np.sum(v) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_var = var_sum / len(train_batches)
            # print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d}'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| stddev {:.5}'.format(print_var),
                  '| KLD: {:.5}\n'.format(print_kld))
    npmi.print_coherence('nvctm', FLAGS.data_dir + '/train.feat',
                         FLAGS.vocab_size)
    saver.save(sess, model_url)
Example #12
#                    
#    temp = ConvLayer(net, num_filters=n_f, filter_size=3, stride=1, pad=1, nonlinearity=identity, flip_filters=False )
#    temp = ConvLayer(temp, num_filters=n_f, filter_size=1, stride=1, pad=0, nonlinearity=identity, flip_filters=False )
#    
#    
#    return net
                
if __name__ == "__main__":

    # path to train and testing data
    PATH_train = "../data/training.csv"
    PATH_test = "../data/test.csv"

    # load data
    print 'loading data \n'
    data = data_set(path_train=PATH_train, path_test=PATH_test)

    print 'sobel stacking image'
    data.stack_origi_sobel()

    # augmentation
    # data.augment()

    # center data
    # print 'center alexnet \n'
    # data.center_alexnet()
    # print 'center Xs VGG Style, X doesnt have missing values \n'
    # data.center_VGG()

    # generate test validation split
    data.split_trainval()
Example #13
File: nvdm.py  Project: shshnk94/nvdm
def train(sess, model, train_url, batch_size, training_epochs=1000, alternate_epochs=10):

  train_set, train_count = utils.data_set(train_url)

  summaries = None  # get_summaries(sess)
  writer = None  # tf.summary.FileWriter(ckpt + '/logs/', sess.graph)
  saver = tf.train.Saver()

  sess.graph.finalize()
 
  total_mem = 0
  mem = 0
 
  for epoch in range(training_epochs):

    train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)

    for switch in range(0, 2):

      if switch == 0:
        optim = model.optim_dec
        print_mode = 'updating decoder'
      else:
        optim = model.optim_enc
        print_mode = 'updating encoder'

      for i in range(alternate_epochs):

        loss_sum = 0.0
        ppx_sum = 0.0
        kld_sum = 0.0
        word_count = 0
        doc_count = 0

        for idx_batch in train_batches:

          data_batch, count_batch, mask = utils.fetch_data(train_set, train_count, idx_batch, FLAGS.vocab_size)
          input_feed = {model.x.name: data_batch, model.mask.name: mask}
          _, (loss, kld) = sess.run((optim, [model.objective, model.kld]), input_feed)

          #loss, kld = tf.cast(loss, tf.float64), tf.cast(kld, tf.float64)
          loss_sum += np.sum(loss)
          kld_sum += np.sum(kld) / np.sum(mask)  
          word_count += np.sum(count_batch)
          # to avoid nan error
          count_batch = np.add(count_batch, 1e-12)
          # per document loss
          ppx_sum += np.sum(np.divide(loss, count_batch)) 
          doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum/len(train_batches)
        print('| Epoch train: {:d} |'.format(epoch+1), 
               print_mode, '{:d}'.format(i),
               '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
               '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
               '| KLD: {:.5}'.format(print_kld))
        
    evaluate(model, train_set, train_count, sess, 'val', (loss_sum + kld_sum), epoch, summaries, writer, saver)

    current_mem = process.memory_info().rss / (1024 ** 2)
    total_mem += (current_mem - mem)
    print("Memory increase: {}, Cumulative memory: {}, and current {} in MB".format(current_mem - mem, total_mem, current_mem))
    mem = current_mem
    gc.collect()
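The memory bookkeeping in example #13 reads process.memory_info().rss, but process itself is never defined in the snippet. The call matches psutil's API, so presumably something like the following sits at module level (an assumption, not shown in the source):

# Hedged sketch: module-level setup assumed by the memory reporting above.
import os
import psutil

process = psutil.Process(os.getpid())   # process.memory_info().rss is the resident set size in bytes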
Example #14
def train_cnn():
    import time

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    BS = 64
    LR = 0.001
    EPOCH = 30
    data = data_set(*load_data())
    boundary = 200000
    train_set = DataLoader(TensorDataset(*data[:boundary]), BS, shuffle=True)
    eval_set = DataLoader(TensorDataset(*data[boundary:]),
                          2 * BS,
                          shuffle=False)
    cnn = CNN().to(device=device)
    loss_fn = nn.BCELoss()
    optim = torch.optim.Adam(cnn.parameters(), lr=LR)
    begin = time.time()
    ACC_train, ACC = [], []
    for epoch in range(EPOCH):
        cnn.train()
        acc_train = 0
        for i, (a, b, y, *_) in enumerate(train_set):
            a.squeeze_(-1)
            b.squeeze_(-1)
            a.unsqueeze_(1)
            b.unsqueeze_(1)
            x = torch.cat((a, b), dim=1).to(dtype=torch.float32, device=device)
            y = y.to(dtype=torch.float32, device=device)

            optim.zero_grad()
            y_pre = cnn(x)
            loss = loss_fn(y_pre, y)
            loss.backward()
            optim.step()
            y_pre[y_pre > 0.5] = 1
            y_pre[y_pre <= 0.5] = 0
            acc_train += len(torch.nonzero(y == y_pre))

            if (i + 1) % 500 == 0 or (i + 1) == len(train_set):
                time_cost = int(time.time() - begin)
                print('Time cost so far: {}h {}min {}s'.format(
                    time_cost // 3600, time_cost % 3600 // 60,
                    time_cost % 3600 % 60 // 1))
                print("Epoch[{}/{}], Step [{}/{}], Loss {:.4f}".format(
                    epoch + 1, EPOCH, i + 1, len(train_set), loss.item()))
        acc_train /= len(train_set.dataset)
        ACC_train.append(acc_train)

        cnn.eval()
        with torch.no_grad():
            eval_loss, acc = 0, 0
            for i, (a, b, y, *_) in enumerate(eval_set):
                a.squeeze_(-1)
                b.squeeze_(-1)
                a.unsqueeze_(1)
                b.unsqueeze_(1)
                x = torch.cat((a, b), dim=1).to(dtype=torch.float32,
                                                device=device)
                y = y.to(dtype=torch.float32, device=device)

                y_pre = cnn(x)
                eval_loss += loss_fn(y_pre, y)
                y_pre[y_pre > 0.5] = 1
                y_pre[y_pre <= 0.5] = 0
                acc += len(torch.nonzero(y == y_pre))
            eval_loss /= len(eval_set)
            acc /= len(eval_set.dataset)
            ACC.append(acc)
            time_cost = int(time.time() - begin)
            print('\nTime cost so far: {}h {}min {}s'.format(
                time_cost // 3600, time_cost % 3600 // 60,
                time_cost % 3600 % 60 // 1))
            print('Evaluation set: loss: {:.4f}, acc: {:.4f}\n'.format(
                eval_loss, acc))
            if acc > 0.8:
                cnn.predict(device=device)
    return cnn, ACC_train, ACC
Example #15
def train(sess, model, 
          train_url, 
          test_url, 
          batch_size, 
          vocab_size,
          training_epochs=200, 
          alternate_epochs=1,  # 10
          lexicon=[],
          result_file='test.txt',
          B=1,
          warm_up_period=100):
  """train nvdm model."""
  train_set, train_count = utils.data_set(train_url)
  test_set, test_count = utils.data_set(test_url)
  # hold-out development dataset
  train_size=len(train_set)
  validation_size=int(train_size*0.1)
  dev_set = train_set[:validation_size]
  dev_count = train_count[:validation_size]
  train_set = train_set[validation_size:]
  train_count = train_count[validation_size:]
  print('sizes',train_size,validation_size,len(dev_set),len(train_set))
  optimize_jointly = True
  dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
  test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

  warm_up = 0
  start_min_alpha = 0.00001
  min_alpha = start_min_alpha
  warm_up_alpha=False
  start_B=4
  curr_B=B
  
  #for early stopping
  best_print_ana_ppx=1e10
  early_stopping_iters=30
  no_improvement_iters=0
  stopped=False
  epoch=-1
  #for epoch in range(training_epochs):
  while not stopped:
    epoch+=1
    train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
    if warm_up<1.:
      warm_up += 1./warm_up_period
    else:
      warm_up=1.
   
    # train
    #for switch in range(0, 2):
    if optimize_jointly:
      optim = model.optim_all
      print_mode = 'updating encoder and decoder'
    elif switch == 0:
      optim = model.optim_dec
      print_mode = 'updating decoder'
    else:
      optim = model.optim_enc
      print_mode = 'updating encoder'
    for i in range(alternate_epochs):
      loss_sum = 0.0
      ana_loss_sum = 0.0
      ppx_sum = 0.0
      kld_sum = 0.0
      ana_kld_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum=0.0
      for idx_batch in train_batches:
        data_batch, count_batch, mask = utils.fetch_data(
        train_set, train_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 0.75,model.warm_up.name: warm_up,model.min_alpha.name:min_alpha,model.B.name: curr_B}
        _, (loss,recon, kld,ana_loss,ana_kld) = sess.run((optim, 
                                    [model.true_objective, model.recons_loss, model.kld,model.analytical_objective,model.analytical_kld]),
                                    input_feed)
        loss_sum += np.sum(loss)
        ana_loss_sum += np.sum(ana_loss)
        kld_sum += np.sum(kld) / np.sum(mask) 
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        # to avoid nan error
        count_batch = np.add(count_batch, 1e-12)
        # per document loss
        ppx_sum += np.sum(np.divide(loss, count_batch)) 
        doc_count += np.sum(mask)
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(train_batches)
      dec_vars = utils.variable_parser(tf.trainable_variables(), 'decoder')
      phi = dec_vars[0]
      phi = sess.run(phi)
      utils.print_top_words(phi, lexicon,result_file=None)
      print_ppx = np.exp(loss_sum / word_count)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(train_batches)
      print_ana_kld = ana_kld_sum/len(train_batches)
      

      print('| Epoch train: {:d} |'.format(epoch+1), 
               print_mode, '{:d}'.format(i),
               '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
               '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
               '| KLD: {:.5}'.format(print_kld),
               '| Loss: {:.5}'.format(print_loss),
               '| ppx anal.: {:.5f}'.format(print_ana_ppx),
               '|KLD anal.: {:.5f}'.format(print_ana_kld))
    if warm_up_alpha:
      if min_alpha>0.0001:
        min_alpha-=(start_min_alpha-0.0001)/training_epochs
    #-------------------------------
    # dev
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    recon_sum=0.0
    print_ana_ppx = 0.0
    ana_loss_sum = 0.0
    for idx_batch in dev_batches:
      data_batch, count_batch, mask = utils.fetch_data(
          dev_set, dev_count, idx_batch, vocab_size)
      input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B}#,model.B.name: B
      loss,recon, kld,ana_loss = sess.run([model.objective, model.recons_loss, model.analytical_kld,model.analytical_objective],
                           input_feed)
      loss_sum += np.sum(loss)
      ana_loss_sum += np.sum(ana_loss)
      kld_sum += np.sum(kld) / np.sum(mask)  
      word_count += np.sum(count_batch)
      count_batch = np.add(count_batch, 1e-12)
      ppx_sum += np.sum(np.divide(loss, count_batch))
      doc_count += np.sum(mask) 
      recon_sum+=np.sum(recon)
    print_ana_ppx = np.exp(ana_loss_sum / word_count)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum/len(dev_batches)
    print_loss = recon_sum/len(dev_batches)
    if print_ana_ppx<best_print_ana_ppx:
      no_improvement_iters=0
      best_print_ana_ppx=print_ana_ppx
      #check on validation set, if ppx better-> save improved model
      tf.train.Saver().save(sess, 'models/improved_model_bernoulli') 
    else:
      no_improvement_iters+=1
      print('no_improvement_iters',no_improvement_iters,'best ppx',best_print_ana_ppx)
      if no_improvement_iters>=early_stopping_iters:
          #if model has not improved for 30 iterations, stop training
          ###########STOP TRAINING############
          stopped=True
          print('stop training after',epoch,'iterations,no_improvement_iters',no_improvement_iters)
          ###########LOAD BEST MODEL##########
          print('load stored model')
          tf.train.Saver().restore(sess,'models/improved_model_bernoulli')
          
    print('| Epoch dev: {:d} |'.format(epoch+1), 
           '| Perplexity: {:.9f}'.format(print_ppx),
           '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
           '| KLD: {:.5}'.format(print_kld)  ,
           '| Loss: {:.5}'.format(print_loss))  
    #-------------------------------
    # test
    #if epoch%10==0 or epoch==training_epochs-1:
    if FLAGS.test:
      #if epoch==training_epochs-1:
      if stopped:
        #only do it once in the end
        coherence=utils.topic_coherence(test_set,phi, lexicon)
        print('topic coherence',str(coherence))
      loss_sum = 0.0
      kld_sum = 0.0
      ppx_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum = 0.0
      ana_loss_sum = 0.0
      ana_kld_sum = 0.0
      for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
          test_set, test_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B}
        loss, recon,kld,ana_loss,ana_kld = sess.run([model.objective, model.recons_loss,model.kld,model.analytical_objective,model.analytical_kld],
                             input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld)/np.sum(mask) 
        ana_loss_sum += np.sum(ana_loss)
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask) 
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(test_batches)
      print_ppx = np.exp(loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(test_batches)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      print_ana_kld = ana_kld_sum/len(train_batches)
      print('| Epoch test: {:d} |'.format(epoch+1), 
             '| Perplexity: {:.9f}'.format(print_ppx),
             '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
             '| KLD: {:.5}'.format(print_kld),
             '| Loss: {:.5}'.format(print_loss),
             '| ppx anal.: {:.5f}'.format(print_ana_ppx),
               '|KLD anal.: {:.5f}'.format(print_ana_kld)) 
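Two training tricks are easy to miss inside example #15's long loop: a KL warm-up coefficient that ramps linearly from 0 to 1 over warm_up_period epochs, and patience-based early stopping on the validation analytical perplexity (stop after 30 epochs without improvement, then restore the best checkpoint). Both are isolated below in a small, hypothetical sketch with illustrative names.

# Hedged sketch of the warm-up schedule and patience logic used in example #15.
def warm_up_coeff(epoch, warm_up_period=100):
    """Linear KL annealing: ramps 0 -> 1 over warm_up_period epochs, then stays at 1."""
    return min(1.0, (epoch + 1) / float(warm_up_period))

class EarlyStopping:
    def __init__(self, patience=30):
        self.best = float('inf')
        self.patience = patience
        self.bad_epochs = 0

    def step(self, val_metric):
        """Return True once val_metric has not improved for `patience` consecutive epochs."""
        if val_metric < self.best:
            self.best = val_metric
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs >= self.patience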
Example #16
def train(sess,
          model,
          train_url,
          test_url,
          batch_size,
          FLAGS,
          train_csv_filename,
          dev_csv_filename,
          test_csv_filename,
          training_epochs=1000,
          alternate_epochs=10,
          is_restore=False):
    """train nvdm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)
    #save model
    saver = tf.train.Saver()

    if is_restore:
        saver.restore(sess, "./checkpoints/model.ckpt")

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        #-------------------------------
        # train
        for switch in xrange(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in xrange(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld) = sess.run(
                        (optim, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)

                with open(train_csv_filename, 'a') as train_csv:
                    train_writer = csv.writer(train_csv,
                                              delimiter=',',
                                              quotechar='|',
                                              quoting=csv.QUOTE_MINIMAL)
                    train_writer.writerow([
                        epoch + 1, print_mode, i, print_ppx, print_ppx_perdoc,
                        print_kld
                    ])

                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity for all docs
                    '| Per doc ppx: {:.5f}'.format(
                        print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)

        with open(dev_csv_filename, 'a') as dev_csv:
            dev_writer = csv.writer(dev_csv,
                                    delimiter=',',
                                    quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
            dev_writer.writerow(
                [epoch + 1, print_ppx, print_ppx_perdoc, print_kld])

        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)

            with open(test_csv_filename, 'a') as test_csv:
                test_writer = csv.writer(test_csv,
                                         delimiter=',',
                                         quotechar='|',
                                         quoting=csv.QUOTE_MINIMAL)
                test_writer.writerow(
                    [epoch + 1, print_ppx, print_ppx_perdoc, print_kld])

            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
Example #17
def train(sess,
          model,
          train_url,
          test_url,
          batch_size,
          training_epochs=1000,
          alternate_epochs=10):
    """train gsm model."""
    # train_set: 维度为1 x vocab_size,每一维是对应的词出现次数, train_count: 训练集的总词数
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset, 选取前50篇文档
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    for epoch in range(training_epochs):
        # create batches of size batch_size
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optimize = model.optimize_dec
                print_mode = 'updating decoder'
            elif switch == 1:
                optimize = model.optimize_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                # train on each batch
                for idx_batch in train_batches:
                    '''
                    data_batch: word-frequency vectors of the current batch, batch_size * vocab_size
                    count_batch: number of words in each document of the current batch
                    train_set: training set
                    train_count: training-set word counts
                    idx_batch: indices of the current batch
                    mask: used for alignment when a batch contains fewer documents than batch_size
                    '''
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    # input: x = data_batch, mask = mask
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    # return: loss = objective, kld = kld, optimizer = optimize
                    # the three entries above form the feed_dict, mapping model tensors to concrete values
                    _, (loss, kld) = sess.run(
                        (optimize, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    # total word count
                    word_count += np.sum(count_batch)
                    # to avoid nan error (avoid a zero denominator)
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i + 1),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity for all docs
                    '| Per doc ppx: {:.5f}'.format(
                        print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld))
        # -------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        # -------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
Example #18
def read_input_data_tweets(data_dir, path_x, path_y):
    data_url = os.path.join(data_dir, path_x)
    data_url_y = os.path.join(data_dir, path_y)
    data_set_y = utils.data_set_y(data_url_y)
    data_set, data_count = utils.data_set(data_url)
    return data_set_y, data_set, data_count
Example #19
def serve():
    train_test_url = opj(args.data_dir, 'train_test.feat')
    entity_map_url = opj(args.data_dir, 'entity.map')
    feat_map_url = opj(args.data_dir, 'vocab.new')
    entity_sent_url = opj(args.data_dir, 'entities.sentences')
    guid2name = {}
    guid2id = {}
    id2guid = {}
    guid2sent = {}
    # The train_test.feat file contains some entities, such as number 1997,
    # that have no features; their feature line is blank.
    # These entities were removed while training the neural network architecture.
    # Therefore, to map the embeddings in NVGE back to the KB we need to use this
    # alignment information. This information is not necessary for BS, because BS
    # can easily handle the fact that some entities have no features (i.e. the
    # document is empty).
    data_set, data_count, alignment = utils.data_set(train_test_url)
    for idx, row in enumerate(
            codecs.open(entity_map_url, 'r', 'utf-8').read().split('\n')):
        if row == '': continue
        dbid, canonical = row.split('\t')
        guid2name[dbid] = canonical
        if idx in alignment:
            guid2id[dbid] = alignment[idx]
            id2guid[alignment[idx]] = dbid

    GUID2SENT_PKL_FILE = opj(args.data_dir, os.path.pardir, 'guid2sent.pkl')
    try:
        print 'Loading', GUID2SENT_PKL_FILE
        guid2sent = pkl.load(open(GUID2SENT_PKL_FILE))
    except:
        print 'Could not find', GUID2SENT_PKL_FILE
        concrete_entity_files = os.listdir(args.concrete_entity_dir)
        for commidx, filename in enumerate(concrete_entity_files):
            print '%-5d\r' % ((commidx * 100) / len(concrete_entity_files)),
            comm = read_communication_from_file(
                opj(args.concrete_entity_dir, filename))
            guid = comm.id
            for sent in comm.sectionList[0].sentenceList:
                uuid = sent.uuid.uuidString
                tokens = [
                    e.text for e in sent.tokenization.tokenList.tokenList
                ]
                try:
                    guid2sent[guid].append((uuid, tokens))
                except KeyError:
                    guid2sent[guid] = [(uuid, tokens)]
        with open(GUID2SENT_PKL_FILE, 'wb') as gpf:
            print 'Dumping', GUID2SENT_PKL_FILE
            pkl.dump(guid2sent, gpf)

    # for row in codecs.open(entity_sent_url, 'r', 'utf-8').read().split('\n'):
    #     row = row.split(' ||| ')
    #     guid = row[0]
    #     for sent in row[1:]:
    #         tokens = sent.split()
    #         try:
    #             guid2sent[guid].append(tokens)
    #         except KeyError:
    #             guid2sent[guid] = [tokens]
    id2feat_data = codecs.open(feat_map_url, 'r', 'utf-8').read().split('\n')
    id2feat = dict((((sum(1 for e in id2feat_data if e != '') -
                      1) if idx == 0 else (idx - 1)), row.split()[0])
                   for idx, row in enumerate(id2feat_data) if row != '')
    print('Checking feature size =',
          len(data_set[guid2id[":Entity_ENG_EDL_0092354"]]), 'for',
          guid2name[":Entity_ENG_EDL_0092354"], 'max(id2feat.values())',
          max(id2feat.keys()))

    def load(args):
        import cPickle as pkl
        with open(opj(args.data_dir, args.model_pkl), 'rb') as f:
            nnp = pkl.load(f)
        return nnp

    handler = EntitySearchProvider(
        args.language,
        NVBS(data_set=data_set,
             nnp=load(args),
             method=getattr(NVBSALGO, args.algorithm),
             opts=args,
             id2guid=id2guid,
             guid2id=guid2id,
             guid2name=guid2name,
             guid2sent=guid2sent,
             id2feat=id2feat), args.k_query, args.k_rationale)
    server = SearchServiceWrapper(handler)
    if args.serve:
        print('Starting NVBS Server')
        server.serve(args.host, args.port)
    else:
        return handler.index
Example #20
def train(
    train_url,
    test_url,
    model_url,
    vocab_url,
    non_linearity,
    embedding_url,
    training_epochs,
    alternate_epochs,
    vocab_size,
    embedding_size,
    n_hidden,
    n_topic,
    n_sample,
    learning_rate,
    batch_size,
    is_training,
    mix_num,
):
    """train crntm model."""

    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    vocab = utils.get_vocab(vocab_url)
    embedding_table = utils.load_embedding(
        embedding_url, embedding_size, vocab,
        FLAGS.data_dir + '/vocab_embedding-{}.pkl'.format(embedding_size))

    # hold-out development dataset
    dev_count = test_count[:50]
    dev_onehot_set = test_set[:50]
    dev_batches = utils.create_batches(len(dev_onehot_set),
                                       batch_size,
                                       shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    # create model
    crntm = CRNTM(vocab_size=vocab_size,
                  embedding_size=embedding_size,
                  n_hidden=n_hidden,
                  n_topic=n_topic,
                  n_sample=n_sample,
                  learning_rate=learning_rate,
                  batch_size=batch_size,
                  non_linearity=non_linearity,
                  embedding_table=embedding_table,
                  is_training=is_training,
                  mix_num=mix_num)
    crntm.construct_model()

    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)
    model = crntm
    saver = tf.train.Saver()

    #
    # if RESTORE:
    #     return embedding_table[1:]

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        #-------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                res_sum = 0
                log_sum = 0
                r_sum = 0
                log_s = None
                r_loss = None
                g_loss = None
                for bn, idx_batch in enumerate(train_batches):
                    data_onehot_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)

                    input_feed = {
                        model.x_onehot.name: data_onehot_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld, rec_loss, log_s, r_loss, g_loss) = sess.run(
                        (optim, [
                            model.objective, model.kld, model.recons_loss,
                            model.logits, model.doc_vec, model.topic_word_prob
                        ]), input_feed)

                    # if switch==0:
                    # #     # print(bn, len(train_batches), mask.sum(), r_loss.shape)
                    #     print('ptheta', log_s)
                    #     print('doc_Vec', r_loss)
                    #     print('topic_prob', g_loss)

                    res_sum += np.sum(rec_loss)
                    log_sum += np.sum(log_s)
                    loss_sum += np.sum(loss)
                    r_sum += np.sum(r_loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    # print(np.sum(np.divide(loss, count_batch)))
                    doc_count += np.sum(mask)
                    # if doc_count>11264:
                    #   print('debug:: ', doc_count, rec_loss, kld, loss[-1], count_batch[-1])
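                # Corpus-level perplexity: exp(total loss / total token count);
                # per-document perplexity: exp(mean over documents of loss_d / N_d).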
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print_res = res_sum / len(train_batches)
                print_log = log_sum / len(train_batches)
                print_mean = r_sum / len(train_batches)
                message = '| Epoch train: {:d} | {} {:d} | Corpus ppx: {:.5f}::{} | Per doc ppx: {:.5f}::{} | KLD: {:.5} | res_loss: {:5} | log_loss: {:5} | r_loss: {:5}'.format(
                    epoch + 1,
                    print_mode,
                    i,
                    print_ppx,
                    word_count,
                    print_ppx_perdoc,
                    doc_count,
                    print_kld,
                    print_res,
                    print_log,
                    print_mean,
                )
                print(message)
                write_result(message)
        TopicWords(sess, vocab_url, embedding_table[1:])

        #-------------------------------
        # dev
        loss_sum = 0.0
        ppx_sum = 0.0
        kld_sum = 0.0
        word_count = 0
        doc_count = 0
        res_sum = 0
        log_sum = 0
        mean_sum = 0
        r_sum = 0
        for idx_batch in dev_batches:
            data_onehot_batch, count_batch, mask = utils.fetch_data(
                dev_onehot_set, dev_count, idx_batch, FLAGS.vocab_size)

            input_feed = {
                model.x_onehot.name: data_onehot_batch,
                model.mask.name: mask
            }
            loss, kld, rec_loss, log_s, r_loss = sess.run([
                model.objective, model.kld, model.recons_loss,
                model.embedding_loss, model.res_loss
            ], input_feed)

            res_sum += np.sum(rec_loss)
            log_sum += np.sum(log_s)
            loss_sum += np.sum(loss)
            r_sum += np.sum(r_loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            # to avoid nan error
            count_batch = np.add(count_batch, 1e-12)
            # per document loss
            ppx_sum += np.sum(np.divide(loss, count_batch))
            # print(np.sum(np.divide(loss, count_batch)))
            doc_count += np.sum(mask)
            # if doc_count>11264:
            #   print('debug:: ', doc_count, rec_loss, kld, loss[-1], count_batch[-1])
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        # print_ppx_perdoc = ppx_sum / doc_count
        # print(loss_sum, word_count)
        # normalise by the number of dev batches (not train batches)
        print_kld = kld_sum / len(dev_batches)
        print_res = res_sum / len(dev_batches)
        print_log = log_sum / len(dev_batches)
        print_mean = r_sum / len(dev_batches)
        message = '| Epoch dev: {:d} | Corpus ppx: {:.5f}::{} | Per doc ppx: {:.5f}::{} | KLD: {:.5} | res_loss: {:5} | log_loss: {:5} | r_loss: {:5}'.format(
            epoch + 1,
            print_ppx,
            word_count,
            print_ppx_perdoc,
            doc_count,
            print_kld,
            print_res,
            print_log,
            print_mean,
        )
        print(message)
        write_result(message)

        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_onehot_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {
                    model.x_onehot.name: data_onehot_batch,
                    model.mask.name: mask
                }
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            message = '| Epoch test: {:d} | Corpus ppx: {:.5f} | Per doc ppx: {:.5f} | KLD: {:.5} '.format(
                epoch + 1,
                print_ppx,
                print_ppx_perdoc,
                print_kld,
            )
            print(message)
            write_result(message)

    saver.save(sess, model_url)
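
# A tiny standalone illustration (not part of the original code) of the two
# perplexity figures reported above: the corpus-level value exponentiates the
# total loss over the total token count, while the per-document value averages
# loss_d / N_d over documents before exponentiating.
import numpy as np

doc_loss = np.array([40.0, 90.0])  # negative log-likelihood per document
doc_len = np.array([10.0, 30.0])   # token count per document
corpus_ppx = np.exp(doc_loss.sum() / doc_len.sum())  # exp(130 / 40)
per_doc_ppx = np.exp(np.mean(doc_loss / doc_len))    # exp(mean([4.0, 3.0]))
print(corpus_ppx, per_doc_ppx)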
示例#21
0
def train(sess,
          model,
          train_url,
          test_url,
          batch_size,
          training_epochs=1000,
          alternate_epochs=10):
    """train nvdm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # hold-out development dataset
    dev_set = test_set[:50]
    dev_count = test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        #-------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    _, (loss, kld) = sess.run(
                        (optim, [model.objective, model.kld]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)
                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity for all docs
                    '| Per doc ppx: {:.5f}'.format(
                        print_ppx_perdoc),  # perplexity for per doc
                    '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            loss, kld = sess.run([model.objective, model.kld], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))
        #-------------------------------
        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            for idx_batch in test_batches:
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask}
                loss, kld = sess.run([model.objective, model.kld], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
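
# A minimal sketch (an assumption, not the project's actual utils module) of the
# create_batches / fetch_data interface as it appears to be used above:
# create_batches returns lists of document indices padded with -1 so every batch
# has batch_size entries, and fetch_data turns a batch of bag-of-words documents
# into a dense count matrix, per-document token counts, and a 0/1 padding mask.
import random
import numpy as np

def create_batches_sketch(data_size, batch_size, shuffle=False):
    ids = list(range(data_size))
    if shuffle:
        random.shuffle(ids)
    batches = []
    for start in range(0, data_size, batch_size):
        batch = ids[start:start + batch_size]
        batch += [-1] * (batch_size - len(batch))  # pad the final batch
        batches.append(batch)
    return batches

def fetch_data_sketch(data_set, count, idx_batch, vocab_size):
    """data_set: list of {word_id: frequency} dicts; count: per-document token totals."""
    data_batch = np.zeros((len(idx_batch), vocab_size))
    count_batch = []
    mask = np.zeros(len(idx_batch))
    for i, doc_id in enumerate(idx_batch):
        if doc_id == -1:  # padding entry
            count_batch.append(0)
            continue
        for word_id, freq in data_set[doc_id].items():
            data_batch[i, word_id] = freq
        count_batch.append(count[doc_id])
        mask[i] = 1.0
    return data_batch, count_batch, mask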
示例#22
0
def train(sess, model, train_url, test_url, dev_url, batch_size, training_epochs=1000, alternate_epochs=1):
    """train gsm model."""
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    dev_set, dev_count = utils.data_set(dev_url)

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

    kld_list = []
    var_list = []
    train_theta = []
    train_beta = []
    test_theta = []
    test_beta = []
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optimize = model.optimize_dec
                print_mode = 'updating decoder'
            elif switch == 1:
                optimize = model.optimize_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                var_sum = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)

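                    # model.gamma is fed a linear ramp epoch / training_epochs
                    # (0 in the first epoch, approaching 1 near the end), and is
                    # set back to 0 for the dev and test feeds below.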
                    input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: True, model.gamma.name: epoch/training_epochs}
                    _, (loss, kld, v, theta, beta) =\
                        sess.run((optimize, [model.reconstruction_loss, model.kld, model.variance, model.topic_dist, model.beta]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    var_sum += np.sum(v) / np.sum(mask)
                    # print([np.max(theta[i]) for i in range(batch_size)])
                    # print([np.argmax(theta[i]) for i in range(batch_size)])
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)

                    if epoch == training_epochs - 1 and switch == 1 and i == alternate_epochs - 1:
                        train_theta.extend(theta)
                        train_beta.extend(beta)

                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print_var = var_sum / len(train_batches)
                kld_list.append(print_kld)
                var_list.append(print_var)
                print('| Epoch train: {:d}'.format(epoch + 1),
                      print_mode, '{:d}'.format(i + 1),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                      '| KLD: {:.5}'.format(print_kld),
                      '| stddev {:.5}'.format(print_var))

                with open('./gsm_train_theta', 'wb') as fp:
                    pickle.dump(np.array(train_theta), fp)

                if (epoch + 1) % 50 == 0 and switch == 1 and i == alternate_epochs - 1:
                    with open('./gsm_train_beta', 'wb') as fp:
                        pickle.dump(beta, fp)
                    npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size)

        # -------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        var_sum = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0}
            loss, kld, v = sess.run([model.objective, model.kld, model.variance], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            var_sum += np.sum(v) / np.sum(mask)
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        print_var = var_sum / len(dev_batches)
        print('\n| Epoch dev: {:d}'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld),
              '| stddev: {:.5}'.format(print_var))

        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            var_sum = 0
            for idx, idx_batch in enumerate(test_batches):
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0}
                loss, kld, theta, beta, v = sess.run([model.objective, model.kld, model.topic_dist, model.beta, model.variance], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
                var_sum += np.sum(v) / np.sum(mask)
                test_theta.extend(theta)
                if idx == len(test_batches) - 1:
                    test_beta.extend(beta)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print_var = var_sum / len(test_batches)
            print('| Epoch test: {:d}'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld),
                  '| stddev: {:.5}\n'.format(print_var))

    npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size)

    with open('./test_theta', 'wb') as fp:
        pickle.dump(test_theta, fp)

    with open('./test_beta', 'wb') as fp:
        pickle.dump(test_beta, fp)

    with codecs.open('./kld.txt', 'w', 'utf-8') as fp:
        fp.write(', '.join(str(kld) for kld in kld_list))

    with codecs.open('./var.txt', 'w', 'utf-8') as fp:
        fp.write(', '.join(str(var) for var in var_list))
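
# The coherence reported by npmi.print_coherence above is presumably NPMI-based
# topic coherence. A generic, self-contained illustration (not the project's
# implementation):
#   NPMI(w_i, w_j) = log( p(w_i, w_j) / (p(w_i) * p(w_j)) ) / ( -log p(w_i, w_j) ),
# averaged over the top-word pairs of a topic.
import itertools
import numpy as np

def npmi_coherence_sketch(doc_word, top_words, eps=1e-12):
    """doc_word: (n_docs x vocab) binary presence matrix; top_words: word ids of one topic."""
    p = doc_word.mean(axis=0)  # marginal document frequency of each word
    scores = []
    for wi, wj in itertools.combinations(top_words, 2):
        p_ij = np.mean(doc_word[:, wi] * doc_word[:, wj]) + eps
        scores.append(np.log(p_ij / (p[wi] * p[wj] + eps)) / (-np.log(p_ij)))
    return float(np.mean(scores))

# Toy example: 4 documents over a 5-word vocabulary, scoring topic words {0, 1, 4}.
docs = np.array([[1, 1, 0, 0, 1],
                 [1, 1, 0, 1, 0],
                 [0, 0, 1, 1, 0],
                 [1, 1, 1, 0, 0]])
print(npmi_coherence_sketch(docs, top_words=[0, 1, 4]))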
示例#23
0
File: main.py  Project: mxiny/NB-NTM
def main():
    # select the model variant: NBNTM or GNBNTM
    if model == 'NBNTM':
        net = NBNTM.NBNTM(device, vocab_num, hidden_num, topic_num,
                          shape_prior, scale_prior)
    else:
        net = GNBNTM.GNBNTM(device, vocab_num, hidden_num, topic_num,
                            shape_prior, scale_prior)
    net = net.to(device)
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    # load data
    data_dir = 'data/' + data_name + '/'
    train_list, train_mat, train_count = utils.data_set(
        data_dir + 'train.feat', vocab_num)
    test_list, test_mat, test_count = utils.data_set(data_dir + 'test.feat',
                                                     vocab_num)

    # auxiliary dir setting
    for sub_dir in ('NBNTM', 'GNBNTM'):
        os.makedirs('./result/' + sub_dir, exist_ok=True)
        os.makedirs('./checkpoint/' + sub_dir, exist_ok=True)

    flag_str = (data_name + '_shape_' + str(shape_prior) + '_scale_' +
                str(scale_prior) + '_K_' + str(topic_num) + '_V_' +
                str(vocab_num) + '_H_' + str(hidden_num) + '_batch_' +
                str(batch_size) + '_lr_' + str(learning_rate) + '_epoch_' +
                str(epochs))
    result_dir = './result/' + model + '/' + flag_str
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)

    # record in file
    train_ppl_time = []
    best_train_ppl = 1e12
    best_coherence = -1

    start_time = time.time()
    addition_time = 0
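    # addition_time accumulates time spent on periodic evaluation/checkpointing,
    # so it can be subtracted from the reported training time below.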
    for epoch in range(epochs):
        # train
        perplexity, kld = run(net, optimizer, train_list, train_count, True)
        current_time_cost = time.time() - start_time
        train_ppl_time.append([perplexity.detach().item(), current_time_cost])
        print_result(epoch, 'train', perplexity, kld)

        temp_time = time.time()
        # every 10th epoch: evaluate coherence and checkpoint the best-perplexity / best-coherence models
        if epoch % 10 == 9:
            if perplexity < best_train_ppl:
                best_train_ppl = perplexity
                # save model
                state = {
                    'net': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epochs': epoch
                }
                torch.save(
                    state,
                    './checkpoint/' + model + '/' + flag_str + '_best_ppl')

            # coherence
            coherence = evaluate_coherence(net, train_mat, [5])
            print('train coherence = ', coherence)

            if coherence > best_coherence:
                best_coherence = coherence
                # save model
                state = {
                    'net': net.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epochs': epoch
                }
                torch.save(
                    state, './checkpoint/' + model + '/' + flag_str +
                    '_best_coherence')
        addition_time += time.time() - temp_time

    end_time = time.time()
    print(f'time cost:{end_time - start_time - addition_time}')

    record_result(result_dir + '/train_ppl_time_record', train_ppl_time)

    # test perplexity
    checkpoint = torch.load('./checkpoint/' + model + '/' + flag_str +
                            '_best_ppl',
                            map_location=device)
    net.load_state_dict(checkpoint['net'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    epoch = checkpoint['epochs']
    perplexity, kld = run(net, optimizer, test_list, test_count, False)
    print_result(epoch, 'test', perplexity, kld)

    # test coherence
    checkpoint = torch.load('./checkpoint/' + model + '/' + flag_str +
                            '_best_coherence',
                            map_location=device)
    net.load_state_dict(checkpoint['net'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    print(
        'whole coherence = ',
        evaluate_coherence(net, np.concatenate((train_mat, test_mat)),
                           [5, 10, 15]))

    # save topic words
    utils.print_topic_word('data/' + data_name + '/' + data_name + '.vocab',
                           model + '_topic_words.txt',
                           net.out_fc.weight.detach().cpu().t(), 15)
示例#24
0
File: nvdm.py  Project: shshnk94/nvdm
def evaluate(model, training_data, training_count, session, step, train_loss=None, epoch=None, summaries=None, writer=None, saver=None):

  #Get theta for the H1.
  data_url = os.path.join(FLAGS.data_dir, 'valid_h1.feat' if step != 'test' else 'test_h1.feat')
  dataset, dataset_count = utils.data_set(data_url)
  data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False)
   
  theta = []
  for idx_batch in data_batches:

    data_batch, count_batch, mask = utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)
    input_feed = {model.x.name: data_batch, model.mask.name: mask}

    logit_theta = session.run(model.doc_vec, input_feed)
    theta.append(softmax(logit_theta, axis=1)) 

  theta = np.concatenate(theta, axis=0)

  weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Matrix:0')[0].eval(session)
  bias = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Bias:0')[0].eval(session)
  beta = softmax(weights + bias, axis=1)
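  # beta: topic-word distributions recovered from the decoder projection; each row
  # (one per topic, see the top-word loop below) is soft-maxed over the vocabulary.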

  #H2 to calculate perplexity.
  data_url = os.path.join(FLAGS.data_dir, 'valid_h2.feat' if step != 'test' else 'test_h2.feat')
  dataset, dataset_count = utils.data_set(data_url)
  data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False)

  test_data = [utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)[0] for idx_batch in data_batches]
  test_data = np.concatenate(test_data, axis=0)

  perplexity = get_perplexity(test_data, theta, beta)
  coherence = get_topic_coherence(beta, training_data, 'nvdm') if step == 'test' else np.nan
  diversity = get_topic_diversity(beta, 'nvdm') if step == 'test' else np.nan
    
  if step == 'val':

    #tloss = tf.get_default_graph().get_tensor_by_name('tloss:0') 
    #vppl = tf.get_default_graph().get_tensor_by_name('vppl:0') 

    #weight_summaries = session.run(summaries, feed_dict={tloss: train_loss, vppl: perplexity})
    #weight_summaries = summaries.eval(session=session)
    #writer.add_summary(weight_summaries, epoch)
    save_path = saver.save(session, os.path.join(ckpt, 'model.ckpt'))

    print("Model saved in path: %s" % ckpt)
    print('| Epoch dev: {:d} |'.format(epoch+1)) 

  else:
    
    ## get most used topics
    cnt = 0
    thetaWeightedAvg = np.zeros((1, FLAGS.n_topic))
    data_batches = utils.create_batches(len(training_data), FLAGS.batch_size, shuffle=False)

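    # Token-weighted average of theta over the training data: each document's
    # topic proportions are weighted by its token count, then normalised by the
    # total token count to rank the most-used topics.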
    for idx_batch in data_batches:

        batch, count_batch, mask = utils.fetch_data(training_data, training_count, idx_batch, FLAGS.vocab_size)
        sums = batch.sum(axis=1)
        cnt += sums.sum(axis=0)

        input_feed = {model.x.name: batch, model.mask.name: mask}
        logit_theta = session.run(model.doc_vec, input_feed)
        theta = softmax(logit_theta, axis=1)
        weighted_theta = (theta.T * sums).T
        thetaWeightedAvg += weighted_theta.sum(axis=0)

    thetaWeightedAvg = thetaWeightedAvg.squeeze() / cnt
    print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10]))

    with open(FLAGS.data_dir + '/vocab.new', 'rb') as f:
      vocab = pkl.load(f)

    topic_indices = list(np.random.choice(FLAGS.n_topic, 10)) # 10 random topics
    print('\n')

    with open(ckpt + '/topics.txt', 'w') as f:
      for k in range(FLAGS.n_topic):
        gamma = beta[k]
        top_words = list(gamma.argsort()[-FLAGS.n_words+1:][::-1])
        topic_words = [vocab[a] for a in top_words]
        f.write(str(k) + ' ' + str(topic_words) + '\n')
        print('Topic {}: {}'.format(k, topic_words))

  with open(ckpt + '/' + step + '_scores.csv', 'a') as handle:
    handle.write(str(perplexity) + ',' + str(coherence) + ',' + str(diversity) + '\n')