Example #1
import torch

# load_amazon, HAN, nce_loss, get_embeddings and evaluate_amazon are
# project-local helpers in the original script.
def train_amazon():
    num_walks = 5
    walk_length = 5
    window_size = 5
    neighbor_samples = 5
    hidden_units = 32
    num_heads = [8]
    dropout = 0.5
    out_size = 200
    lr = 0.1
    weight_decay = 0.001
    epochs = 5

    g, features, train_pairs, neg_neighbors, vocab, edge_data_by_type = load_amazon(
        num_walks, neighbor_samples, window_size)
    features = torch.FloatTensor(features)
    print("特征的shape是:", features.shape)

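    # Build one homogeneous metapath per edge type by repeating it
    # walk_length times.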
    metapaths = []
    for edge_type in edge_data_by_type.keys():
        metapaths.append([edge_type] * walk_length)
    model = HAN(meta_paths=metapaths,
                in_size=features.shape[1],
                hidden_size=hidden_units,
                out_size=out_size,
                num_heads=num_heads,
                dropout=dropout)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr,
                                 weight_decay=weight_decay)

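    # Unsupervised training: NCE loss over walk co-occurrence pairs and
    # sampled negatives.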
    for epoch in range(epochs):
        model.train()
        _, h = model(g, features)
        loss = nce_loss(h, train_pairs, neg_neighbors)
        print(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    _, h = model(g, features)
    embeddings = get_embeddings(h, list(vocab))
    evaluate_amazon(embeddings)
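
nce_loss is a project-local helper. Below is a minimal sketch of what such a noise-contrastive objective typically computes, assuming h holds node embeddings, train_pairs holds (source, context) index pairs, and neg_neighbors holds one row of sampled negative ids per pair (all names and shapes here are assumptions):

import torch
import torch.nn.functional as F

def nce_loss_sketch(h, train_pairs, neg_neighbors):
    # h: (num_nodes, dim); train_pairs: list of (src, ctx) node ids;
    # neg_neighbors: (num_pairs, num_negatives) negative node ids.
    src = torch.tensor([p[0] for p in train_pairs])
    ctx = torch.tensor([p[1] for p in train_pairs])
    neg = torch.tensor(neg_neighbors)
    pos_score = (h[src] * h[ctx]).sum(dim=1)
    neg_score = torch.bmm(h[neg], h[src].unsqueeze(2)).squeeze(2)
    # Pull true pairs together, push sampled negatives apart.
    return -F.logsigmoid(pos_score).mean() - F.logsigmoid(-neg_score).mean()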
Example #2
import os

import pandas as pd
from hyperopt import STATUS_OK
from sklearn.metrics import classification_report
from tensorflow.keras.optimizers import Adam  # assuming a tf.keras setup

def create_model(params):
    # Hyperopt objective: logger, embedding_matrix, the X_*/y_* splits,
    # results_dir, out_file and Nepochs are module-level globals.
    global i
    i += 1
    logger.info('Model ' + str(i) + ' training')

    l1 = params['l1']
    l2 = params['l2']
    dropout = params['dropout']
    lr = params['lr']

    han_model = HAN(
        MAX_WORDS_PER_SENT,
        MAX_SENT,
        1,  # output size
        embedding_matrix,
        word_encoding_dim,
        sentence_encoding_dim,
        l1,
        l2,
        dropout)

    han_model.summary()

    opt = Adam(learning_rate=lr)

    # Alternative compile with the custom recall/F1/F2 metrics:
    # han_model.compile(optimizer=opt, loss='binary_crossentropy',
    #                   metrics=['acc', rec_scorer, f1_scorer, f2_scorer])
    han_model.compile(optimizer=opt,
                      loss='binary_crossentropy',
                      metrics=['acc'])

    # Optional early stopping / checkpointing:
    # es = EarlyStopping(monitor='val_loss', mode='min', patience=5, verbose=1)
    # mc = ModelCheckpoint('best_HAN_' + str(i) + '.h5', monitor='val_acc',
    #                      mode='max', verbose=1, save_best_only=True)
    han_model.fit(X_train,
                  y_train,
                  validation_split=0,
                  batch_size=8,
                  epochs=Nepochs)  # callbacks=[es, mc] if enabled
    ### metrics
    scores1 = han_model.evaluate(X_val, y_val, verbose=0)

    y_pred_num = han_model.predict(X_val)
    y_pred_bin = (y_pred_num > 0.5) * 1

    # Weighted-average precision/recall/F1 from sklearn, plus Keras loss/accuracy.
    scores = classification_report(y_val, y_pred_bin, output_dict=True)
    scores = scores['weighted avg']
    scores['loss'] = scores1[0]
    scores['accuracy'] = scores1[1]
    f1 = scores['f1-score']
    loss = scores['loss']

    df_scores = pd.DataFrame([scores])
    df_params = pd.DataFrame.from_dict([params])
    df_new = df_scores.join(df_params)

    # Append this run to the running results file (create it on the first run);
    # DataFrame.append is deprecated, so use pd.concat.
    results_path = os.path.join(results_dir, out_file + 'all.csv')
    try:
        df_results = pd.read_csv(results_path)
        df_results = pd.concat([df_results, df_new], ignore_index=True)
    except FileNotFoundError:
        df_results = df_new
    df_results.to_csv(results_path, index=False)

    # Save the model when this run ties the best F1 recorded so far.
    if f1 >= df_results['f1-score'].max():
        print("Save model")
        han_model.save(os.path.join(results_dir, out_file + '_model.hd5'))

    return {'loss': loss, 'params': params, 'status': STATUS_OK}
Example #3
import tensorflow as tf
import pandas as pd
from hyperopt import STATUS_OK
from tensorflow.keras import optimizers

# DataGenerator, embedding_matrix, the X_*/y_* splits, results_dir and
# out_file are module-level globals in the original script.
def create_model(params):
    global i
    i += 1
    l2 = params['l2']
    l1 = params['l1']
    dropout = params['dropout']
    lr = params['lr']

    han_model = HAN(params['MAX_WORDS_PER_SENT'], params['MAX_SENT'], 1,
                    embedding_matrix, params['word_encoding_dim'],
                    params['sentence_encoding_dim'], l1, l2, dropout)
    han_model.summary()
    optimizer = optimizers.Adam(lr)
    han_model.compile(loss='mean_squared_error',
                      metrics=['accuracy'],
                      optimizer=optimizer)

    my_callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=results_dir + out_file +
            'model.{epoch:02d}-{val_loss:.2f}.h5')
    ]

    train_generator = DataGenerator(X_train,
                                    y_train,
                                    batch_size=params['Nbatches'])
    val_generator = DataGenerator(X_val, y_val, batch_size=params['Nbatches'])

    # fit_generator is deprecated; Model.fit accepts generators directly.
    han_model.fit(train_generator,
                  validation_data=val_generator,
                  use_multiprocessing=True,
                  epochs=params["Nepochs"],
                  workers=6,
                  callbacks=my_callbacks)

    scores1 = han_model.evaluate(X_test, y_test, verbose=0)
    print(scores1)
    y_pred_num = han_model.predict(X_test)
    d = {'loss': [scores1[0]], 'acc': [scores1[1]]}
    df_scores = pd.DataFrame(data=d)
    print(df_scores)
    df_params = pd.DataFrame.from_dict([params])
    df_new = df_params.join(df_scores)

    # Append to the running results file (create it on the first run).
    results_path = results_dir + out_file + 'results_all2.csv'
    try:
        df_results = pd.read_csv(results_path)
        df_results = pd.concat([df_results, df_new], ignore_index=True)
    except FileNotFoundError:
        df_results = df_new
    df_results.to_csv(results_path, index=False)
    han_model.save(results_dir + out_file + str(i) + '.hd5')
    return {
        'loss': scores1[0],
        'acc': scores1[1],
        'params': params,
        'status': STATUS_OK
    }
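
DataGenerator above is defined elsewhere in the project. A minimal keras.utils.Sequence-style sketch of the batching it likely performs (class name and behavior are assumptions):

import math
import numpy as np
import tensorflow as tf

class DataGeneratorSketch(tf.keras.utils.Sequence):
    # Yields (X, y) batches; shuffling and preprocessing are omitted.
    def __init__(self, X, y, batch_size=32):
        self.X, self.y, self.batch_size = X, y, batch_size

    def __len__(self):
        return math.ceil(len(self.X) / self.batch_size)  # batches per epoch

    def __getitem__(self, idx):
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return np.asarray(self.X[sl]), np.asarray(self.y[sl])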
Example #4
def create_model(params):
    # Same hyperopt objective as Example #2, but compiled with the custom
    # metrics so the best model can be selected by validation F1.
    global i
    i += 1
    logger.info('Model ' + str(i) + ' training')

    l1 = params['l1']
    l2 = params['l2']
    dropout = params['dropout']
    lr = params['lr']

    han_model = HAN(
        MAX_WORDS_PER_SENT,
        MAX_SENT,
        1,  # output size
        embedding_matrix,
        word_encoding_dim,
        sentence_encoding_dim,
        l1,
        l2,
        dropout)

    han_model.summary()

    opt = Adam(learning_rate=lr)

    han_model.compile(optimizer=opt,
                      loss='binary_crossentropy',
                      metrics=['acc', rec_scorer, f1_scorer, f2_scorer])

    han_model.fit(X_train,
                  y_train,
                  validation_split=0,
                  batch_size=8,
                  epochs=Nepochs)

    # evaluate returns [loss, acc, recall, f1, f2], matching the compile metrics.
    scores = han_model.evaluate(X_val, y_val, verbose=0)
    f1 = scores[3]
    loss = scores[0]

    # build and save results and parameters
    df_scores = pd.DataFrame([scores],
                             columns=('loss', 'accuracy', 'recall', 'f1',
                                      'f2'))

    df_params = pd.DataFrame.from_dict([params])

    df_new = df_scores.join(df_params)

    results_path = os.path.join(results_dir, out_file + 'all.csv')
    try:
        df_results = pd.read_csv(results_path)
        df_results = pd.concat([df_results, df_new], ignore_index=True)
    except FileNotFoundError:
        df_results = df_new
    df_results.to_csv(results_path, index=False)

    # Save the model when this run ties the best F1 recorded so far.
    if f1 >= df_results['f1'].max():
        print("Save model")
        han_model.save(os.path.join(results_dir, out_file + '_model.hd5'))

    return {'loss': loss, 'params': params, 'status': STATUS_OK}
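
rec_scorer, f1_scorer and f2_scorer are custom metrics defined elsewhere in the project. A typical Keras-backend F1 metric looks roughly like this (a sketch, not the project's code):

from tensorflow.keras import backend as K

def f1_scorer_sketch(y_true, y_pred):
    y_pred = K.round(K.clip(y_pred, 0, 1))
    tp = K.sum(y_true * y_pred)
    precision = tp / (K.sum(y_pred) + K.epsilon())
    recall = tp / (K.sum(y_true) + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())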
Example #5
import torch

# load_dblp, HAN, EarlyStopping, score, evaluate, get_embeddings and
# load_dblp_labels are project-local helpers in the original script.
def main(args):
    # If args['hetero'] is True, g is a heterogeneous graph;
    # otherwise, a list of homogeneous graphs, one per metapath.
    g, features, labels, num_classes, train_mask, val_mask, test_mask, node_list = load_dblp()

    features = features.to(args['device'])
    labels = labels.to(args['device'])
    train_mask = train_mask.to(args['device'])
    val_mask = val_mask.to(args['device'])
    test_mask = test_mask.to(args['device'])

    print(features.shape)
    print("finish loading data")

    if args['hetero']:
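        # Metapaths for DBLP: APA (author-paper-author) and
        # APCPA (author-paper-conference-paper-author).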
        model = HAN(meta_paths=[['ap', 'pa'], ['ap', 'pc', 'cp', 'pa']],
                    in_size=features.shape[1],
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],
                    dropout=args['dropout']).to(args['device'])
    else:
        model = HAN(num_meta_paths=len(g),
                    in_size=features.shape[1],
                    hidden_size=args['hidden_units'],
                    out_size=num_classes,
                    num_heads=args['num_heads'],
                    dropout=args['dropout']).to(args['device'])

    stopper = EarlyStopping(patience=args['patience'])
    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args['lr'],
                                 weight_decay=args['weight_decay'])

    for epoch in range(args['num_epochs']):
        model.train()
        _, logits = model(g, features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_acc, train_micro_f1, train_macro_f1 = score(
            logits[train_mask], labels[train_mask])
        val_loss, val_acc, val_micro_f1, val_macro_f1 = evaluate(
            model, g, features, labels, val_mask, loss_fcn)
        early_stop = stopper.step(val_loss.data.item(), val_acc, model)

        print(
            'Epoch {:d} | Train Loss {:.4f} | Train Micro f1 {:.4f} | Train Macro f1 {:.4f} | '
            'Val Loss {:.4f} | Val Micro f1 {:.4f} | Val Macro f1 {:.4f}'.
            format(epoch + 1, loss.item(), train_micro_f1, train_macro_f1,
                   val_loss.item(), val_micro_f1, val_macro_f1))

        if early_stop:
            break

    stopper.load_checkpoint(model)
    test_loss, test_acc, test_micro_f1, test_macro_f1 = evaluate(
        model, g, features, labels, test_mask, loss_fcn)
    print('Test loss {:.4f} | Test Micro f1 {:.4f} | Test Macro f1 {:.4f}'.
          format(test_loss.item(), test_micro_f1, test_macro_f1))

    h, _ = model(g, features)
    print(h.shape)

    embeddings = get_embeddings(h, node_list)
    labels = load_dblp_labels()
    X = list(labels.keys())
    Y = list(labels.values())

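    # Node-classification evaluation at several labeled-data ratios.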
    for p in [0.2, 0.4, 0.6, 0.8]:
        evaluate_embeddings(p, embeddings, X, Y)
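
score and evaluate above come from the surrounding project. score typically derives accuracy and micro/macro F1 from logits; a sketch under that assumption:

import torch
from sklearn.metrics import f1_score

def score_sketch(logits, labels):
    prediction = torch.argmax(logits, dim=1).cpu().numpy()
    labels = labels.cpu().numpy()
    accuracy = (prediction == labels).mean()
    micro_f1 = f1_score(labels, prediction, average='micro')
    macro_f1 = f1_score(labels, prediction, average='macro')
    return accuracy, micro_f1, macro_f1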
Example #6
# Assuming a tf.keras setup; HAN is project-local, and max_features,
# max_sents, max_sen_len, embedding_dims, batch_size and epochs are
# defined earlier in the original script.
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x #sentence x #word)...')
x_train = sequence.pad_sequences(x_train, maxlen=max_sents * max_sen_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_sents * max_sen_len)
x_train = x_train.reshape((len(x_train), max_sents, max_sen_len))
x_test = x_test.reshape((len(x_test), max_sents, max_sen_len))
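# Each sample is now a (max_sents, max_sen_len) matrix of word ids: the
# sentence/word hierarchy that HAN attends over.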
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = HAN(max_sents, max_sen_len, max_features, embedding_dims).get_model()

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# tf.keras logs metrics=['accuracy'] as 'val_accuracy' on validation data.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(x_train,
          y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))
Example #7
import numpy
import torch
import torch.nn as nn
from keras.preprocessing import sequence  # pad_sequences

# vocab_size, embed_size, sen_hidden_num, class_num, dataset, HAN and the
# helper functions below are assumed defined earlier in the original script.
con_hidden_num = 250
lr = 0.01
embed_y_size = 180
train = dataset.get_train()
dev = dataset.get_val()
test = dataset.get_test()
train_x, train_y = train[0], train[1]
dev_x, dev_y = dev[0], dev[1]
test_x, test_y = test[0], test[1]

max_con_len = get_train_max_con_len(train_y)
max_sen_len = get_train_max_sen(train_x)
model = HAN(vocab_size, embed_size, sen_hidden_num, con_hidden_num, class_num,
            embed_y_size)
loss_function = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # optimize all model parameters

for epoch in range(20):
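    # Each training item is one conversation: a batch of sentences (conv1_x)
    # with one label per sentence (conv1_y).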
    for conv1_x, conv1_y in zip(train_x, train_y):
        conv1_sen_len = compute_sen_len_in_con(conv1_x)
        conv1_x = sequence.pad_sequences(conv1_x,
                                         maxlen=None,
                                         dtype='int32',
                                         padding='post',
                                         value=0)
        tag_scores = model(torch.from_numpy(conv1_x).long(), conv1_sen_len)
        conv1_y = torch.from_numpy(numpy.array(conv1_y)).long()
        loss = loss_function(tag_scores, conv1_y)

        # Backpropagate and update model parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()