Example #1
def interactiveGRUTraining(
    trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1',
    wordsEmbeddings=None,
    wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures',
    typesEmbeddings=None,
    typesEmbeddings_path='',
    word_dimension=22,
    type_dimension=20,
    dimension=64,
    attention_dimension=12,
    wordsSize=1000000,
    subpaths_map=None,
    subpaths_file=main_dir + 'facebook/subpathsSaveFile',
    sequences_map=None,
    sequences_file='',
    maxlen_subpaths=1000,
    maxlen=100,  # Sequences longer than this are ignored
    batch_size=1,
    is_shuffle_for_batch=False,
    alpha=0.1,
    beta=0.1,
    gamma=0.1,
    objective_function_method='hinge-loss',
    objective_function_param=0,
    lrate=0.0001,
    max_epochs=10,
    dispFreq=5,
    saveFreq=5,
    saveto=main_dir + 'facebook/path2vec-modelParams.npz',
    decay=0.01,
):
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            print 'No path for wordsEmbeddings, exit...'
            exit(0)
    if typesEmbeddings is None:
        if typesEmbeddings_path is not None:
            typesEmbeddings, type_dimension, wordsSize = dataProcessTools.getTypesEmbeddings(
                typesEmbeddings_path)
        else:
            print 'No path for typesEmbeddings, exit...'
            exit(0)

    trainingData, trainingPairsData = dataProcessTools.getTrainingData(
        trainingDataFile)
    allBatches = dataProcessTools.get_minibatches_idx(len(trainingData),
                                                      batch_size,
                                                      is_shuffle_for_batch)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    params = init_sharedVariables(model_options)
    tparams = init_tparams(params)
    print 'Generate models ......'
    trainingParis, sequences_matrix, dependency_matrix, dependWeight_matrix, sequencesLen_vector, discountSeq_matrix, discountForEachNode_matrix, wordsEmbs, typesEmbs, masks_matrix, groups_tensor, cost = interactiveGRULearningBatch.interactiveGRULearning(
        model_options, tparams)

    print 'Generate gradients ......'
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    print 'Using Adadelta to generate functions ......'
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = adadelta(
        lr, tparams, grads, trainingParis, sequences_matrix, dependency_matrix,
        dependWeight_matrix, sequencesLen_vector, discountSeq_matrix,
        discountForEachNode_matrix, wordsEmbs, typesEmbs, masks_matrix,
        groups_tensor, cost)

    print 'Start training models ......'
    best_p = None
    history_cost = []

    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(time.time()))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairsData[i] for i in batch]
            trainingParis_data, sequences_matrix_data, dependency_matrix_data, dependWeight_matrix_data, sequencesLen_vector_data, discountSeq_matrix_data, discountForEachNode_matrix_data, masks_matrix_data, groups_tensor_data = dataProcessTools.prepareDataForTrainingBatch(
                trainingDataForBatch, trainingPairsForBatch, sequences_data,
                alpha, beta, gamma)
            if len(trainingParis_data) == 0:
                continue
            cost = f_grad_shared(
                trainingParis_data, sequences_matrix_data,
                dependency_matrix_data, dependWeight_matrix_data,
                sequencesLen_vector_data, discountSeq_matrix_data,
                discountForEachNode_matrix_data, wordsEmbeddings,
                typesEmbeddings, masks_matrix_data, groups_tensor_data)
            f_update(lrate)
            if numpy.isnan(cost) or numpy.isinf(cost):
                print('bad cost detected: ', cost)
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ',  Update =', uidx, ',  Cost =', cost
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print 'Saving... time ==', time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print('Done')
    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(end_time))
    print 'Training finished! Elapsed time == ', end_time - start_time, ' s'
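All of the examples on this page iterate `for _, batch in allBatches` once per epoch, so `dataProcessTools.get_minibatches_idx` has to return a reusable list of (index, batch-of-indices) pairs rather than a one-shot iterator. Below is a minimal sketch of such a helper, assuming it follows the usual Theano LSTM tutorial convention; the project's actual implementation may differ.

import numpy

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """Split range(n) into mini-batches of indices (sketch of the assumed helper)."""
    idx_list = numpy.arange(n, dtype="int32")
    if shuffle:
        numpy.random.shuffle(idx_list)

    minibatches = []
    start = 0
    for _ in range(n // minibatch_size):
        minibatches.append(idx_list[start:start + minibatch_size])
        start += minibatch_size
    if start != n:  # keep the remainder as a smaller final batch
        minibatches.append(idx_list[start:])

    # return a list (not a generator) so the training loop can replay it every epoch
    return list(zip(range(len(minibatches)), minibatches))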
Example #2
def proxEmbedBySubgraphs(
    trainingDataFile=main_dir + 'train_classmate',
    wordsEmbeddings_data=None,
    wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures',
    subpaths_map=None,
    subpaths_file=main_dir + 'facebook/subpathsSaveFile',
    subgraphSaveFile='',
    maxlen_subpaths=1000,
    wordsSize=1000000,
    maxlen=100,
    batch_size=1,
    is_shuffle_for_batch=False,
    dispFreq=5,
    saveFreq=5,
    saveto=main_dir + 'facebook/path2vec-modelParams.npz',
    lrate=0.0001,
    word_dimension=22,
    dimension=64,
    discount_alpha=0.3,
    discount_beta=0.3,
    h_output_method='max-pooling',
    objective_function_method='hinge-loss',
    objective_function_param=0,
    max_epochs=10,
    decay=0.01,
):
    model_options = locals().copy()

    if wordsEmbeddings_data is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings_data, word_dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            exit(0)
    trainingData, trainingPairs_data = dataProcessTools.getTrainingData(
        trainingDataFile)
    allBatches = dataProcessTools.get_minibatches_idx(len(trainingData),
                                                      batch_size,
                                                      is_shuffle_for_batch)

    subgraphs = dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(
        subgraphSaveFile)

    params = init_sharedVariables(model_options)
    tparams = init_tparams(params)
    print 'Generate models ......'

    trainingPairs, sequences, masks, lengths, subgraph_lens, wordsEmbeddings, buffer_tensor, nodesLens, cost = proxEmbedBySubgraphModel.proxEmbedBySubgraphModel(
        model_options, tparams)

    print 'Generate gradients ......'
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    print 'Using Adadelta to generate functions ......'
    this_time = time.time()
    print 'Start to compile and optimize, time ==', time.strftime(
        '%Y-%m-%d %H:%M:%S', time.localtime(this_time))
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = adadelta(lr, tparams, grads, trainingPairs,
                                       sequences, masks, lengths,
                                       subgraph_lens, wordsEmbeddings,
                                       buffer_tensor, nodesLens, cost)

    print 'Start training models ......'
    best_p = None
    history_cost = []

    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(start_time))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairs_data[i] for i in batch]
            tuples3DMatrix_data, x_data, mask_data, lens_data, subgraph_lens_data, buffer_tensor_data, nodesLens_data = dataProcessTools.generateSequenceAndMasksForSingleSequenceWithLength(
                trainingDataForBatch, trainingPairsForBatch, subgraphs,
                dimension)
            cost = f_grad_shared(tuples3DMatrix_data, x_data, mask_data,
                                 lens_data, subgraph_lens_data,
                                 wordsEmbeddings_data, buffer_tensor_data,
                                 nodesLens_data)
            f_update(lrate)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('bad cost detected: ', cost)
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ',  Update =', uidx, ',  Cost =', cost
                this_time = time.time()
                print 'Time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(this_time))
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print('Saving...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print('Done')
        gc.collect()

    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(end_time))
    print 'Training finished! Elapsed time == ', end_time - start_time, ' s'
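The examples call `init_tparams` when building the model and `unzip(tparams)` when saving. A minimal sketch of these two helpers, assuming the common Theano pattern of mirroring a dict of numpy arrays with shared variables; the names match the call sites above, but the bodies are an assumption, not the project's verbatim code.

from collections import OrderedDict
import theano

def init_tparams(params):
    # wrap each numpy parameter array in a Theano shared variable of the same name
    tparams = OrderedDict()
    for name, value in params.items():
        tparams[name] = theano.shared(value, name=name)
    return tparams

def unzip(zipped):
    # copy the current values back out of the shared variables;
    # this is the dict that gets passed to numpy.savez(saveto, ...) above
    new_params = OrderedDict()
    for name, shared_var in zipped.items():
        new_params[name] = shared_var.get_value()
    return new_params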
Example #3
def proxEmbedTraining(
    trainingDataFile=main_dir +
    'facebook.splits/train.10/train_classmate_1',  # the full path of training data file
    wordsEmbeddings=None,  # words embeddings
    wordsEmbeddings_path=main_dir +
    'facebook/nodesFeatures',  # the file path of words embeddings
    word_dimension=22,  # dimension of words embeddings
    dimension=64,  # the dimension of paths embeddings
    wordsSize=1000000,  # the size of words vocabulary
    subpaths_map=None,  # contains sub-paths
    subpaths_file=main_dir +
    'facebook/subpathsSaveFile',  # the file which contains sub-paths
    maxlen_subpaths=1000,  # the max length for sub-paths
    h_output_method='mean-pooling',  # how the LSTM output for one path is formed: "h" uses only the last hidden state, "mean-pooling" averages all hidden states, "max-pooling" takes their element-wise max
    maxlen=100,  # Sequences longer than this are ignored
    batch_size=1,  # use a batch for training. This is the size of this batch.
    is_shuffle_for_batch=False,  # if need shuffle for training
    discount_alpha=0.1,  # the discount parameter alpha; the longer the sub-path, the smaller its weight
    subpaths_pooling_method='max-pooling',  # how several sub-paths are combined into one: "mean-pooling" or "max-pooling"
    objective_function_method='hinge-loss',  # loss function; hinge loss is used here
    objective_function_param=0,  # the parameter in loss function, beta
    lrate=0.0001,  # learning rate
    max_epochs=10,  # the max epochs for training
    dispFreq=5,  # how often (in updates) to display progress
    saveFreq=5,  # how often (in updates) to save the parameters
    saveto=main_dir +
    'facebook/proxEmbed-modelParams.npz',  # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index.

    # the normalization of this model, l2-norm of all parameters
    decay_lstm_W=0.01,
    decay_lstm_U=0.01,
    decay_lstm_b=0.01,
    decay_w=0.01,
):
    """
    The training stage of ProxEmbed
    """
    model_options = locals().copy()

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            print 'There is no path for wordsEmbeddings, exit!!!'
            exit(0)

    if subpaths_map is None:
        if subpaths_file is not None:
            subpaths_map = dataProcessTools.loadAllSubPaths(
                subpaths_file, maxlen_subpaths)
        else:
            print 'There is no path for sub-paths, exit!!!'
            exit(0)

    trainingData, trainingPairs = dataProcessTools.getTrainingData(
        trainingDataFile)
    allBatches = dataProcessTools.get_minibatches_idx(len(trainingData),
                                                      batch_size,
                                                      is_shuffle_for_batch)

    params = init_sharedVariables(model_options)
    tparams = init_tparams(params)
    print 'Generate models ......'

    trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost = proxEmbedModelMulti.proxEmbedModel(
        model_options, tparams)

    print 'Generate gradients ......'
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    print 'Using Adadelta to generate functions ......'
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = adadelta(lr, tparams, grads, trainingParis,
                                       subPaths_matrix, subPaths_mask,
                                       subPaths_lens, wemb, cost)

    print 'Start training models ......'
    best_p = None
    history_cost = []

    models_count = [0, 0, 0, 0]

    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(start_time))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairs[i] for i in batch]
            triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = dataProcessTools.prepareDataForTraining(
                trainingDataForBatch, trainingPairsForBatch, subpaths_map)
            cost = 0
            cost = f_grad_shared(triples_matrix_data, subPaths_matrix_data,
                                 subPaths_mask_data, subPaths_lens_data,
                                 wordsEmbeddings)
            f_update(lrate)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('bad cost detected: ', cost)
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ',  Update =', uidx, ',  Cost =', cost
                print 'models_count ==', models_count
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print('Saving...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print('Done')
    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(end_time))
    print 'Training finished! Elapsed time == ', end_time - start_time, ' s'
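Each example compiles `f_grad_shared` (one forward/backward pass that returns the cost and accumulates gradients) and `f_update` (which applies the Adadelta step). Below is a sketch of an `adadelta` helper whose signature matches the call site in this example and which follows the well-known Theano LSTM tutorial recipe; the symbolic inputs are simply the tensors returned by `proxEmbedModelMulti.proxEmbedModel`, and the project's own version may differ in detail.

import numpy
import theano
import theano.tensor as tensor

def numpy_floatX(data):
    return numpy.asarray(data, dtype=theano.config.floatX)

def adadelta(lr, tparams, grads, trainingParis, subPaths_matrix,
             subPaths_mask, subPaths_lens, wemb, cost):
    # one running accumulator of each kind per parameter
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    # forward/backward pass: returns the cost and stores the current gradients
    f_grad_shared = theano.function(
        [trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb],
        cost, updates=zgup + rg2up, name='adadelta_f_grad_shared')

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    # applies the accumulated update; Adadelta itself ignores the learning rate input
    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', name='adadelta_f_update')
    return f_grad_shared, f_update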
Example #4
def metagraphAttentionTraining(
    trainingDataFile=main_dir +
    'facebook.splits/train.10/train_classmate_1',  # the full path of training data file
    metagraphEmbeddings_path='',  # the file path of metagraph embeddings
    wordsEmbeddings_data=None,  # words embeddings
    wordsEmbeddings_path=main_dir +
    'facebook/nodesFeatures',  # the file path of words embeddings
    wordsSize=1000000,  # the size of words vocabulary
    subpaths_map=None,  # contains sub-paths
    subpaths_file=main_dir +
    'facebook/subpathsSaveFile',  # the file which contains sub-paths
    maxlen_subpaths=1000,  # the max length for sub-paths
    maxlen=100,  # Sequences longer than this are ignored
    batch_size=10,  # use a batch for training. This is the size of this batch.
    is_shuffle_for_batch=True,  # if need shuffle for training
    objective_function_method='sigmoid',  # loss function, we use sigmoid here
    objective_function_param=0,  # the parameter in loss function, beta
    lrate=0.0001,  # learning rate
    max_epochs=100,  # the max epochs for training
    dispFreq=5,  # how often (in updates) to display progress
    saveFreq=5,  # how often (in updates) to save the parameters
    saveto=main_dir +
    'facebook/path2vec-modelParams.npz',  # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index.

    # all dimensions parameters
    metagraph_embedding_dimension=10,  # metagraph embedding dimension 
    dimension_A=10,  # the dimension of attention when computing the m-node embedding
    dimension_lstm=10,  # dimension of lstm parameters
    dimension_B=10,  # the dimension of attention when computing the m-path embedding
    dimension_C=10,  # the dimension of attention when computing the m-paths embedding

    # decay parameters
    decay_Q_A=0.001,
    decay_b_A=0.001,
    decay_eta_A=0.001,
    decay_lstm_W=0.001,
    decay_lstm_U=0.001,
    decay_lstm_b=0.001,
    decay_Q_B=0.001,
    decay_b_B=0.001,
    decay_eta_B=0.001,
    decay_Q_C=0.001,
    decay_b_C=0.001,
    decay_eta_C=0.001,
    decay_w=0.001,
):
    # get all parameters
    model_options = locals().copy()

    if wordsEmbeddings_data is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings_data, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(
                wordsEmbeddings_path)
        else:
            print 'There is no path for wordsEmbeddings, exit!!!'
            exit(0)

    if subpaths_map is None:
        if subpaths_file is not None:
            subpaths_map = dataProcessTools.loadAllSubPathsRomove0Path(
                subpaths_file, maxlen_subpaths, wordsEmbeddings_data)
        else:
            print 'There is no path for sub-paths, exit!!!'
            exit(0)

    metagraphEmbedding_data, metagraphDimension, metagraphSize = dataProcessTools.getMetagraphEmbeddings(
        metagraphEmbeddings_path)

    trainingData, trainingPairs_data = dataProcessTools.getTrainingData(
        trainingDataFile)
    allBatches = dataProcessTools.get_minibatches_idx(len(trainingData),
                                                      batch_size,
                                                      is_shuffle_for_batch)

    # init shared variables
    params = init_sharedVariables(model_options)
    tparams = init_tparams(params)
    print 'Generate models ......'

    metagraphEmbeddings, trainingParis, subPaths_matrix, subPaths_mask, wordsEmbeddings, cost = subgraphAttentionModelLSTMBatch.metagraphAttentionModel(
        model_options, tparams)

    print 'Generate gradients ......'
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    print 'Using Adadelta to generate functions ......'
    this_time = time.time()
    print 'Start to compile and optimize, time ==', time.strftime(
        '%Y-%m-%d %H:%M:%S', time.localtime(this_time))
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = adadelta(lr, tparams, grads, metagraphEmbeddings,
                                       trainingParis, subPaths_matrix,
                                       subPaths_mask, wordsEmbeddings, cost)

    print 'Start training models ......'
    best_p = None
    history_cost = []  # not used

    start_time = time.time()
    print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                         time.localtime(start_time))
    uidx = 0
    for eidx in range(max_epochs):
        for _, batch in allBatches:
            uidx += 1
            # prepare data for this model
            trainingDataForBatch = [trainingData[i] for i in batch]
            trainingPairsForBatch = [trainingPairs_data[i] for i in batch]
            triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = dataProcessTools.prepareDataForTraining(
                trainingDataForBatch, trainingPairsForBatch, subpaths_map)
            cost = 0
            cost = f_grad_shared(metagraphEmbedding_data, triples_matrix_data,
                                 subPaths_matrix_data, subPaths_mask_data,
                                 wordsEmbeddings_data)
            f_update(lrate)

            trainingDataForBatch = None
            trainingPairsForBatch = None
            del triples_matrix_data
            del subPaths_matrix_data
            del subPaths_mask_data
            del subPaths_lens_data

            if numpy.isnan(cost) or numpy.isinf(cost):
                print('bad cost detected: ', cost)
                return
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch =', eidx, ',  Update =', uidx, ',  Cost =', cost
                this_time = time.time()
                print 'Time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(this_time))
            if saveto and numpy.mod(uidx, saveFreq) == 0:
                print('Saving...')
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)

                numpy.savez(saveto, history_errs=history_cost, **params)
                pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                print('Done')
        gc.collect()

    end_time = time.time()
    print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(end_time))
    print 'Training finished! Elapsed time == ', end_time - start_time, ' s'
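Every training loop on this page persists the model with `numpy.savez(saveto, history_errs=history_cost, **params)` plus a pickled copy of `model_options`. A small sketch of how those files could be read back; the helper name is illustrative and not part of the project.

import pickle
import numpy

def load_saved_model(saveto):
    # parameter arrays written by numpy.savez, plus the saved cost history
    archive = numpy.load(saveto)
    params = dict((k, archive[k]) for k in archive.files if k != 'history_errs')
    history_errs = archive['history_errs']
    # the options dict pickled next to the .npz file
    with open('%s.pkl' % saveto, 'rb') as f:
        model_options = pickle.load(f)
    return params, history_errs, model_options

The returned `params` dict has the same keys as `unzip(tparams)`, so it can be pushed back into the shared variables (e.g. `tparams[k].set_value(params[k])` for each key) to resume training or to evaluate a saved model.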
Example #5
def proxEmbedTraining(
        trainingDataFile=main_dir + 'facebook.splits/train.10/train_classmate_1',  # the full path of training data file
        wordsEmbeddings=None,  # words embeddings
        wordsEmbeddings_path=main_dir + 'facebook/nodesFeatures',  # the file path of words embeddings
        word_dimension=22,  # dimension of words embeddings
        dimension=64,  # the dimension of paths embeddings
        wordsSize=1000000,  # the size of words vocabulary
        subpaths_map=None,  # contains sub-paths
        subpaths_file=main_dir + 'facebook/subpathsSaveFile',  # the file which contains sub-paths
        maxlen_subpaths=1000,  # the max length for sub-paths
        h_output_method='mean-pooling',
        # how the LSTM output for one path is formed: "h" uses only the last hidden state, "mean-pooling" averages all hidden states, "max-pooling" takes their element-wise max
        maxlen=100,  # Sequences longer than this are ignored
        batch_size=1,  # use a batch for training. This is the size of this batch.
        is_shuffle_for_batch=False,  # if need shuffle for training
        discount_alpha=0.1,  # the discount parameter alpha; the longer the sub-path, the smaller its weight
        subpaths_pooling_method='max-pooling',
        # how several sub-paths are combined into one: "mean-pooling" or "max-pooling"
        objective_function_method='hinge-loss',  # loss function; hinge loss is used here
        objective_function_param=0,  # the parameter in loss function, beta
        lrate=0.0001,  # learning rate
        max_epochs=10,  # the max epochs for training

        dispFreq=5,  # how often (in updates) to display progress
        saveFreq=5,  # how often (in updates) to save the parameters
        saveto=main_dir + 'facebook/proxEmbed-modelParams.npz',
        # the path for saving parameters. It is generated by main_dir, dataset_name, suffix, class_name and index.

        # the normalization of this model, l2-norm of all parameters
        decay_lstm_W=0.01,
        decay_lstm_U=0.01,
        decay_lstm_b=0.01,
        decay_w=0.01,

        num_group=0,
        dataset_name="",
        class_name="",
        main_dir="",

):
    """
    The training stage of ProxEmbed
    """
    model_options = locals().copy()
    model_options.pop('wordsEmbeddings')
    print(model_options)

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is not None:
            wordsEmbeddings, dimension, wordsSize = dataProcessTools.getWordsEmbeddings(wordsEmbeddings_path)
            # print("wordsEmbeddings:", wordsEmbeddings.shape, dimension, wordsSize)
        else:
            print 'There is not path for wordsEmbeddings, exit!!!'
            exit(0)

    if subpaths_map is None:
        if subpaths_file is not None:
            subpaths_map = dataProcessTools.loadAllSubPaths(subpaths_file, maxlen_subpaths)
            # print("subpaths_map:", len(subpaths_map))
            # print(subpaths_map)
        else:
            print 'There is not path for sub-paths, exit!!!'
            exit(0)

    cost_time = []

    for num_of_group in range(1, num_group + 1):
        suffix = str(num_of_group)
        index = str(num_of_group)
        trainingDataFile = os.path.join(main_dir + '/', dataset_name + '.splits', "train." + suffix, 'train_' + class_name + '_' + '1')
        saveto = os.path.join(main_dir + '/', dataset_name + '.trainModels', 'train.' + suffix,
                              'train_' + class_name + '_' + index + '.npz')
        trainingData, trainingPairs = dataProcessTools.getTrainingData(trainingDataFile)
        allBatches = dataProcessTools.get_minibatches_idx(len(trainingData), batch_size, is_shuffle_for_batch)

        params = init_sharedVariables(model_options)
        tparams = init_tparams(params)
        print 'Generate models ......'

        trainingParis, subPaths_matrix, subPaths_mask, subPaths_lens, wemb, cost = proxEmbedModelMulti.proxEmbedModel(
            model_options, tparams)
        print("trainingParis:", type(trainingParis), trainingParis.shape)

        print 'Generate gradients ......'
        grads = tensor.grad(cost, wrt=list(tparams.values()))
        print 'Using Adadelta to generate functions ......'
        lr = tensor.scalar(name='lr')
        f_grad_shared, f_update = adadelta(lr, tparams, grads, trainingParis, subPaths_matrix, subPaths_mask,
                                           subPaths_lens,
                                           wemb, cost)

        print 'Start training models ......'
        best_p = None
        history_cost = []

        models_count = [0, 0, 0, 0]

        start_time = time.time()
        print 'start time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))
        uidx = 0
        for eidx in range(max_epochs):
            for _, batch in allBatches:
                uidx += 1
                trainingDataForBatch = [trainingData[i] for i in batch]
                trainingPairsForBatch = [trainingPairs[i] for i in batch]
                triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data = dataProcessTools.prepareDataForTraining(
                    trainingDataForBatch, trainingPairsForBatch, subpaths_map)
                cost = 0
                cost = f_grad_shared(triples_matrix_data, subPaths_matrix_data, subPaths_mask_data, subPaths_lens_data,
                                     wordsEmbeddings)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print('bad cost detected: ', cost)
                    return
                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch =', eidx, ',  Update =', uidx, ',  Cost =', cost
                    print 'models_count ==', models_count
                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print('Saving...')
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_cost, **params)
                    pickle.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print('Done')
        end_time = time.time()
        print 'end time ==', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))
        print 'Training finished! Elapsed time == ', end_time - start_time, ' s'
        cost_time.append(end_time - start_time)
    return cost_time
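A hypothetical invocation of this multi-group variant; the directory layout, dataset name and class name below are placeholders that mirror the defaults (main_dir/facebook.splits/train.K/train_classmate_1), not paths from the original project.

if __name__ == '__main__':
    seconds_per_group = proxEmbedTraining(
        wordsEmbeddings_path='data/facebook/nodesFeatures',
        subpaths_file='data/facebook/subpathsSaveFile',
        num_group=10,             # trains one model per split: train.1 ... train.10
        dataset_name='facebook',
        class_name='classmate',
        main_dir='data',
        max_epochs=10,
        batch_size=1,
    )
    print 'training time per group (s) ==', seconds_per_group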