예제 #1
0
def main(options):
    # Load configuration file
    config = ConfigParser.RawConfigParser()
    config.read('config.ini')
    sentences_path = config.get("Configuration", "DatasetSentencesPath")
    split_path = config.get("Configuration", "DatasetSplitPath")

    # Section 1
    # Loading datasets
    corpus_split = data.CorpusSplit(split_path)
    corpus_sentences = data.CorpusSentences(sentences_path, corpus_split)

    model_object = None
    parameters = "size of the word embedding (d) = 50\n Num iterations = 10000\n Maximum context window size = 5\n mini-batch size = 50\n noise distribution alpha = 0.01\n number of negative samples per context/input pair K=10\n alpha=0.01\n X=1000"

    if "Main" in options:
        # Section 2 - using this trained model in Deliverable 4-6
        model_hyper = model.ModelHyperParameters(C=5,
                                                 d=50,
                                                 K=10,
                                                 alpha=0.01,
                                                 seed=123.0)
        model_object = model.SkipGramModel(model_hyper)
        model_object.init(corpus_sentences.train)

        # Section 3 - using this trained model in Deliverable 4
        sgd_hyper = algorithm.SGDHyperParameters(alpha=0.01,
                                                 mini_batch_size=50,
                                                 X=1000)
        sgd = algorithm.LearnParamsUsingSGD(training_set=corpus_sentences.train,
                                            hyper_parameters=sgd_hyper,
                                            model=model_object,
                                            iterations_number=10000,
                                            test_set=corpus_sentences.test)

    if "D1" in options:
        print "\nDeliverable 1"
        deliverables.deliverable1(corpus_sentences.train, corpus_sentences.test)

    if "D2" in options:
        print "\nDeliverable 2"
        deliverables.deliverable2(corpus_sentences.train, corpus_sentences.test)

    if "D3" in options:
        print "\nDeliverable 3"
        deliverables.deliverable3(corpus_sentences.train, corpus_sentences.test)

    if "D4" in options:
        print "\nDeliverable 4"
        deliverables.deliverable4(model_object, parameters)

    if "D5" in options:
        print "\nDeliverable 5"
        deliverables.deliverable5(model_object)

    if "D6" in options:
        print "\nDeliverable 6"
        deliverables.deliverable6(model_object)
예제 #2
0
def deliverable1(train, test):
    """
    Plot the log-likelihood train and test as a function of training iteration.
    Train and test plots should appear in the same figure, clearly marked.
    You may use whatever configuration of hyperparameters you wish for this plot, but make sure you specify the choice.
    Hyperparameters can sometimes have a great effect on generalization performance.
    We will consider the effect of the embedding size and other hyperparams.
    """
    # Printed on the plots; keep it in sync with the values passed below.
    # NOTE(review): it claims "Num iterations = 20000" while the run below
    # uses 500 (see the TODO) — update one of them before submitting.
    parameters = "size of the word embedding d=50\nLearning Rate = 0.1\n Num iterations = 20000\n Maximum context window size = 5\n mini-batch size = 50\n noise distribution alpha = 0.01\n number of negative samples per context/input pair K=10\n X=100"

    model_hyper = model.ModelHyperParameters(
        C=5,
        d=50,
        K=10,
        # TODO: noise distribution Uniform (α = 0) (or some other small value of α e.g. 0.01)
        alpha=0.01,
        seed=1.0)
    model_object = model.SkipGramModel(model_hyper)
    model_object.init(train)
    sgd_hyper = algorithm.SGDHyperParameters(
        alpha=0.1,  # TODO: Learning Rate  0.01 − 1.0
        mini_batch_size=50,
        X=100)
    # Training runs as a side effect of constructing the SGD object; the
    # learning curves are written to the module-level CSV paths read below.
    # (Removed an unused `timer = time.clock()` — unlike deliverables 2/3,
    # this deliverable never reports a training time.)
    algorithm.LearnParamsUsingSGD(
        training_set=train,
        hyper_parameters=sgd_hyper,
        model=model_object,
        iterations_number=500,  # TODO: set 20000 before submit
        test_set=test)

    # Train and test curves in one figure, then in separate subplots.
    plot_train_test(__sgd_train_learning_curve_path__,
                    __sgd_test_learning_curve_path__,
                    parameters=parameters,
                    output_filename="output/deliverable1_united.png")
    plot_train_test_separate(
        __sgd_train_learning_curve_path__,
        __sgd_test_learning_curve_path__,
        parameters=parameters,
        output_filename="output/deliverable1_separated.png")
예제 #3
0
# Build the skip-gram (center, context, negatives) dataset and its loader.
dataset = data.Word2vecDataset(my_data, args.window_size, args.neg_num)
dataloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=args.batch_size,
                                         collate_fn=dataset.collate)

# Optional held-out set. BUGFIX: compare against None with identity
# (`is not None`), not `!=` (PEP 8).
if args.valid is not None:
    valid_dataset = data.ValidDataset(my_data, args.valid, args.window_size,
                                      args.neg_num)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=args.batch_size,
        collate_fn=valid_dataset.collate)

# Select the model architecture from the command line.
vocab_size = len(my_data.word2id)
if args.model == 'sgns':
    skip_gram_model = model.SkipGramModel(vocab_size, args.emsize)
elif args.model == 'cfsgns':
    skip_gram_model = model.ContextFreeSGModel(vocab_size, args.emsize)
else:
    print("No such model:", args.model)
    exit(1)

# Restrict visible GPUs before the first CUDA query so the mask takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    skip_gram_model.cuda()

# One epoch = number of full mini-batches in the dataset.
epoch_size = dataset.data_len // args.batch_size
optimizer = torch.optim.Adam(skip_gram_model.parameters())
예제 #4
0
# Optional held-out set. BUGFIX: compare against None with identity
# (`is not None`), not `!=` (PEP 8).
if args.valid is not None:
    valid_dataset = data.ValidDataset(my_data, args.valid, args.window_size,
                                      args.neg_num)
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=args.batch_size,
        collate_fn=valid_dataset.collate)

vocab_size = len(my_data.word2id)
# Auxiliary vocabulary currently mirrors the main one; the commented line
# shows the alternative of a separate aux vocabulary.
aux_vocab_size = vocab_size
# aux_vocab_size = len(my_data.aux_word2id)

# Select the model architecture; both variants use a gradient-reversal
# layer (revgrad=True) with strength args.grl_lambda.
if args.model == 'sgns':
    skip_gram_model = model.SkipGramModel(vocab_size,
                                          aux_vocab_size,
                                          args.emsize,
                                          args.aux_emsize,
                                          args.grl_lambda,
                                          revgrad=True)
elif args.model == 'lsgns':
    skip_gram_model = model.LogitSGNSModel(vocab_size,
                                           aux_vocab_size,
                                           args.emsize,
                                           args.aux_emsize,
                                           args.epsilon,
                                           args.grl_lambda,
                                           revgrad=True)
else:
    print("No such model:", args.model)
    exit(1)

# Restrict visible GPUs before any CUDA initialization.
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
예제 #5
0
def deliverable3(train, test):
    """
    Repeat the setup above, but this time fixing d (to your choice) and varying one of {learning rate , mini-batch size, noise distribution}
    (again your choice).
    As before, generate two plots one for training time and one for train and test log-likelihood.
    Clearly specify all your choices.
    """
    parameters = "size of the word embedding (d) = 50\n Num iterations = 20000\n Maximum context window size = 5\n mini-batch size = 50\n noise distribution = Unigram (alpha = 1)\n number of negative samples per context/input pair (denoted K above)=10\n X=1000"
    min_learning_rate = 0.00001
    max_learning_rate = 0.1
    learning_rate_mul = 10
    learning_rate_value = min_learning_rate
    print "dimensions Analysis:"
    with open(__learning_rate_analysis_time__,
              'w') as learning_rate_analysis_time, open(
                  __learning_rate_analysis_train__,
                  'w') as learning_rate_analysis_train, open(
                      __learning_rate_analysis_test__,
                      'w') as learning_rate_analysis_test:
        while learning_rate_value <= max_learning_rate:
            model_hyper = model.ModelHyperParameters(C=5,
                                                     d=50,
                                                     K=10,
                                                     alpha=1.0,
                                                     seed=1.0)
            model_object = model.SkipGramModel(model_hyper)
            model_object.init(train)
            sgd_hyper = algorithm.SGDHyperParameters(alpha=learning_rate_value,
                                                     mini_batch_size=50,
                                                     X=1000)
            timer = time.clock()
            sgd = algorithm.LearnParamsUsingSGD(training_set=train,
                                                hyper_parameters=sgd_hyper,
                                                model=model_object,
                                                iterations_number=20000,
                                                test_set=test)

            # output training time to file
            training_time = time.clock() - timer
            print "Dimension: {0},Training Time: {1}".format(
                learning_rate_value, training_time)
            learning_rate_analysis_time.write("{0},{1}\n".format(
                learning_rate_value, training_time))

            # output train and test MEAN log-likelihood
            sgd_train_learning_curve_df = pd.read_csv(
                __sgd_train_learning_curve_path__,
                header=None,
                names=[
                    "time", "Iteration", "log_Likelihood", "avg_log_Likelihood"
                ])
            sgd_test_learning_curve_df = pd.read_csv(
                __sgd_test_learning_curve_path__,
                header=None,
                names=[
                    "time", "Iteration", "log_Likelihood", "avg_log_Likelihood"
                ])
            train_mean_log_likelihood = sgd_train_learning_curve_df[
                "avg_log_Likelihood"].mean()
            test_mean_log_likelihood = sgd_test_learning_curve_df[
                "avg_log_Likelihood"].mean()
            print "Dimension: {0},train_mean_log_likelihood: {1}".format(
                learning_rate_value, train_mean_log_likelihood)
            print "Dimension: {0},test_mean_log_likelihood: {1}".format(
                learning_rate_value, test_mean_log_likelihood)
            learning_rate_analysis_train.write("{0},{1}\n".format(
                learning_rate_value, train_mean_log_likelihood))
            learning_rate_analysis_test.write("{0},{1}\n".format(
                learning_rate_value, test_mean_log_likelihood))

            learning_rate_value *= learning_rate_mul

    # 1. plot training time as func of learning_rate
    learning_rate_analysis_time_df = pd.read_csv(
        __learning_rate_analysis_time__,
        header=None,
        skiprows=0,
        names=["learning_rate", "time"])
    _title = unicode(
        'SGD training time as a function of learning_rate\nParameters: '
    ) + parameters.decode('utf-8')
    ax = learning_rate_analysis_time_df.plot(x='learning_rate',
                                             y='time',
                                             title=_title)
    ax.set_xlabel("learning_rate")
    ax.set_ylabel("training time")
    plt.savefig("output\deliverable3_training_time.png", bbox_inches='tight')

    # 2. train and test (mean) log-likelihood as a function of d
    sgd_train_learning_curve_df = pd.read_csv(
        __learning_rate_analysis_train__,
        header=None,
        names=["learning_rate", "mean_log_likelihood"])
    sgd_test_learning_curve_df = pd.read_csv(
        __learning_rate_analysis_test__,
        header=None,
        names=["learning_rate", "mean_log_likelihood"])
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.plot(sgd_train_learning_curve_df['learning_rate'],
            sgd_train_learning_curve_df['mean_log_likelihood'],
            'r',
            label="train mean log(L)")
    ax.plot(sgd_test_learning_curve_df['learning_rate'],
            sgd_test_learning_curve_df['mean_log_likelihood'],
            'b',
            label="test mean log(L)")
    ax.legend()
    ax.set_xlabel("learning_rate")
    ax.set_ylabel("mean_log_likelihood")
    _title = unicode(
        'SGD Train-Test mean log-Likelihood as a function of learning_rate\nParameters: '
    ) + parameters.decode('utf-8')
    ax.set_title(_title)
    fig.savefig("output\deliverable3_mean_log_likelihood.png",
                bbox_inches='tight')
    plt.close(fig)
예제 #6
0
def deliverable2(train, test):
    """
    Set the hyper params as follows:
        1. Learning Rate = 0.3
        2. Num iterations = 20000
        3. Maximum context window size = 5
        4. mini-batch size = 50
        5. noise distribution = Unigram (alpha = 1)
        6. number of negative samples per context/input pair (denoted K above)=10
    Vary the size of the word embedding ,d from 10 to 300 in 5 evenly spaced intervals.
    In two separate plots, plot both training time and train and test (mean) log-likelihood as a function of d.
    All hyper-parameter configurations of the algorithm should be clearly specified.
    """
    parameters = "Learning Rate = 0.01\n Num iterations = 20000\n Maximum context window size = 5\n mini-batch size = 50\n noise distribution = Unigram (alpha = 1)\n number of negative samples per context/input pair K=10\n X=1000"
    min_d = 10  # min_d = 10
    max_d = 300  # max_d = 300
    d_adder = 70  # d_adder = 70
    d_value = min_d
    print "dimensions Analysis:"
    with open(__dimensions_analysis_time__,
              'w') as dimensions_analysis_time, open(
                  __dimensions_analysis_train__,
                  'w') as dimensions_analysis_train, open(
                      __dimensions_analysis_test__,
                      'w') as dimensions_analysis_test:
        while d_value <= max_d:
            model_hyper = model.ModelHyperParameters(C=5,
                                                     d=d_value,
                                                     K=10,
                                                     alpha=1.0,
                                                     seed=1.0)
            model_object = model.SkipGramModel(model_hyper)
            model_object.init(train)
            sgd_hyper = algorithm.SGDHyperParameters(alpha=0.01,
                                                     mini_batch_size=50,
                                                     X=1000)
            timer = time.clock()
            sgd = algorithm.LearnParamsUsingSGD(training_set=train,
                                                hyper_parameters=sgd_hyper,
                                                model=model_object,
                                                iterations_number=20000,
                                                test_set=test)

            # output training time to file
            training_time = time.clock() - timer
            print "Dimension: {0},Training Time: {1}".format(
                d_value, training_time)
            dimensions_analysis_time.write("{0},{1}\n".format(
                d_value, training_time))

            # output train and test MEAN log-likelihood
            sgd_train_learning_curve_df = pd.read_csv(
                __sgd_train_learning_curve_path__,
                header=None,
                names=[
                    "time", "Iteration", "log_Likelihood", "avg_log_Likelihood"
                ])
            sgd_test_learning_curve_df = pd.read_csv(
                __sgd_test_learning_curve_path__,
                header=None,
                names=[
                    "time", "Iteration", "log_Likelihood", "avg_log_Likelihood"
                ])
            train_mean_log_likelihood = sgd_train_learning_curve_df[
                "avg_log_Likelihood"].mean()
            test_mean_log_likelihood = sgd_test_learning_curve_df[
                "avg_log_Likelihood"].mean()
            print "Dimension: {0},train_mean_log_likelihood: {1}".format(
                d_value, train_mean_log_likelihood)
            print "Dimension: {0},test_mean_log_likelihood: {1}".format(
                d_value, test_mean_log_likelihood)
            dimensions_analysis_train.write("{0},{1}\n".format(
                d_value, train_mean_log_likelihood))
            dimensions_analysis_test.write("{0},{1}\n".format(
                d_value, test_mean_log_likelihood))

            d_value += d_adder

    # 1. plot training time as func of d
    dimensions_analysis_time_df = pd.read_csv(__dimensions_analysis_time__,
                                              header=None,
                                              skiprows=0,
                                              names=["d", "time"])
    _title = unicode('SGD training time as a function of d\nParameters: '
                     ) + parameters.decode('utf-8')
    ax = dimensions_analysis_time_df.plot(x='d', y='time', title=_title)
    ax.set_xlabel("d - size of the word embedding")
    ax.set_ylabel("training time")
    plt.savefig("output\deliverable2_training_time.png", bbox_inches='tight')

    # 2. train and test (mean) log-likelihood as a function of d
    sgd_train_learning_curve_df = pd.read_csv(
        __dimensions_analysis_train__,
        header=None,
        names=["d", "mean_log_likelihood"])
    sgd_test_learning_curve_df = pd.read_csv(
        __dimensions_analysis_test__,
        header=None,
        names=["d", "mean_log_likelihood"])
    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])
    ax.plot(sgd_train_learning_curve_df['d'],
            sgd_train_learning_curve_df['mean_log_likelihood'],
            'r',
            label="train mean log(L)")
    ax.plot(sgd_test_learning_curve_df['d'],
            sgd_test_learning_curve_df['mean_log_likelihood'],
            'b',
            label="test mean log(L)")
    ax.legend()
    ax.set_xlabel("d - size of the word embedding")
    ax.set_ylabel("mean_log_likelihood")
    _title = unicode(
        'SGD Train-Test mean log-Likelihood as a function of d\nParameters: '
    ) + parameters.decode('utf-8')
    ax.set_title(_title)
    fig.savefig("output\deliverable2_mean_log_likelihood.png",
                bbox_inches='tight')
    plt.close(fig)