def test():
    glv, glv_w2i, glv_vocab = load_embedding("data/glove.txt")
    words_sorted = compute_word_bias(glv, glv_w2i, glv_vocab)
    for n in [100, 500, 1000]:
        my_cluster(glv, glv_w2i, 1, glv_vocab, words_sorted, n)

    hard_debias()  # Create the hard_debias word vector embedding (comment out once it has been generated)
    hd_glv, hd_glv_w2i, hd_glv_vocab = load_embedding("hard_debias.txt")

    for n in [100, 500, 1000]:
        my_cluster(hd_glv, hd_glv_w2i, 1, hd_glv_vocab, words_sorted, n)

    embedding_filepath = './data/glove.txt'
    male_filepath = './data/male_words.txt'
    female_filepath = './data/female_words.txt'
    pairs_filepath = './data/definitional_pairs.json'

    dbl_glv, dbl_w2i, dbl_vocab = double_hard_debias(embedding_filepath,
                                                     male_filepath,
                                                     female_filepath,
                                                     pairs_filepath)

    for n in [100, 500, 1000]:
        my_cluster(dbl_glv, dbl_w2i, 1, dbl_vocab, words_sorted, n)


#test()
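For reference, a minimal sketch of a GloVe-style text loader matching the (vectors, word-to-index, vocab) return shape assumed above; the project's own load_embedding may differ in details such as normalization or vocabulary filtering:

import numpy as np

def load_embedding_sketch(path):
    # Assumes a whitespace-separated text format: word v1 v2 ... vn per line.
    vocab, vectors = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vocab.append(parts[0])
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    word2index = {w: i for i, w in enumerate(vocab)}
    return np.vstack(vectors), word2index, vocab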
Example #2
 def __init__(self, vocab_size, embedding_dim, hidden_dim, tag2idx,
              batch_size, use_gpu, idx2word, emb_path):
     super(AttentionModel, self).__init__()
     self.vocab_size = vocab_size
     self.embedding_dim = embedding_dim
     self.hidden_dim = hidden_dim
     self.tag2idx = tag2idx
     self.target_size = len(tag2idx)
     self.lstm = nn.LSTM(embedding_dim,
                         hidden_dim // 2,
                         num_layers=1,
                         bidirectional=True,
                         batch_first=True)
     self.hidden2tags = nn.Linear(hidden_dim, self.target_size)
     self.batch_size = batch_size
     self.use_gpu = use_gpu
     self.idx2word = idx2word
     self.emb_path = emb_path
     # pretrained embeddings
     emb_vectors = load_embedding(self.emb_path, self.idx2word)
     self.embeds = nn.Embedding.from_pretrained(
         torch.from_numpy(emb_vectors).float(), freeze=True)
     self.dropout = torch.nn.Dropout(0.5)
     self.query = nn.Parameter(torch.randn(self.hidden_dim),
                               requires_grad=True)
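The forward pass is not part of this snippet; as a minimal sketch (an assumption, not necessarily this project's implementation), the learnable query is typically used to attention-pool the BiLSTM outputs:

import torch

def attention_pool(lstm_out, query):
    # lstm_out: (batch, seq_len, hidden_dim); query: (hidden_dim,)
    scores = torch.matmul(lstm_out, query)                 # (batch, seq_len)
    weights = torch.softmax(scores, dim=1).unsqueeze(-1)   # (batch, seq_len, 1)
    return (weights * lstm_out).sum(dim=1)                 # (batch, hidden_dim)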
Example #3
def main():
    device = torch.device('cuda' if torch.cuda.is_available() and args.use_gpu else 'cpu')

    try:
        _, dataset, embed, model_type, model_name = args.model_path.split('/')
        model = DelhateEnsemble.load_model(args.model_path)
    except FileNotFoundError:
        raise

    embedding, dim = utils.load_embedding(model.embed_corpus)

    test_data = utils.load_dataset(args.dataset, 'test', embedding, labeled=True, pad=model.seq_length)

    model.to(device)
    y_pred, y_true = model.evaluate(test_data, device=device)

    print('pred:', Counter(y_pred))
    print('true:', Counter(y_true))

    report = classification_report(y_true, y_pred, target_names=['H', 'O', 'N'], digits=3)
    conf_mat = confusion_matrix(y_true, y_pred)

    model_name = model_name.replace('.pt', '')
    out_path = f'metrics/{dataset.upper()}/{embed}/{model_type}'
    os.makedirs(out_path, exist_ok=True)

    with open(f'{out_path}/{model_name}_{args.dataset}.txt', 'w') as f:
        f.write(report)
        f.write('\n')
        f.write('\n'.join('  '.join(str(x) for x in y) for y in conf_mat))
        f.write('\n')
Example #4
def main():
    args = docopt(__doc__)
    enable_all_pools = args['--enable-all-pools']

    hidden = int(args['--hidden'])
    dropout = float(args['--dropout'])
    device = torch.device(int(args['--device']))
    print(f"{device} will be used")
    ratio = 0.8

    valid_dset = QueryDataset(split='valid',
                              ratio=ratio,
                              equally_handle_foreign_authors=False)
    valid_loader = DataLoader(valid_dset,
                              batch_size=1,
                              num_workers=1,
                              shuffle=False)

    embedding_mode, embedding = load_embedding(args['--embedding'], False,
                                               device)
    classifier = Classifier(embedding,
                            hidden,
                            dropout,
                            args['--deepset'],
                            equally_handle_foreign_authors=False,
                            enable_all_pools=enable_all_pools)
    classifier.load_state_dict(torch.load(args['--classifier']))
    classifier.eval()

    if torch.cuda.is_available():
        classifier.to(device)

    thresholds = [0.05 * i for i in range(1, 20)]
    for thres in thresholds:
        test_classifier(valid_loader, classifier, device, thres)
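test_classifier is not shown in this snippet; a hypothetical sketch of a per-threshold evaluation over the validation loader (the batch layout and metric choice are assumptions, not this project's code):

import torch

def test_classifier_sketch(loader, classifier, device, threshold):
    # Count confusion-matrix entries at a fixed decision threshold.
    tp = fp = fn = tn = 0
    with torch.no_grad():
        for collab, label in loader:
            pred = bool(classifier(collab.to(device)) >= threshold)
            truth = bool(label)
            tp += pred and truth
            fp += pred and not truth
            fn += (not pred) and truth
            tn += (not pred) and not truth
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    print(f"threshold={threshold:.2f}  precision={precision:.3f}  recall={recall:.3f}")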
Example #5
    def test_load_embedding(self):
        print('=======================================')
        print('\n\nload_embedding:')
        lines = self.loadcorpus()
        train_features, train_labels, f_map, _, c_map = utils.generate_corpus_char(
            lines,
            if_shrink_c_feature=True,
            c_thresholds=5,
            if_shrink_w_feature=False)
        f_set = {v for v in f_map}
        # map: build a new iterable by applying a function to each element
        # reduce: fold the accumulated value together with each new element
        dt_f_set = functools.reduce(lambda x, y: x | y,
                                    map(lambda t: set(t), train_features),
                                    f_set)
        f_map = utils.shrink_features(f_map, train_features, 5)

        f_map, embedding_tensor, in_doc_words = utils.load_embedding(
            '/datastore/liu121/nosqldb2/acl_hscrf/skipgram',
            ' ',
            f_map,
            dt_f_set,
            'unk',
            200,
            shrink_to_corpus=True,
            embsave_filePath=
            '/datastore/liu121/nosqldb2/acl_hscrf/pkl/analysis_table.pkl')
Example #6
    def __init__(self, args):
        self.batch_size = args.batch_size
        self.hidden_size = args.hidden_size
        self.emb_size = args.emb_size
        self.emb_trainable = args.emb_trainable
        self.load_glove = args.load_glove
        self.num_max_epochs = args.num_max_epochs
        self.learning_rate = args.learning_rate

        # Load data

        data_loader = None
        if args.dataset == 'moviereview': data_loader = MRDataLoader
        elif args.dataset == 'senti140': data_loader = S140DataLoader
        else:
            print('wrong data')
            sys.exit(1)

        loader = data_loader(data_path='../data/%s/' % (args.dataset),
                             pad_size=20,
                             max_vocab=100000)
        loader.read_data()
        self.num_class = loader.num_class
        self.vocab = loader.vocab

        self.vocab_rev = {w: i for i, w in enumerate(loader.vocab)}
        self.vocab_size = len(loader.vocab)

        # Data iterators
        self.train_iter = PaddingDatawithTarget(loader.train)
        self.test_iter = PaddingDatawithTarget(loader.test)

        # Load glove
        if self.load_glove:
            #             self.emb = load_glove(
            # emb_path = '../data/glove.6B/',
            # emb_filename= 'glove.6B.300d.txt', # 'test.txt', #
            # vocab = self.vocab,
            # emb_size = self.emb_size)
            #         self.emb_size = self.emb.shape[1]
            # NOTE: changed to load a binary file from a different directory
            self.emb = load_embedding(
                emb_path='/data/word2vec/',
                emb_filename='glove.42B.300d.w2v.bin',  # 'test.txt', #
                vocab=self.vocab,
                emb_size=self.emb_size)
            self.emb_size = self.emb.shape[1]

        print(' '.join([self.vocab[w] for w in loader.train['X'][0]]))
        print(loader.train['length'][0], loader.train['Y'][0])
        print(' '.join([self.vocab[w] for w in loader.train['X'][1]]))
        print(loader.train['length'][1], loader.train['Y'][1])
        print(loader.train['Y'][:10])
        print(loader.test['Y'][:10])
        #import pdb; pdb.set_trace()

        self.sess = None
Example #7
    def __init__(self, args):

        self.out_path = args.out_path
        self.data_path = args.data_path
        self.target = args.target

        self.emb = load_embedding(emb_path='/data/word2vec/',
                                  emb_filename='glove.42B.300d.w2v.bin')

        self.emb_size = len(self.emb['the'])
Example #8
    def __init__(self, args):
        super().__init__()
        self.word_embeddings = nn.Embedding(args.vocab_size,
                                            args.embedding_size,
                                            padding_idx=0)
        self.dropout = nn.Dropout(args.embedding_dropout_prob)

        embedding = load_embedding(args)
        self.word_embeddings.weight.requires_grad = not args.fix_embedding  # False
        self.word_embeddings.weight.data.copy_(torch.from_numpy(embedding))
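For comparison, the same effect as the copy_/requires_grad pattern above can be obtained with the nn.Embedding.from_pretrained shortcut used in the other examples; a minimal sketch, assuming embedding is a (vocab_size, embedding_size) numpy array:

import numpy as np
import torch
import torch.nn as nn

def frozen_embedding_sketch(embedding: np.ndarray, fix_embedding: bool) -> nn.Embedding:
    # freeze=True keeps the pretrained weights fixed during training.
    return nn.Embedding.from_pretrained(torch.from_numpy(embedding).float(),
                                        freeze=fix_embedding,
                                        padding_idx=0)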
Example #9
File: main.py  Project: zhyq/acnn
def init():
    path = config.data_path
    config.embedding_file = os.path.join(path, config.embedding_file)
    config.embedding_vocab = os.path.join(path, config.embedding_vocab)
    config.train_file = os.path.join(path, config.train_file)
    config.test_file = os.path.join(path, config.test_file)

    # Config log
    if config.log_file is None:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')
    else:
        if not os.path.exists(config.save_path):
            os.makedirs(config.save_path)
        logging.basicConfig(filename=config.log_file,
                            filemode='a',
                            level=logging.DEBUG,
                            format='%(asctime)s %(message)s',
                            datefmt='%m-%d %H:%M')
    # Load data
    # data = (sentences, relations, e1_pos, e2_pos)
    train_data = utils.load_data(config.train_file)
    test_data = utils.load_data(config.test_file)

    logging.info('train data: %d' % len(train_data[0]))
    logging.info('test data: %d' % len(test_data[0]))

    # Build vocab
    word_dict = utils.build_dict(train_data[0] + test_data[0])
    logging.info('total words: %d' % len(word_dict))

    embeddings = utils.load_embedding(config, word_dict)

    # Log parameters
    flags = config.__dict__['__flags']
    flag_str = "\n"
    for k in flags:
        flag_str += "\t%s:\t%s\n" % (k, flags[k])
    logging.info(flag_str)

    # vectorize data
    # vec = (sents_vec, relations, e1_vec, e2_vec, dist1, dist2)
    max_len_train = len(max(train_data[0], key=lambda x: len(x)))
    max_len_test = len(max(test_data[0], key=lambda x: len(x)))
    max_len = max(max_len_train, max_len_test)
    config.max_len = max_len

    train_vec = utils.vectorize(train_data, word_dict, max_len)
    test_vec = utils.vectorize(test_data, word_dict, max_len)

    return embeddings, train_vec, test_vec
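utils.load_embedding itself is not shown; a plausible sketch, assuming config.embedding_vocab lists one word per line and config.embedding_file holds the matching vectors, with small random vectors for words absent from the pretrained file:

import numpy as np

def load_embedding_sketch(config, word_dict):
    with open(config.embedding_vocab, encoding='utf-8') as f:
        pretrained_words = [line.strip() for line in f]
    pretrained_vecs = np.loadtxt(config.embedding_file, dtype=np.float32)
    lookup = dict(zip(pretrained_words, pretrained_vecs))
    dim = pretrained_vecs.shape[1]
    # +1 leaves room for a padding index; adjust to the project's actual indexing.
    embeddings = np.random.uniform(-0.01, 0.01,
                                   (len(word_dict) + 1, dim)).astype(np.float32)
    for word, idx in word_dict.items():
        if word in lookup:
            embeddings[idx] = lookup[word]
    return embeddings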
Example #10
 def __init__(self, vocab_size, embedding_dim, hidden_dim, tag2idx, batch_size, use_gpu, idx2word, emb_path):
     super(bilstm_crf, self).__init__()
     self.vocab_size = vocab_size
     self.embedding_dim = embedding_dim
     self.hidden_dim = hidden_dim
     self.tag2idx = tag2idx
     self.target_size = len(tag2idx)
     self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
     self.hidden2tags = nn.Linear(hidden_dim, self.target_size)
     self.batch_size = batch_size
     self.use_gpu = use_gpu
     self.idx2word = idx2word
     self.emb_path = emb_path
     # pretrained embeddings
     emb_vectors = load_embedding(self.emb_path, self.idx2word)
     self.embeds = nn.Embedding.from_pretrained(torch.from_numpy(emb_vectors).float(),
                                                   freeze=True)  # V x D
Example #11
def main():
    device = torch.device(
        'cuda' if torch.cuda.is_available() and args.use_gpu else 'cpu')

    rnn_str = args.rnn_type if args.rnn_type else 'cnn'
    weak_str = '_weak' if args.weak_loss else ''

    out_path = f'models/{args.dataset}/{args.embed_corpus}/delhate_{rnn_str}{weak_str}'
    os.makedirs(out_path, exist_ok=True)

    embedding, dim = utils.load_embedding(args.embed_corpus)

    labeled = not args.weak_loss

    train_data = utils.load_dataset(args.dataset, 'train', embedding, labeled,
                                    args.pad)

    model = DelhateEnsemble(n_models=args.n_models,
                            seq_length=train_data.padded_seq,
                            embed_corpus=args.embed_corpus,
                            embed_dim=dim,
                            n_classes=train_data.n_classes,
                            n_filters=args.n_filters,
                            filter_width=args.filter_width,
                            pool_size=args.pool_size,
                            n_hidden=args.n_hidden,
                            rnn_type=args.rnn_type,
                            dropout=args.dropout)

    if args.weak_loss:
        loss_fn = lambda x, y: utils.weak_loss(x, y, weight=args.class_weight)
    else:
        loss_fn = F.cross_entropy

    model.train_models(train_data,
                       loss_fn=loss_fn,
                       lr=args.learn_rate,
                       n_samples=args.n_samples,
                       use_val=args.use_val,
                       early_stop=args.early_stop,
                       batch_size=args.batch_size,
                       EPOCHS=args.epochs,
                       device=device)

    model.save_model(f'{out_path}/{args.model_name}.pt')
Example #12
def main(argv=None):
    if FLAGS.non_linearity == 'tanh':
        non_linearity = tf.nn.tanh
    elif FLAGS.non_linearity == 'sigmoid':
        non_linearity = tf.nn.sigmoid
    else:
        non_linearity = tf.nn.relu

    train_url = os.path.join(FLAGS.data_dir, 'train.feat')
    test_url = os.path.join(FLAGS.data_dir, 'test.feat')
    vocab_url = os.path.join(FLAGS.data_dir, 'vocab.new')
    model_url = os.path.join(FLAGS.model_dir, '')

    train(
        train_url=train_url,
        test_url=test_url,
        vocab_url=vocab_url,
        model_url=model_url,
        non_linearity=non_linearity,
        embedding_url=FLAGS.embedding_file,
        training_epochs=FLAGS.training_epochs,
        alternate_epochs=FLAGS.alternate_epochs,
        vocab_size=FLAGS.vocab_size,
        embedding_size=FLAGS.embedding_size,
        n_hidden=FLAGS.n_hidden,
        n_topic=FLAGS.n_topic,
        n_sample=FLAGS.n_sample,
        learning_rate=FLAGS.learning_rate,
        batch_size=FLAGS.batch_size,
        is_training=True,
        mix_num=FLAGS.mix_num,
    )

    # ------------------ print top words ----------------------------
    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, model_url)
        # find the names of all variable
        for v in tf.trainable_variables():
            print(v.name, v.shape)

        embedding_table = utils.load_embedding(
            embedding_url, embedding_size, vocab,
            FLAGS.data_dir + '/vocab_embedding-{}.pkl'.format(embedding_size))
        TopicWords(sess, vocab_url, embedding_table)
Example #13
def main():
    args = docopt(__doc__)
    device = torch.device(int(args['--device']))
    print(f"{device} will be used")
    threshold = float(args['--threshold'])
    answer_path = args['--answer-path']
    query_path = args['--query-path']
    hidden = int(args['--hidden'])
    dropout = float(args['--dropout'])
    enable_all_pools = args['--enable-all-pools']

    if os.path.exists(answer_path):
        warnings.warn(
            'Answer file already exists. Please delete it before running the code; otherwise, lines will be appended.'
        )

    testset = QueryTestset(query_path)
    testloader = DataLoader(testset,
                            batch_size=1,
                            num_workers=1,
                            shuffle=False)

    embedding_mode, embedding = load_embedding(args['--embedding'], None,
                                               device)
    classifier = Classifier(embedding,
                            hidden,
                            dropout,
                            args['--deepset'],
                            False,
                            enable_all_pools=enable_all_pools).to(device)
    classifier.load_state_dict(torch.load(args['--classifier']))
    classifier.eval()

    with torch.no_grad(), open(answer_path, 'a') as f:
        for collab in testloader:
            score = classifier(collab.to(device))
            answer = bool(score >= threshold)
            f.write(str(answer) + '\n')
Example #14
def hard_debias(path_to_embedding="Double-Hard Debias/embeddings/glove.txt",
                path_to_def_pairs="Hard Debias/Data/definitional_pairs.json"):
    word_vectors, word_indices, vocab = load_embedding(path_to_embedding)
    word_vectors = np.asarray(word_vectors)

    with open(path_to_def_pairs) as f:
        set_of_pairs = json.load(f)

    mu_list = calculate_mu(set_of_pairs, word_vectors, word_indices)
    gender_subspace = calculate_gender_direction(set_of_pairs,
                                                 mu_list,
                                                 word_vectors,
                                                 word_indices,
                                                 num_components=1)
    gender_direction = gender_subspace[0]

    ### Subtracting Gender Bias from each Word Vector
    for i in range(len(word_vectors)):
        word_vectors[i] = word_vectors[i] - np.dot(
            word_vectors[i], gender_direction) * gender_direction

    word_vectors = normalize(word_vectors)
    recreate_embedding(word_vectors, vocab, "hard_debias")
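The per-row loop above subtracts each vector's projection onto the gender direction; with a unit-norm direction the same step can be written as a single vectorized operation (a sketch, not the project's code):

import numpy as np

def remove_direction(word_vectors, direction):
    # Subtract the projection of every row onto a unit-norm direction at once.
    return word_vectors - np.outer(word_vectors @ direction, direction)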
Example #15
def main(args):
    df_gold = pd.read_csv(args.goldstandard, index_col=0)

    for emb_path in args.embedding:
        print("=" * 78)
        print("Processing embedding file:", emb_path)
        print("-" * 78)
        df_embedding = load_embedding(emb_path, as_dataframe=True)


        # align embedding and gold standard
        df = df_gold.join(df_embedding, how='inner')
        # df = pd.merge(df_gold, df_embedding, left_index=True, right_index=True, how='inner')

        le = preprocessing.LabelEncoder()
        y = le.fit_transform(df['top'].values)

        # First column is label column
        X = df[df.columns[1:]].values

        print("N examples", X.shape[0])
        print("N targets", len(le.classes_))

        if args.normalize:
            print("Normalizing...")
            X = preprocessing.normalize(X, norm='l2')

        # Linear SVM with default parameters
        clf = svm.SVC(kernel=args.kernel)

        print("Running {}-cross-validated SVM with {} kernel...".format(args.cv, args.kernel))
        scores = cross_val_score(clf, X, y, cv=args.cv)

        print("Accuracy scores", scores)

        print("Accuracy mean/std:", scores.mean(), scores.std())
        print("=" * 78)
Example #16
def clip_embedding_matrix(embedding_file, input_files, output_dir,
                          embedding_name):
    vocab_file = os.path.join(output_dir, 'vocab.txt')
    clipped_file = os.path.join(output_dir, embedding_name)

    # load all files and build the vocabulary
    all_texts = load_all_texts(input_files)
    tokenizer = Tokenizer(num_words=None, lower=False)
    tokenizer.fit_on_texts(all_texts)
    logger.info("the size of vocabulary is {}".format(
        len(tokenizer.word_counts)))

    # load word vector and build embedding matrix
    embeddings_index = load_embedding(embedding_file)
    embedding_matrix = build_matrix(embeddings_index, tokenizer.word_index)
    logger.info("the shape of embedding matrix is {}".format(
        embedding_matrix.shape))

    # save embedding matrix and vocabulary
    np.save(clipped_file, embedding_matrix)  # save embedding matrix
    # save vocabulary
    words = [word + '\n' for word in list(tokenizer.word_index.keys())]
    with open(vocab_file, 'w', encoding='utf-8') as f:
        f.writelines(words)
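build_matrix is referenced but not defined here; a common sketch of such a helper, assuming embeddings_index maps word -> vector and word_index maps word -> integer id (rows for out-of-vocabulary words are left at zero):

import numpy as np

def build_matrix_sketch(embeddings_index, word_index):
    dim = len(next(iter(embeddings_index.values())))
    matrix = np.zeros((len(word_index) + 1, dim), dtype=np.float32)  # +1: Keras ids start at 1
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            matrix[i] = vector
    return matrix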
Example #17
    test_summary_dir = os.path.join(out_dir, "summaries", timestamp, "test")
    test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)
    test_summary_writer.add_graph(sess.graph)

    checkpoint_dir = os.path.abspath(
        os.path.join(out_dir, "checkpoints", timestamp))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

    merged_summary = tf.summary.merge_all()
    log('summary', logfile=logpath, is_verbose=is_verbose)
    """Loading pretrained embedding"""
    if use_pretrained_model:
        load_embedding(sess, word_to_index, word_embeddings, embeddingpath,
                       embedding_size, vocab_size)

    # Get a batch with the dataloader and transform it into tokens
    batches = dataloader_train.get_batches(batch_size, num_epochs=num_epochs)
    batches_eval = dataloader_eval.get_batches(batch_size,
                                               num_epochs=num_epochs)
    for num_batch, batch in enumerate(batches):
        log("starting batch",
            num_batch,
            logfile=logpath,
            is_verbose=is_verbose)
        batch = word_to_index_transform(word_to_index, batch)
        # Defining input and target sequences
        batch_input, batch_target = batch[:, :-1], batch[:, 1:]
        # Run the session
        _, logits, out_loss, computed_perplexity = sess.run(
Example #18
def main(args):
    model_class = get_model_class(args.model)
    model_class.add_config(argparser)
    args = argparser.parse_args()
    say(args)

    args.run_id = random.randint(0, 10**9)
    args.run_path = "{}/{}".format(args.run_dir, args.run_id)
    #if not os.path.exists(args.run_dir):
    #    os.makedirs(args.run_dir)
    #assert os.path.isdir(args.run_dir)
    #assert not os.path.exists(args.run_path)
    #os.makedirs(args.run_path)
    say("\nRun ID: {}\nRun Path: {}\n\n".format(args.run_id, args.run_path))

    train_corpus_path = os.path.dirname(args.train) + "/corpus.tsv.gz"
    train_corpus = Corpus(
        [tuple([train_corpus_path,
                os.path.dirname(args.train)])])
    valid_corpus_path = os.path.dirname(args.eval) + "/corpus.tsv.gz"
    valid_corpus = Corpus(
        [tuple([valid_corpus_path,
                os.path.dirname(args.eval)])])
    say("Corpus loaded.\n")

    embs = load_embedding(args.embedding) if args.embedding else None

    embedding_layer = EmbeddingLayer(args.n_d, ['<s>', '</s>'], embs)

    model = model_class(embedding_layer, args)

    if args.cuda:
        model.cuda()
    say("\n{}\n\n".format(model))

    print(model.state_dict().keys())

    needs_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(filter(needs_grad, model.parameters()), lr=args.lr)

    if args.load_model:
        print "Loading pretrained model"
        model.load_state_dict(torch.load(args.load_model))

    else:
        print "Training will begin from scratch"

    best_dev = 0
    iter_cnt = 0

    current_dev = evaluate(iter_cnt, args.eval + "/dev", model, valid_corpus,
                           args)
    evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args, False)

    for epoch in range(args.max_epoch):
        iter_cnt = train(iter_cnt, model, train_corpus, args, optimizer)
        current_dev = evaluate(iter_cnt, args.eval + "/dev", model,
                               valid_corpus, args)
        if current_dev > best_dev:
            best_dev = current_dev
            evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args,
                     False)
        say("\n")

    if args.save_model:
        torch.save(model.state_dict(), args.save_model)
Example #19
    test_summary_dir = os.path.join(out_dir, "summaries", timestamp, "test")
    test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)
    test_summary_writer.add_graph(sess.graph)

    checkpoint_dir = os.path.abspath(
        os.path.join(out_dir, "checkpoints", timestamp))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

    merged_summary = tf.summary.merge_all()
    log('summary', logfile=logpath, is_verbose=is_verbose)
    """Loading pretrained embedding"""
    if use_pretrained_model:
        load_embedding(sess, word_to_index, word_embeddings,
                       './wordembeddings.word2vec', embedding_size, vocab_size)

    # Get a batch with the dataloader and transform it into tokens
    sess.run(tf.global_variables_initializer())
    batches = dataloader_train.get_batches(batch_size, num_epochs=num_epochs)
    batches_eval = dataloader_eval.get_batches(batch_size,
                                               num_epochs=num_epochs)
    for num_batch, batch in enumerate(batches):
        print(num_batch)

        log("starting batch",
            num_batch,
            logfile=logpath,
            is_verbose=is_verbose)
        batch = word_to_index_transform(word_to_index, batch)
        # Defining input and target sequences
Example #20
def run_experiment(experiment_type, data_folder, save_model_folder,
                   save_results_folder):
    """
    Runs experiments and saves results

    Parameters
    ----------
    experiment_type
    data_folder
    save_model_folder
    save_results_folder
    """
    def set_experiment_variables(hidden_state_size=512,
                                 down_project_size=None,
                                 load_embeddings=False):
        tf.flags.DEFINE_integer("hidden_state_size", hidden_state_size,
                                "hidden state size (default 512)")
        tf.flags.DEFINE_integer(
            "down_project_size", down_project_size,
            "Down projection size. Should be used with a hidden_state_size of 1024 (default None)"
        )
        tf.flags.DEFINE_boolean(
            "load_embeddings", load_embeddings,
            "Whether to use pretrained embeddings or not (default False)")

    if experiment_type == 'A':
        set_experiment_variables(512, None, False)
    elif experiment_type == 'B':
        set_experiment_variables(512, None, True)
    elif experiment_type == 'C':
        set_experiment_variables(1024, 512, True)

    print("\nExperiment Arguments:")
    for key in FLAGS.flag_values_dict():
        if key == 'f':
            continue
        print("{:<22}: {}".format(key.upper(), FLAGS[key].value))
    print(" ")

    data_processing = DataProcessing(FLAGS.sentence_length,
                                     FLAGS.max_vocabulary_size)
    train_corpus = data_processing.preprocess_dataset(data_folder,
                                                      'sentences.train')
    validation_corpus = data_processing.preprocess_dataset(
        data_folder, 'sentences.eval')
    test_corpus = data_processing.preprocess_dataset(data_folder,
                                                     'sentences_test.txt')
    continuation_corpus = data_processing.preprocess_dataset(
        data_folder, 'sentences.continuation', pad_to_sentence_length=False)

    print(f'Number of train sentences is \t\t{len(train_corpus)}')
    print(f'Number of validation sentences is \t{len(validation_corpus)}')
    print(f'Number of test sentences is \t\t{len(test_corpus)}')
    print(f'Number of continuation sentences is \t{len(continuation_corpus)}')
    print(" ")

    best_perplexity = None
    best_model = None

    with tf.Graph().as_default():
        with tf.Session() as session:
            # Create a variable to contain a counter for the global training step.
            global_step = tf.Variable(1, name='global_step', trainable=False)

            lstm = LSTMCell(FLAGS.embedding_size,
                            FLAGS.hidden_state_size,
                            FLAGS.sentence_length,
                            FLAGS.max_vocabulary_size,
                            down_project_size=FLAGS.down_project_size,
                            pad_symbol=data_processing.vocab['<pad>'])

            if FLAGS.load_embeddings:
                load_embedding(session, data_processing.vocab,
                               lstm.input_embeddings,
                               data_folder + '/wordembeddings-dim100.word2vec',
                               FLAGS.embedding_size,
                               len(data_processing.vocab))

            ####
            # Set optimizer and clip all gradients to values [-5, 5]
            ####
            with tf.name_scope('train'):
                optimizer = tf.train.AdamOptimizer()
                gvs = optimizer.compute_gradients(lstm.loss)
                capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var)
                              for grad, var in gvs]
                train_step = optimizer.apply_gradients(capped_gvs,
                                                       global_step=global_step)

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            session.run(tf.global_variables_initializer())
            summaries_merged = tf.summary.merge(lstm.summaries)

            ####
            # Create checkpoint directory
            ####
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(save_model_folder, "runs", timestamp))
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            ####
            # Start training for the specified epochs
            ####
            print('Start training...')
            for epoch in range(FLAGS.num_epochs):
                for sentences_batch in get_batches(
                        train_corpus, batch_size=FLAGS.batch_size):
                    # run a single step
                    train_batch(sentences_batch, lstm, train_step, global_step,
                                session, summaries_merged)

                current_step = tf.train.global_step(session, global_step)
                if current_step % FLAGS.checkpoint_every == 0:
                    perplexities = dev_step(
                        get_batches(validation_corpus,
                                    batch_size=FLAGS.batch_size,
                                    do_shuffle=False), lstm, global_step,
                        session)

                    average_perplexity = np.mean(perplexities)
                    model_name = "model_experiment-{}_epoch-{}_val-perplexity-{}".format(
                        experiment_type, epoch + 1, average_perplexity)
                    path = saver.save(session,
                                      os.path.join(checkpoint_dir, model_name))

                    print("Saved model checkpoint to {}".format(path))

                    if best_perplexity is None or best_perplexity > average_perplexity:
                        best_perplexity = average_perplexity
                        best_model = model_name

                print('Done with epoch', epoch + 1)

            if best_model is None:
                raise Exception(
                    "Model has not been saved. Run for at least one epoch")

            print('Restoring best model', best_model)
            saver.restore(session, os.path.join(checkpoint_dir, best_model))

            # evaluate on test set
            perplexities = dev_step(get_batches(test_corpus,
                                                batch_size=FLAGS.batch_size,
                                                do_shuffle=False),
                                    lstm,
                                    global_step,
                                    session,
                                    verbose=0)

            print('Perplexity on test_set is', np.mean(perplexities))

            filename = "group25.perplexity{}".format(experiment_type)
            savefile = os.path.join(save_results_folder, filename)
            print('Saving results to', savefile)

            with open(savefile, 'w') as f:
                f.writelines(str(i) + '\n' for i in perplexities)

            if experiment_type == 'C':
                continuation_sentences = continue_sentences(
                    continuation_corpus, session, lstm, data_processing)

                filename = "group25.continuation"
                savefile = os.path.join(save_results_folder, filename)
                print('Saving results to', savefile)

                with open(savefile, 'w') as f:
                    f.writelines(str(i) + '\n' for i in continuation_sentences)

    print('Done')
Example #21
                                          FLAGS.validation_file,
                                          FLAGS.test_file)

# Build Dictionary
print("Build Dictionary...")
word2id, id2word, user2id, id2user, poi2id, id2poi, post2id, id2post = utils.build_dic(
    train, validation, test)

# Convert Data to Index
print("Converting Data...")
train, validation, test, maximum_document_length = utils.converting(
    train, validation, test, word2id, user2id, poi2id, post2id)

# Load pretrained embedding
print("Load pretrained word embedding...")
_word_embedding = utils.load_embedding(FLAGS.embedding_file, word2id,
                                       FLAGS.wordembedding_dim)

# Load Visual Feature
print("Loading Visual Feature Matrix...")
with open(FLAGS.visual_features) as f:
    _visual_feature = np.load(f)["array"]

# Print dataset statistics
print("word dict size: " + str(len(word2id)))
print("user dict size: " + str(len(user2id)))
print("poi dict size: " + str(len(poi2id)))
print("Train/Validation/Test: {:d}/{:d}/{:d}".format(len(train),
                                                     len(validation),
                                                     len(test)))
print(
    "=================================================================================="
Example #22

def test(dataloader, model, device):
    print(f'{date()}## Start the testing!')
    start_time = time.perf_counter()
    test_loss = calculate_mse(model, dataloader, device)
    end_time = time.perf_counter()
    print(
        f"{date()}## Test end, test mse is {test_loss:.6f}, time used {end_time - start_time:.0f} seconds."
    )


if __name__ == '__main__':
    config = Config()
    print(f'{date()}## Load word2vec and data...')
    word_emb, word_dict = load_embedding(config.word2vec_file)

    # Train
    train_dataset = MPCNDataset(config.train_file, word_dict, config)
    valid_dataset = MPCNDataset(config.valid_file,
                                word_dict,
                                config,
                                retain_rui=False)
    train_dlr = DataLoader(train_dataset,
                           batch_size=config.batch_size,
                           shuffle=True)
    valid_dlr = DataLoader(valid_dataset, batch_size=config.batch_size)
    os.makedirs(os.path.dirname(config.saved_model),
                exist_ok=True)  # create the directory if it doesn't exist
    MPCN_model = MPCN(config, word_emb,
                      fusion_mode=config.fusion_mode).to(config.device)
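calculate_mse is not included in this snippet; a minimal sketch, assuming each batch from the DataLoader yields the model inputs followed by the target ratings as the last element:

import torch
import torch.nn.functional as F

def calculate_mse_sketch(model, dataloader, device):
    model.eval()
    total_se, count = 0.0, 0
    with torch.no_grad():
        for batch in dataloader:
            *inputs, ratings = [x.to(device) for x in batch]
            predictions = model(*inputs)
            total_se += F.mse_loss(predictions, ratings, reduction='sum').item()
            count += ratings.numel()
    return total_se / count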
Example #23
def main(args):
    model_class = get_model_class(args.model)
    model_class.add_config(argparser)
    args = argparser.parse_args()

    args.run_id = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    root_dir = os.path.dirname(os.path.realpath(__file__))
    # only use generated run_path if none provided by user
    if args.run_path is None:
        args.run_path = os.path.join(root_dir, args.run_dir, args.run_id)
    if not os.path.exists(args.run_path):
        os.makedirs(args.run_path)

    global outputManager
    outputManager = OutputManager(args.run_path)

    outputManager.say(args)

    #if not os.path.exists(args.run_dir):
    #    os.makedirs(args.run_dir)
    #assert os.path.isdir(args.run_dir)
    #assert not os.path.exists(args.run_path)
    #os.makedirs(args.run_path)
    outputManager.say("\nRun ID: {}\nRun Path: {}\n\n".format(
        args.run_id, args.run_path))

    train_corpus_path = os.path.dirname(args.train) + "/corpus.tsv.gz"
    train_corpus = Corpus(
        [tuple([train_corpus_path,
                os.path.dirname(args.train)])])
    valid_corpus_path = os.path.dirname(args.eval) + "/corpus.tsv.gz"
    valid_corpus = Corpus(
        [tuple([valid_corpus_path,
                os.path.dirname(args.eval)])])
    outputManager.say("Corpus loaded.\n")

    embs = load_embedding(args.embedding) if args.embedding else None

    embedding_layer = EmbeddingLayer(args.n_d, ['<s>', '</s>'], embs)

    model = model_class(embedding_layer, args)

    if args.cuda:
        model.cuda()
    outputManager.say("\n{}\n\n".format(model))

    outputManager.say(model.state_dict().keys())

    needs_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(filter(needs_grad, model.parameters()), lr=args.lr)
    outputManager.say(optimizer.state_dict())

    if args.load_model:
        outputManager.say("Loading pretrained model")
        model.load_state_dict(torch.load(args.load_model))

    else:
        outputManager.say("Training will begin from scratch")

    best_dev = 0
    iter_cnt = 0

    current_dev = evaluate(iter_cnt, args.eval + "/dev", model, valid_corpus,
                           args)
    evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args, False)

    for epoch in range(args.max_epoch):
        iter_cnt = train(iter_cnt, model, train_corpus, args, optimizer)
        current_dev = evaluate(iter_cnt, args.eval + "/dev", model,
                               valid_corpus, args)
        if current_dev > best_dev:
            best_dev = current_dev
            evaluate(iter_cnt, args.eval + "/test", model, valid_corpus, args,
                     False)
        outputManager.say("\n")

    if args.save_model:
        torch.save(model.state_dict(), args.save_model)
        torch.save(model, args.save_model + '-complete')
Example #24
def main():
    train_path = os.path.join(data_path, cfg.train_file)
    test_path = os.path.join(data_path, cfg.test_file)
    data_train = pre.load_data(train_path)
    data_test = pre.load_data(test_path)

    word_dict, length_voc = pre.build_voc(data_train[0] + data_test[0])

    emd_vec_path = os.path.join(data_path, cfg.embedding_file)
    emd_word_path = os.path.join(data_path, cfg.embedding_vocab)

    embeddings, vec_dim = pre.load_embedding(emd_vec_path, emd_word_path,
                                             word_dict)

    max_length = max(len(max(data_train[0], key=lambda x: len(x))),
                     len(max(data_test[0], key=lambda x: len(x))))

    cfg.length_voc = length_voc
    cfg.max_length = max_length
    cfg.sentence_vec_dim = vec_dim
    cfg.embedding = embeddings

    train_vec = pre.dataset2id(data_train, word_dict, max_length)
    test_vec = pre.dataset2id(data_test, word_dict, max_length)

    train_batch_manager = pre.Manager_batch(train_vec, cfg.batch_size)
    test_batch_manager = pre.Manager_batch(test_vec, cfg.batch_size)

    with tf.Graph().as_default():
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=False):
                train_model = Model(cfg, is_Training=True)
        with tf.name_scope("Test"):
            with tf.variable_scope("Model", reuse=True):
                valid_model = Model(cfg, is_Training=False)
            with tf.variable_scope("Model", reuse=True):
                test_model = Model(cfg, is_Training=False)
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True

        save = tf.train.Supervisor(logdir=cfg.save_path,
                                   global_step=train_model.global_steps)

        verbose = False
        with save.managed_session(config=tf_config) as sess:
            logging.info("training.....")
            best_score = 0
            best_f1 = 0
            if cfg.train:
                for epoch in range(cfg.num_epoches):
                    train_iter = train_batch_manager.iter_batch(shuffle=True)
                    test_iter = test_batch_manager.iter_batch(shuffle=False)

                    run_epoch(sess,
                              train_model,
                              train_iter,
                              is_training=True,
                              verbose=verbose)
                    test_acc, f1 = run_epoch(sess,
                                             valid_model,
                                             test_iter,
                                             is_training=False,
                                             verbose=verbose)

                    if test_acc > best_score:
                        best_score = test_acc
                        best_f1 = f1
                        if cfg.save_path:
                            save.saver.save(sess,
                                            cfg.save_path,
                                            global_step=save.global_step)
                    #logging.info('')
                    logging.info(
                        "\033[1;31;40mEpoch: %d   Test: accuracy %.2f%% " %
                        (epoch + 1, test_acc * 100))
                    print("f1:", f1)
                    logging.info("\033[0m")
                logging.info("\033[1;31;40mThe best accuracy score is %.2f%%" %
                             (best_score * 100))
                print("best f1: ", best_f1)
            if cfg.test:
                ckpt = tf.train.get_checkpoint_state(cfg.save_path)
                save.saver.restore(sess, ckpt.model_checkpoint_path)
                test_iter = test_batch_manager.iter_batch(shuffle=False)
                test_acc = evaluation(sess, test_model, test_iter)

                print('accuracy: %.2f%%' % (test_acc * 100))
Example #25
    def __init__(self, prefix="./dataset/EHR", mode="sds"):
        test_filepath = os.path.join(prefix, "test/data_moresymp.txt")

        assert mode in ["sus", "sds", "mix", "pmi", "gpmi"]
        self.mode = mode

        # maps path
        if mode in ["sds", "mix"]:
            self.dise2symp = np.load(os.path.join(prefix, "dise2symp.npy"),
                                     allow_pickle=True).item()
            self.symp2dise = np.load(os.path.join(prefix, "symp2dise.npy"),
                                     allow_pickle=True).item()

        if mode in ["sus", "mix"]:
            self.user2symp = np.load(os.path.join(prefix, "user2symp.npy"),
                                     allow_pickle=True).item()
            self.symp2user = np.load(os.path.join(prefix, "symp2user.npy"),
                                     allow_pickle=True).item()

        # load embeddings
        self.symp_embs, self.dise_embs = load_embedding("ckpt/GNN.pt")
        self.num_symp = self.symp_embs.shape[0]

        if mode in ["pmi", "gpmi"]:
            # init a PMI matrix that has shape M X M
            # we'd better make it a sparse matrix
            # if we pick the graphPMI (gpmi) method, we need an additional S-D PMI matrix.

            # read data
            self.pmi_ss_path = os.path.join(prefix, "pmi_ss_mat.npz")
            self.pmi_sd_path = os.path.join(prefix, "pmi_sd_mat.npz")
            self.symp_count_path = os.path.join(prefix, "sympcount.npy")
            self.dise_count_path = os.path.join(prefix, "disecount.npy")
            self.dise2symp_path = os.path.join(prefix, "dise2symp.npy")
            if os.path.exists(self.pmi_ss_path):
                print("Load PMI Mat from", self.pmi_ss_path)
                self.symp2symp = sparse.load_npz(self.pmi_ss_path)
                # self.symp2dise = sparse.load_npz(self.pmi_sd_path)
                self.symp2dise = np.load(os.path.join(prefix, "symp2dise.npy"),
                                         allow_pickle=True).item()

                self.sympcount = np.load(self.symp_count_path,
                                         allow_pickle=True).item()
                self.disecount = np.load(self.dise_count_path,
                                         allow_pickle=True).item()
                self.symp2symp.setdiag(0)
            else:
                print("Build PMI Mat.")
                self.build_pmi_matrix(prefix)
                self.symp2symp.setdiag(0)

            # build symp count array
            c_ar, i_ar = [], []
            for k, v in self.sympcount.items():
                c_ar.append(v)
                i_ar.append(int(k))
            sympcount_mat = sparse.csr_matrix((c_ar, (i_ar, [0] * len(i_ar))))
            self.sympcount_ar = sympcount_mat.toarray().flatten()
            self.num_all_symp = self.sympcount_ar.sum()

            # build dise count array
            c_ar, i_ar = [], []
            for k, v in self.disecount.items():
                c_ar.append(v)
                i_ar.append(int(k))
            disecount_mat = sparse.csr_matrix((c_ar, (i_ar, [0] * len(i_ar))))
            self.disecount_ar = disecount_mat.toarray().flatten()
            self.num_all_dise = self.disecount_ar.sum()

            self.dise2symp = np.load(self.dise2symp_path,
                                     allow_pickle=True).item()
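For the PMI branch above, a sketch of how a symptom-symptom PMI matrix could be built from a sparse co-occurrence count matrix, using PMI(i, j) = log(N * c(i, j) / (c(i) * c(j))); this illustrates the idea only and is an assumption about build_pmi_matrix, not the project's code:

import numpy as np
from scipy import sparse

def build_pmi_sketch(cooc):
    # cooc: sparse (M, M) matrix of symptom co-occurrence counts.
    total = cooc.sum()
    marginals = np.asarray(cooc.sum(axis=1)).flatten()
    rows, cols = cooc.nonzero()
    counts = np.asarray(cooc[rows, cols]).flatten()
    pmi = np.log(counts * total / (marginals[rows] * marginals[cols]))
    pmi = np.maximum(pmi, 0.0)  # keep positive PMI only
    return sparse.csr_matrix((pmi, (rows, cols)), shape=cooc.shape)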
Example #26
parser.add_argument('--nflips', type=int, default=0, help='number of flips')
parser.add_argument('--temperature', type=float, default=.8, help='RNN temperature')
parser.add_argument('--lr', type=float, default=0.0001, help='learning rate, default=0.0001')
parser.add_argument('--warm-start', action='store_true')
args = parser.parse_args()
batch_size = args.batch_size

# set sample sizes
nb_train_samples = int(np.floor(args.nsamples / batch_size)) * batch_size  # num training samples
nb_val_samples = nb_train_samples  # num validation samples

# seed weight initialization
random.seed(seed)
np.random.seed(seed)

embedding, idx2word, word2idx, glove_idx2idx = load_embedding(nb_unknown_words)
vocab_size, embedding_size = embedding.shape
oov0 = vocab_size - nb_unknown_words
idx2word = process_vocab(idx2word, vocab_size, oov0, nb_unknown_words)
X_train, X_test, Y_train, Y_test = load_split_data(nb_val_samples, seed)

# print a sample recipe to make sure everything looks right
print('Random head, description:')
i = 811
prt('H', Y_train[i], idx2word)
prt('D', X_train[i], idx2word)

# save model initialization parameters
model_params = (dict(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
Example #27
def train():
    '''
    Train function
    '''
    args = get_args()

    # Load data
    dataset = SemEvalDataset(args.train_filename, max_len=args.seq_len)
    dataloader = DataLoader(dataset,
                            args.batch_size,
                            True,
                            num_workers=args.num_workers)
    dataset_val = SemEvalDataset(args.test_filename,
                                 max_len=args.seq_len,
                                 d=(dataset.d, dataset.rel_d))
    dataloader_val = DataLoader(dataset_val,
                                args.batch_size,
                                True,
                                num_workers=args.num_workers)

    args.word_embedding = load_embedding(args.embedding_filename,
                                         args.embedding_wordlist_filename,
                                         dataset.d)
    args.vac_len_word = len(dataset.d.word2id)
    args.vac_len_rel = len(dataset.rel_d.word2id)
    args.dw = args.word_embedding.shape[1]

    for arg in vars(args):
        print("{} = {}".format(arg, getattr(args, arg)))

    # Build models
    writer = SummaryWriter()
    model = CNN(args)
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    best_eval_acc = 0.
    best_eval_f1 = 0.

    for i in range(args.nepoch):
        # Training
        total_loss = 0.
        total_acc = 0.
        total_f1 = 0.

        ntrain_batch = 0
        model.train()
        for (seq, e1, e2, dist1, dist2, r) in dataloader:
            ntrain_batch += 1
            seq = Variable(seq)
            e1 = Variable(e1)
            e2 = Variable(e2)
            dist1 = Variable(dist1)
            dist2 = Variable(dist2)
            r = Variable(r)
            r = r.view(r.size(0))

            pred = model(seq, dist1, dist2, e1, e2)
            l = loss_func(pred, r)
            acc = accuracy(pred, r)
            f1 = F1(pred, r)
            total_acc += acc
            total_f1 += f1
            total_loss += l.item()

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

        writer.add_scalar('train/loss', l.item(), i)
        writer.add_scalar('train/accuracy', total_acc / ntrain_batch, i)
        writer.add_scalar('train/f1', total_f1 / ntrain_batch, i)

        print("Epoch: {}, Training loss : {:.4}, acc: {:.4}, f1: {:.4}".format(
            i, total_loss / ntrain_batch, total_acc / ntrain_batch,
            total_f1 / ntrain_batch))

        # Evaluation
        if i % args.eval_every == args.eval_every - 1:
            val_total_acc = 0.
            val_total_f1 = 0.

            nval_batch = 0
            model.eval()
            for (seq, e1, e2, dist1, dist2, r) in dataloader_val:
                nval_batch += 1
                seq = Variable(seq)
                e1 = Variable(e1)
                e2 = Variable(e2)
                dist1 = Variable(dist1)
                dist2 = Variable(dist2)
                r = Variable(r)
                r = r.view(r.size(0))

                pred = model(seq, dist1, dist2, e1, e2)
                acc = accuracy(pred, r)
                f1 = F1(pred, r)
                val_total_acc += acc
                val_total_f1 += f1
            best_eval_acc = max(best_eval_acc, val_total_acc / nval_batch)
            best_eval_f1 = max(best_eval_f1, val_total_f1 / nval_batch)

            # Write the stats to tensorboard
            writer.add_scalar('test/accuracy', val_total_acc / nval_batch, i)
            writer.add_scalar('test/F1', val_total_f1 / nval_batch, i)
            print("Epoch: {}, Val acc: {:.4f}, F1: {:.4f}".format(
                i, val_total_acc / nval_batch, val_total_f1 / nval_batch))
    print('Best acc: {}'.format(best_eval_acc))
    print('Best F1: {}'.format(best_eval_f1))
    torch.save(model.state_dict(), args.model_file)
    writer.close()
Example #28
    else:
        torch.set_default_tensor_type(torch.FloatTensor)
    torch.set_printoptions(precision=9)
    torch.set_num_threads(1)

    # Load command line options
    options = vars(args)
    writer.add_text('Text', 'Hyper-parameters: {}'.format(options), 0)

    # Load supervision pairs and convert to dict
    f_supervision = options["supervision_file"]
    train_hyper2hypo, train_hypo2hyper = load_directional_supervision(f_supervision)

    # Load embedding files and word <-> index map
    f_embed = options["embed_file"]
    embedding, index2word, word2index, vocab_size, embed_dim = load_embedding(f_embed)
    print("=== Finish loading embedding ===")
    options["embedding"] = embedding
    options["index2word"] = index2word
    options["word2index"] = word2index
    options["vocabSize"] = vocab_size
    options["embedSize"] = embed_dim

    # Construct training set and training data loader
    if options["use_pair_feature"]:
        print("!!! Using pair features")
        f_pair_feature_key = options["pair_feature_prefix"] + "edge.keys3.tsv"
        f_pair_feature_value = options["pair_feature_prefix"] + "edge.values3.scaled.npy"
        pair_features = load_pair_features(f_pair_feature_key, f_pair_feature_value)
        train_data = DirectionalTripletsWithPairFeature(options["embedding"], train_hyper2hypo, pair_features)
    else:
Example #29
def main(config, training_set, testing_set):
    training_set.set_preprocess_fn(scheduler_preprocess)
    training_set.set_special_tokens(['<pad>', '<unk>'])
    testing_set.set_preprocess_fn(scheduler_preprocess)
    testing_set.set_special_tokens(['<pad>', '<unk>'])

    scheduler_model = VanillaSeq2SeqEncoder(config.batch_size, config.vocab_size, config.embedding_size, config.hidden_size)
    _ = scheduler_model()
    scheduler_model.optimize(config.learning_rate)

    tf.summary.scalar("cost", scheduler_model.mse)

    nthreads_intra = config.nthreads // 2
    nthreads_inter = config.nthreads - config.nthreads // 2

    with tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=nthreads_inter,
                                          intra_op_parallelism_threads=nthreads_intra)) as sess:
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        writer = tf.summary.FileWriter('./logs/' + timestamp + '/train/', sess.graph)
        test_writer = tf.summary.FileWriter('./logs/' + timestamp + '/test/', sess.graph)
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        # Load word2vec pretrained embeddings
        load_embedding(sess, training_set.word_to_index, scheduler_model.word_embeddings, config.embedding_path,
                       config.embedding_size, config.vocab_size)

        for epoch in range(config.n_epochs):
            if not epoch % config.test_every:
                # Testing phase
                success = 0
                total = 0
                for k in range(0, len(testing_set), config.batch_size):
                    if k + config.batch_size < len(testing_set):
                        batch_endings1, batch_endings2, correct_ending = testing_set.get(k, config.batch_size,
                                                                                         random=True)
                        total += config.batch_size
                        shuffled_batch1, labels1 = scheduler_get_labels(batch_endings1)
                        shuffled_batch2, labels2 = scheduler_get_labels(batch_endings2)
                        probabilities1 = sess.run(
                            'scheduler/order_probability:0',
                            {'scheduler/x:0': shuffled_batch1,
                             'scheduler/optimize/label:0': labels1})
                        probabilities2 = sess.run(
                            'scheduler/order_probability:0',
                            {'scheduler/x:0': shuffled_batch2,
                             'scheduler/optimize/label:0': labels2})
                        for b in range(config.batch_size):
                            if probabilities1[b][np.where(labels1[b] == 1)[0][0]] > probabilities2[b][np.where(labels2[b] == 1)[0][0]]:
                                if correct_ending[b] == 0:
                                    success += 1
                            else:
                                if correct_ending[b] == 1:
                                    success += 1
                accuracy = float(success) / float(total)
                accuracy_summary = tf.Summary()
                accuracy_summary.value.add(tag='accuracy', simple_value=accuracy)
                test_writer.add_summary(accuracy_summary, epoch)
            for k in range(0, len(training_set), config.batch_size):
                if k + config.batch_size < len(training_set):
                    summary_op = tf.summary.merge_all()

                    batch = training_set.get(k, config.batch_size, random=True)
                    shuffled_batch, labels = scheduler_get_labels(batch)
                    probabilities, _, computed_mse, summary = sess.run(
                        ['scheduler/order_probability:0', 'scheduler/optimize/optimizer',
                         'scheduler/optimize/mse:0', summary_op],
                        {'scheduler/x:0': shuffled_batch,
                         'scheduler/optimize/label:0': labels})
                    writer.add_summary(summary, epoch * len(training_set) + k)
                    if not epoch % config.save_model_every:
                        model_path = './builds/' + timestamp
                        saver.save(sess, model_path, global_step=epoch)
            training_set.shuffle_lines()
            if not epoch % config.save_model_every:
                model_path = './builds/' + timestamp + '/model'
                saver.save(sess, model_path, global_step=epoch)
Example #30
vec_size_alpha = 300
runs = 2

np.random.seed(42)
seeds = np.random.permutation(list(range(runs)))

reference_p = ['0', 'u']

plot = 0
#alphas = np.concatenate([np.arange(-10,-3,0.5),np.arange(-3, 3, 0.1), np.arange(3,10,0.5)])
stats = 0

#############################################

if plot == 0 and stats == 0:
    D_glove, V_glove, D_pretrained_glove, V_pretrained_glove = load_embedding()


# https://gitlab.com/praj88/deepsentiment/blob/master/train_code/utility_functions.py
# Combine and split the data into train and test
def read_senti(path):
    # read dictionary into df
    df_data_sentence = pd.read_table(path + 'dictionary.txt')
    df_data_sentence_processed = df_data_sentence['Phrase|Index'].str.split(
        '|', expand=True)
    df_data_sentence_processed = df_data_sentence_processed.rename(
        columns={
            0: 'Phrase',
            1: 'phrase_ids'
        })