示例#1
0
def get_model(args):
    """Instantiate the network selected by ``args.model_type``.

    ``'baseline'`` is built from ``args.num_layers`` and
    ``args.num_channels``; every other supported type is built from
    ``args.num_channels`` alone.

    Raises:
        NameError: if ``args.model_type`` is not one of the supported names.
    """
    kind = args.model_type

    # Zero-argument factories keep construction lazy: only the selected
    # class, and only the ``args`` attributes it needs, are ever touched.
    factories = {
        'baseline':
        lambda: BaselineModel(num_layers=args.num_layers,
                              num_channels=args.num_channels),
        'parallel':
        lambda: ParallelDilationModel(num_channels=args.num_channels),
        'parallel_aggregate':
        lambda: ParallelAggregationModel(num_channels=args.num_channels),
        'parallel_aggregate_test':
        lambda: ParallelAggregationModelTest(num_channels=args.num_channels),
        'parallel_aggregate_4_test':
        lambda: ParallelAggregationModel4ColTest(
            num_channels=args.num_channels),
        'parallel_aggregate_3_test':
        lambda: ParallelAggregationModel3ColTest(
            num_channels=args.num_channels),
        'parallel_aggregate_2_test':
        lambda: ParallelAggregationModel2ColTest(
            num_channels=args.num_channels),
        'parallel_aggregate_1_test':
        lambda: ParallelAggregationModel1ColTest(
            num_channels=args.num_channels),
        'parallel_noaggregate_test':
        lambda: ParallelNoAggregationModelTest(
            num_channels=args.num_channels),
        'parallel_noaggregate_4_test':
        lambda: ParallelNoAggregationModel4ColTest(
            num_channels=args.num_channels),
        'parallel_noaggregate_3_test':
        lambda: ParallelNoAggregationModel3ColTest(
            num_channels=args.num_channels),
        'parallel_noaggregate_2_test':
        lambda: ParallelNoAggregationModel2ColTest(
            num_channels=args.num_channels),
        'parallel_noaggregate_1_test':
        lambda: ParallelNoAggregationModel1ColTest(
            num_channels=args.num_channels),
    }

    factory = factories.get(kind)
    if factory is None:
        raise NameError('unknown model type: {}'.format(kind))
    return factory()
示例#2
0
def get_network(network_type):
    '''
    builds and returns a new model instance for ``network_type``
    (the original author describes the weights as randomly initialized;
    the initialization itself happens inside the model constructors)

    the dataset dictionary is always fetched first through
    ``data_loader_factory``; ``None`` is returned for
    ``STACKED_CO_ATTENTION`` and ``MEMORY_NETWORK`` (no model is built for
    them here) and for any type that is not listed below
    '''
    dictionary = data_loader_factory.get_dataset_dictionary(config.DATALOADER_TYPE)

    if network_type == ModelType.BASELINE:
        return BaselineModel(dictionary)
    if network_type == ModelType.RELATION_NETWORK:
        return RelNet(dictionary, config.RELATION_NETWORK_DICTIONARY)
    if network_type == ModelType.FILM:
        return FiLM(dictionary)
    if network_type == ModelType.MOD_FILM:
        return mod_FiLM(dictionary)
    if network_type == ModelType.BN_MOD_FILM:
        return bn_mod_FiLM(dictionary)

    # placeholders: these two types are recognised but build nothing
    if network_type == ModelType.STACKED_CO_ATTENTION:
        return None
    if network_type == ModelType.MEMORY_NETWORK:
        return None

    if network_type == ModelType.RELATION_GROUP_ATTENTION_STANDARD:
        return RelNetGroupAttentionStandard(dictionary, config.RELATION_GROUP_ATTENTION_NETWORK_DICTIONARY)
    if network_type == ModelType.RELATION_GROUP_ATTENTION_ALTERNATE:
        return RelNetGroupAttentionAlternate(dictionary, config.RELATION_GROUP_ATTENTION_NETWORK_DICTIONARY)
    if network_type == ModelType.RELATION_GROUP_ATTENTION_SELF:
        return RelNetGroupAttentionSelf(dictionary, config.RELATION_GROUP_ATTENTION_NETWORK_DICTIONARY)
    if network_type == ModelType.RELATION_NETWORK_BATCH_NORM:
        return RelNetBatchNorm(dictionary, config.RELATION_NETWORK_DICTIONARY)
    if network_type == ModelType.RELATION_NETWORK_CONV_ATTENTION:
        return RelNetConvAttention(dictionary, config.RELATION_NETWORK_DICTIONARY)

    # unknown type: same implicit result as before, now spelled out
    return None
示例#3
0
def baseline(percent):
    """Score rooms 1..189 with ``BaselineModel`` and evaluate the result.

    The scores of the rooms that are *not* in ``elizabeth_known`` are
    ranked, and the score found at the ``percent`` position of that ranking
    becomes the decision threshold: every room whose score is at or above
    it is labelled 1, the rest 0. For the rooms that *are* in
    ``elizabeth_known`` the raw scores are compared against ``Y`` and the
    RMS and mean absolute errors are printed. Finally the binary labels
    are handed to ``check_answers`` together with ``answers``.

    Args:
        percent: position of the threshold in the ranking, on a 0-100 scale.
    """
    X, Y, X_train, Y_train, answers = split_data()
    scores = BaselineModel(X, X_train, Y_train).predict(X)

    # Room ids are 1-based while the score vector is indexed from 0.
    scored_rooms = [(room_id, scores[room_id - 1])
                    for room_id in range(1, 190)]

    unknown_rooms = [
        room for room in scored_rooms if room[0] not in elizabeth_known
    ]
    fraction = percent / 100.0
    cutoff = int((len(unknown_rooms) - 1) * fraction)
    threshold = sorted(unknown_rooms, key=lambda room: room[1])[cutoff][1]

    preds = [
        1 if score >= threshold else 0
        for _, score in sorted(scored_rooms, key=lambda room: room[0])
    ]

    known_rooms = sorted(
        (room for room in scored_rooms if room[0] in elizabeth_known),
        key=lambda room: room[0])
    limited_answers = [Y[room_id - 1] for room_id, _ in known_rooms]
    known_preds = [score for _, score in known_rooms]

    rms = sqrt(mean_squared_error(limited_answers, known_preds))
    print("rms: ", rms)
    mae = mean_absolute_error(limited_answers, known_preds)
    print("mae: ", mae)

    stats = check_answers(preds, answers)
示例#4
0
        # Framing parameters from the command line (window size is in ms).
        window_size = args.window_size  # in ms
        step_size = args.step_size
        # Input width derived from the window length; the 16000 factor is
        # presumably a 16 kHz sample rate and the "/ 2 + 1" a one-sided
        # spectrum size -- TODO confirm against the feature extraction code.
        n_input = int(1e-3 * window_size * 16000 / 2 + 1)
        n_output = n_input

        # Build the requested architecture and move it to the GPU. The three
        # block-based models share one constructor signature; the baseline
        # takes no ``num_blocks``.
        if args.model_type == 'residual':
            model = ResidualModel(n_input, args.num_blocks, args.num_hidden,
                                  args.num_layers_per_block).cuda()
        elif args.model_type == 'highway':
            model = HighwayModel(n_input, args.num_blocks, args.num_hidden,
                                 args.num_layers_per_block).cuda()
        elif args.model_type == 'masking':
            model = MaskingModel(n_input, args.num_blocks, args.num_hidden,
                                 args.num_layers_per_block).cuda()
        elif args.model_type == 'baseline':
            model = BaselineModel(n_input, args.num_hidden,
                                  args.num_layers_per_block).cuda()
        else:
            # The message now lists every accepted value; "masking" used to
            # be missing even though it is handled above.
            raise ValueError(
                'model_type has to be either "residual", "highway", '
                '"masking", or "baseline"'
            )

        print(model)

        # Mean-squared-error objective optimised with Adam.
        criterion = torch.nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

        if args.data_type == 'reverb':
            print('Loading reverb dataset')
            G_train, G_val = load_data(window_size, step_size, args.use_log)
        elif args.data_type == 'noisy':
            print('Loading noisy dataset')
示例#5
0
def test(args):
    """Evaluate the saved model on the test set and persist the results.

    Args:
        args: mapping of command-line options. The keys read here are
            ``--cuda``, ``--model-path``, ``--test`` and ``--verbose``.

    Side effects:
        * loads the loss object previously saved as ``loss_func``;
        * pickles the predicted label names to ``test_prediction``;
        * pickles a metrics dictionary (loss, accuracy, Matthews
          coefficient and per-label precision / recall / F1 / ROC-AUC) to
          ``evaluation_metrics``;
        * saves a raw and a normalised confusion-matrix figure as PNG;
        * prints every metric when ``--verbose`` is set.
    """
    label_name = [
        'not related or not informative', 'other useful information',
        'donations and volunteering', 'affected individuals',
        'sympathy and support', 'infrastructure and utilities damage',
        'caution and advice'
    ]

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

    print('load best model...')
    model = BaselineModel.load(args['--model-path'], device)
    model.to(device)

    model.eval()

    df_test = pd.read_csv(args['--test'], index_col=0)
    true_labels = df_test.InformationType_label.values

    # NOTE(review): torch.load unpickles arbitrary objects; 'loss_func' must
    # come from a trusted training run.
    cn_loss = torch.load('loss_func')
    sents = [text.split(' ') for text in df_test.ProcessedText]

    with torch.no_grad():
        pre_softmax = model(sents)
        loss = cn_loss(
            pre_softmax,
            torch.tensor(true_labels, dtype=torch.long, device=device))

        softmax = torch.nn.Softmax(dim=1)
        prob = softmax(pre_softmax)
        prediction = torch.argmax(prob, dim=1).tolist()

    # Context managers make sure the output files are flushed and closed.
    with open('test_prediction', 'wb') as prediction_file:
        pickle.dump([label_name[i] for i in prediction], prediction_file)

    accuracy = accuracy_score(true_labels, prediction)
    matthews = matthews_corrcoef(true_labels, prediction)

    precisions = {}
    recalls = {}
    f1s = {}
    aucrocs = {}

    # One-vs-rest metrics: each class in turn is treated as the positive one.
    for i, name in enumerate(label_name):
        prediction_ = [1 if pred == i else 0 for pred in prediction]
        true_ = [1 if label == i else 0 for label in true_labels]
        f1s[name] = f1_score(true_, prediction_)
        precisions[name] = precision_score(true_, prediction_)
        recalls[name] = recall_score(true_, prediction_)
        aucrocs[name] = roc_auc_score(true_, prob[:, i].tolist())

    metrics_dict = {
        'loss': loss,
        'accuracy': accuracy,
        'matthews coef': matthews,
        'precision': precisions,
        'recall': recalls,
        'f1': f1s,
        'aucroc': aucrocs
    }

    with open('evaluation_metrics', 'wb') as metrics_file:
        pickle.dump(metrics_dict, metrics_file)

    plot_confusion_matrix(list(true_labels),
                          prediction,
                          label_name,
                          normalize=False,
                          path='test_confusion_matrix',
                          title='confusion matrix for test dataset')
    plt.savefig('test_confusion_matrix', format='png')
    plot_confusion_matrix(
        list(true_labels),
        prediction,
        label_name,
        normalize=True,
        path='test normalized_confusion_matrix',
        title='normalized confusion matrix for test dataset')
    plt.savefig('test_normalized_confusion_matrix', format='png')

    if args['--verbose']:
        print('loss: %.2f' % loss)
        print('accuracy: %.2f' % accuracy)
        print('matthews coef: %.2f' % matthews)
        for name in label_name:
            print('precision score for %s: %.2f' % (name, precisions[name]))
            print('recall score for %s: %.2f' % (name, recalls[name]))
            print('f1 score for %s: %.2f' % (name, f1s[name]))
            print('auc roc score for %s: %.2f' % (name, aucrocs[name]))
示例#6
0
def train(args):
    """Train ``BaselineModel`` with early stopping and learning-rate decay.

    Args:
        args: mapping of command-line options. The keys read here are
            ``--cuda``, ``--vocab``, ``--embeddings``, ``--train``, ``--dev``,
            ``--hidden-size``, ``--dropout``, ``--lr``, ``--batch-size``,
            ``--clip-grad``, ``--valid-niter``, ``--log-every``,
            ``--save-to``, ``--patience``, ``--max-num-trial``,
            ``--lr-decay`` and ``--max-epoch``.

    The loop logs every ``--log-every`` iterations and validates every
    ``--valid-niter`` iterations. A new best validation loss saves the model
    (and the optimizer state under ``<save-to>.optim``). After ``--patience``
    validations without improvement the best checkpoint is restored and the
    learning rate is multiplied by ``--lr-decay``.

    The function never returns normally: it leaves through ``sys.exit(0)``
    once ``--max-num-trial`` decays have happened or, at a validation step,
    once ``--max-epoch`` is reached.

    Side effects: saves the class-weighted loss object to ``loss_func`` so
    the test routine can reload it; progress is written to stderr.
    """
    label_name = ['0', '1']

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")

    start_time = time.time()
    print('Initializing Glove vocab and embeddings...', file=sys.stderr)
    # NOTE(review): pickle.load can execute arbitrary code; the vocab file
    # must come from a trusted source.
    with open(args['--vocab'], 'rb') as vocab_file:
        glove_word2id = pickle.load(vocab_file)
    # Two extra ids are appended for the unknown and padding tokens ...
    glove_word2id.update({'<unk>': len(glove_word2id)})
    glove_word2id.update({'<pad>': len(glove_word2id)})
    vocab = VocabEntry(glove_word2id)

    with open(args['--embeddings'], 'rb') as embedding_file:
        embedding_matrix = np.load(embedding_file)
    # ... and two matching rows, drawn uniformly from the value range of the
    # existing embeddings, are stacked under the matrix.
    embedding_matrix = np.vstack(
        (embedding_matrix,
         np.random.uniform(embedding_matrix.min(), embedding_matrix.max(),
                           (2, embedding_matrix.shape[1]))))
    glove_embeddings = torch.tensor(embedding_matrix,
                                    dtype=torch.float,
                                    device=device)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time),
          file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    start_time = time.time()
    print('Importing data...', file=sys.stderr)
    df_train = pd.read_csv(args['--train'], index_col=0)
    df_val = pd.read_csv(args['--dev'], index_col=0)
    # Class weights are inversely proportional to class frequency; the
    # indexing assumes labels are the integers 0..n-1.
    train_label = dict(df_train.InformationType_label.value_counts())
    label_max = float(max(train_label.values()))
    train_label_weight = torch.tensor(
        [label_max / train_label[i] for i in range(len(train_label))],
        device=device)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time),
          file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    start_time = time.time()
    print('Set up model...', file=sys.stderr)

    model = BaselineModel(hidden_size=int(args['--hidden-size']),
                          embedding=glove_embeddings,
                          vocab=vocab,
                          n_class=len(label_name),
                          dropout_rate=float(args['--dropout']))
    model = model.to(device)
    print('Use device: %s' % device, file=sys.stderr)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time),
          file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))
    cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight.float(),
                                        reduction='mean')
    torch.save(cn_loss, 'loss_func')  # for later testing

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = 0
    cum_examples = report_examples = epoch = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('Begin Maximum Likelihood training...')

    while True:
        epoch += 1

        for sents, targets in batch_iter(df_train,
                                         batch_size=train_batch_size,
                                         shuffle=True):  # for each epoch
            train_iter += 1

            optimizer.zero_grad()

            batch_size = len(sents)

            pre_softmax = model(sents)
            loss = cn_loss(
                pre_softmax,
                torch.tensor(targets, dtype=torch.long, device=device))

            loss.backward()

            # clip gradient
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)

            optimizer.step()

            # The loss is a batch mean; scale it back to a sum so the
            # running averages below are per example.
            batch_losses_val = loss.item() * batch_size
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, '
                      'cum. examples %d, speed %.2f examples/sec, '
                      'time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples,
                       cum_examples, report_examples /
                       (time.time() - train_time), time.time() - begin_time),
                      file=sys.stderr)

                train_time = time.time()
                report_loss = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. examples %d' %
                    (epoch, train_iter, cum_loss / cum_examples, cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = 0.

                print('begin validation ...', file=sys.stderr)

                validation_loss = validation(
                    model, df_val, cn_loss,
                    device)  # dev batch size can be a bit larger

                print('validation: iter %d, loss %f' %
                      (train_iter, validation_loss),
                      file=sys.stderr)

                is_better = len(
                    hist_valid_scores
                ) == 0 or validation_loss < min(hist_valid_scores)
                hist_valid_scores.append(validation_loss)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)

                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            sys.exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                # The epoch limit is only checked on validation iterations.
                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    sys.exit(0)
示例#7
0
def mainFunc(argv):
    """Parse the command line, then train and checkpoint the selected model.

    Args:
        argv: command-line arguments without the program name. Recognised
            options are ``-h/--help``, ``-n/--num_cores <int>``,
            ``-x/--experiment <baseline|attention>`` and ``-t/--tag <str>``.

    Behaviour:
        * ``-h`` prints the usage text and exits successfully.
        * An unknown option, an unknown experiment name or a missing
          ``-x`` prints the usage text and exits with status 2.
        * Otherwise the model is built (with attention enabled only for the
          ``attention`` experiment), trained for ``conf.num_epochs`` epochs
          and checkpointed. Training and validation summaries are written
          below ``conf.log_directory``.
    """
    def printUsage():
        print('main.py -n <num_cores> -x <experiment> [-t <tag>]')
        print(
            'num_cores = Number of cores requested from the cluster. Set to -1 to leave unset'
        )
        print(
            'experiment = experiment setup that should be executed. e.g \'baseline\' or \'attention\''
        )
        print(
            'tag = optional tag or name to distinguish the runs, e.g. \'bidirect3layers\' '
        )

    num_cores = -1
    experiment = ""
    tag = None
    # Command line argument handling. "h" has to be part of the option
    # spec, otherwise getopt rejects -h before the help branch can run.
    try:
        opts, _ = getopt.getopt(
            argv, "hn:x:t:", ["help", "num_cores=", "experiment=", "tag="])
    except getopt.GetoptError:
        printUsage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            printUsage()
            sys.exit()
        elif opt in ("-n", "--num_cores"):
            num_cores = int(arg)
        elif opt in ("-x", "--experiment"):
            if arg in ("baseline", "attention"):
                experiment = arg
            else:
                printUsage()
                sys.exit(2)
        elif opt in ("-t", "--tag"):
            tag = arg

    # The experiment is mandatory. Checking it here replaces a later
    # assert, which would be stripped when running under ``python -O``.
    if not experiment:
        printUsage()
        sys.exit(2)

    print("Executing experiment {} with {} CPU cores".format(
        experiment, num_cores))
    if num_cores != -1:
        # We set the op_parallelism_threads in the ConfigProto and pass it to the TensorFlow session
        configProto = tf.ConfigProto(inter_op_parallelism_threads=num_cores,
                                     intra_op_parallelism_threads=num_cores)
    else:
        configProto = tf.ConfigProto()

    print("Initializing model")
    # Both experiments use the same model class; only the attention flag
    # differs.
    model = BaselineModel(vocab_size=conf.vocabulary_size,
                          embedding_size=conf.word_embedding_size,
                          bidirectional=conf.bidirectional_encoder,
                          attention=(experiment == "attention"),
                          dropout=conf.use_dropout,
                          num_layers=conf.num_layers,
                          is_training=True)

    print("=== GETTING DATA BY TYPE = TRAIN ===")
    enc_inputs, dec_inputs, word_2_index, index_2_word = get_data_by_type(
        'train')
    print("***********")
    print("Encoder inputs length {}".format(len(enc_inputs)))
    print("Decoder inputs length {}".format(len(dec_inputs)))
    print("***********")
    # Materialize validation data
    print("=== GETTING DATA BY TYPE = EVAL ===")
    validation_enc_inputs, validation_dec_inputs, _, _ = get_data_by_type(
        'eval')
    validation_data = list(
        bucket_by_sequence_length(validation_enc_inputs,
                                  validation_dec_inputs,
                                  conf.batch_size,
                                  filter_long_sent=False))

    print("Starting TensorFlow session")
    with tf.Session(config=configProto) as sess:
        global_step = 1

        saver = tf.train.Saver(max_to_keep=3, keep_checkpoint_every_n_hours=4)

        # Init Tensorboard summaries. This will save Tensorboard information into a different folder at each run.
        timestamp = '{0:%Y-%m-%d_%H-%M-%S}'.format(datetime.datetime.now())
        tag_string = "" if tag is None else "-" + tag
        train_logfolderPath = os.path.join(
            conf.log_directory,
            "{}{}-training-{}".format(experiment, tag_string, timestamp))
        # Joined the same way as the training folder, so the validation
        # logs land inside conf.log_directory even when that path has no
        # trailing separator.
        validation_logfolderPath = os.path.join(
            conf.log_directory,
            "{}{}-validation-{}".format(experiment, tag_string, timestamp))
        train_writer = tf.summary.FileWriter(train_logfolderPath,
                                             graph=tf.get_default_graph())
        validation_writer = tf.summary.FileWriter(
            validation_logfolderPath, graph=tf.get_default_graph())

        try:
            # Copies the current config.py to the log directory
            copy_config(train_logfolderPath)
            sess.run(tf.global_variables_initializer())

            if conf.use_word2vec:
                print("Using word2vec embeddings")
                if not os.path.isfile(conf.word2vec_path):
                    train_embeddings(
                        save_to_path=conf.word2vec_path,
                        embedding_size=conf.word_embedding_size,
                        minimal_frequency=conf.word2vec_min_word_freq,
                        train_tuples_path=TRAINING_FILEPATH,
                        validation_path=None,
                        num_workers=conf.word2vec_workers_count)
                print("Loading word2vec embeddings")
                load_embedding(sess, get_or_create_vocabulary(),
                               model.embedding_matrix, conf.word2vec_path,
                               conf.word_embedding_size, conf.vocabulary_size)
            # No graph changes are allowed past this point.
            sess.graph.finalize()
            print("Starting training")
            for i in range(conf.num_epochs):
                print("Training epoch {}".format(i))
                for data_batch, data_sentence_lengths, label_inputs_batch, label_targets_batch, label_sentence_lengths in tqdm(
                        bucket_by_sequence_length(enc_inputs, dec_inputs,
                                                  conf.batch_size),
                        total=ceil(len(enc_inputs) / conf.batch_size)):
                    feed_dict = model.make_train_inputs(
                        data_batch, data_sentence_lengths, label_inputs_batch,
                        label_targets_batch, label_sentence_lengths)
                    # A full trace is recorded only every
                    # conf.trace_frequency steps.
                    trace_step = global_step % conf.trace_frequency == 0
                    run_options = None
                    run_metadata = None
                    if trace_step:
                        run_options = tf.RunOptions(
                            trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                    _, train_summary = sess.run(
                        [model.train_op, model.summary_op],
                        feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    if trace_step:
                        train_writer.add_run_metadata(
                            run_metadata, "step{}".format(global_step))
                    train_writer.add_summary(train_summary, global_step)

                    if global_step % conf.validation_summary_frequency == 0:
                        # Randomly choose a batch from the validation dataset and use it for loss calculation
                        vali_data_batch, vali_data_sentence_lengths, vali_label_inputs_batch, vali_label_targets_batch, vali_label_sentence_lengths = choice(
                            validation_data)
                        validation_feed_dict = model.make_train_inputs(
                            vali_data_batch,
                            vali_data_sentence_lengths,
                            vali_label_inputs_batch,
                            vali_label_targets_batch,
                            vali_label_sentence_lengths,
                            keep_prob=1.0)
                        validation_summary = sess.run(
                            model.validation_summary_op, validation_feed_dict)
                        validation_writer.add_summary(validation_summary,
                                                      global_step)

                    if global_step % conf.checkpoint_frequency == 0:
                        saver.save(sess,
                                   os.path.join(
                                       train_logfolderPath,
                                       "{}{}-{}-ep{}.ckpt".format(
                                           experiment, tag_string, timestamp,
                                           i)),
                                   global_step=global_step)
                    global_step += 1

            saver.save(
                sess,
                os.path.join(
                    train_logfolderPath,
                    "{}{}-{}-ep{}-final.ckpt".format(experiment, tag_string,
                                                     timestamp,
                                                     conf.num_epochs)))
            print("Done with training for {} epochs".format(conf.num_epochs))
        finally:
            # Flush pending summaries to disk even if training aborts.
            train_writer.close()
            validation_writer.close()
示例#8
0
    # Inference-only path: load a previously saved model, decode the test
    # set batch by batch, print one output sentence per line and stop.
    if args.load:
        # NOTE(review): torch.load unpickles the whole model object; only
        # load checkpoints that come from a trusted source.
        model = torch.load(args.load, map_location=device)

        vocab = model.vocab

        test_source, _test_target = read_data(args.test)
        test_source = index_data(test_source, vocab)
        for source in batchify_data(test_source):
            for words in model.decode(source):
                print(' '.join(words))
        sys.exit(0)

    # Training path: build the architecture requested with --model.
    if args.model == 'baseline':
        model = BaselineModel(vocab).to(device)
    elif args.model == 'transformer':
        model = TransformerModel(vocab).to(device)
    else:
        print('error: invalid model or model not specified (--model)',
              file=sys.stderr)
        # Report the failure to the caller; a bare sys.exit() would signal
        # success (status 0).
        sys.exit(1)

    # Xavier-uniform initialisation for every parameter with more than one
    # dimension; one-dimensional parameters keep their default init.
    for p in model.parameters():
        if p.dim() > 1:
            torch.nn.init.xavier_uniform_(p)

    # Targets equal to pad_id are ignored by the loss.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
    lr = 5  # learning rate
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    # Multiply the learning rate by 0.95 after every scheduler step.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
示例#9
0
def mainFunc(argv):
    """Restore a checkpoint and write per-sentence perplexities.

    Args:
        argv: command-line arguments without the program name. Recognised
            options are ``-h/--help``, ``-n/--num_cores <int>``,
            ``-x/--experiment <baseline|attention>``, ``-i/--input <file>``
            and ``-c/--checkpoint <path>``.

    Behaviour:
        * ``-h`` prints the usage text and exits successfully.
        * An unknown option, an unknown experiment name, an empty
          input / checkpoint value or a missing ``-x`` prints the usage
          text and exits with status 2.
        * Otherwise the model is restored from the checkpoint and one
          perplexity is computed per target sentence of the input corpus.
          Perplexities are printed to stdout and to
          ``perplexities_attention_antilm.out``, two values per line.

    The perplexity of a sentence is ``2 ** (-mean(log2 p))`` over the
    probabilities the model assigns to the ground-truth words.
    """
    def printUsage():
        print('perplexity.py -n <num_cores> -x <experiment> -i <input file> -c <checkpoint>')
        print('num_cores = Number of cores requested from the cluster. Set to -1 to leave unset')
        print('experiment = experiment setup that should be executed. e.g \'baseline\'')
        print('input = what dialogs to predict from. e.g \'./Dialog_Triples.txt\'')
        print('checkpoint = Path to the checkpoint to load parameters from. e.g. \'./logs/baseline-ep4-500\'')

    num_cores = -1
    experiment = ""
    checkpoint_filepath = ""
    input_filepath = ""
    # Command line argument handling. "h" has to be part of the option
    # spec, otherwise getopt rejects -h before the help branch can run.
    try:
        opts, _ = getopt.getopt(
            argv, "hn:x:c:i:",
            ["help", "num_cores=", "experiment=", "checkpoint=", "input="])
    except getopt.GetoptError:
        printUsage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            printUsage()
            sys.exit()
        elif opt in ("-n", "--num_cores"):
            num_cores = int(arg)
        elif opt in ("-x", "--experiment"):
            if arg in ("baseline", "attention"):
                experiment = arg
            else:
                printUsage()
                sys.exit(2)
        elif opt in ("-i", "--input"):
            if arg != "":
                input_filepath = arg
            else:
                printUsage()
                sys.exit(2)
        elif opt in ("-c", "--checkpoint"):
            if arg != "":
                checkpoint_filepath = arg
            else:
                printUsage()
                sys.exit(2)

    # The experiment is mandatory. Checking it here replaces a later
    # assert, which would be stripped when running under ``python -O``.
    if not experiment:
        printUsage()
        sys.exit(2)

    if num_cores != -1:
        # We set the op_parallelism_threads in the ConfigProto and pass it to the TensorFlow session
        configProto = tf.ConfigProto(inter_op_parallelism_threads=num_cores,
                                     intra_op_parallelism_threads=num_cores)
    else:
        configProto = tf.ConfigProto()

    # Both experiments use the same model class; only the attention flag
    # differs.
    model = BaselineModel(vocab_size=conf.vocabulary_size,
                          embedding_size=conf.word_embedding_size,
                          bidirectional=conf.bidirectional_encoder,
                          attention=(experiment == "attention"),
                          dropout=conf.use_dropout,
                          num_layers=conf.num_layers,
                          is_training=False)

    with tf.Session(config=configProto) as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, checkpoint_filepath)

        triples_to_tuples(input_filepath, testing_path)
        w2i, _ = get_w2i_i2w_dicts()
        vocabulary = get_vocabulary()
        enc_inputs, dec_inputs = apply_w2i_to_corpus_tuples(load_testing_tuples(), vocabulary, w2i)

        validation_input_lengths = {len(x) for x in enc_inputs}
        lm_logits_dict = construct_lm_logits(sess, model, validation_input_lengths)

        # Perplexities are emitted in pairs: the first of a pair ends with
        # a space, the second ends the line.
        is_first_tuple = True
        with open("perplexities_attention_antilm.out", 'w') as pplf:
            for data_batch, data_sentence_lengths, label_inputs_batch, label_targets_batch, label_sentence_lengths in bucket_by_sequence_length(
                    enc_inputs, dec_inputs, conf.batch_size, sort_data=False,
                    shuffle_batches=False, filter_long_sent=False):
                lm_logits_batch = construct_lm_logits_batch(lm_logits_dict, data_sentence_lengths)
                feed_dict = model.make_inference_inputs(data_batch, data_sentence_lengths, lm_logits_batch)

                softmax_predictions = sess.run(model.decoder_softmax_prediction, feed_dict)

                # Perplexity calculation
                for sentID, target_length in enumerate(label_sentence_lengths):
                    # Stop at whichever ends first: the target sentence or
                    # the predicted sequence.
                    n_words = min(target_length, softmax_predictions.shape[1])
                    word_probs = [
                        softmax_predictions[sentID, word_index,
                                            label_targets_batch[sentID, word_index]]
                        for word_index in range(n_words)
                    ]

                    # Base-2 logs to match the base-2 exponent below;
                    # a natural log here would not yield a perplexity.
                    log_probs = np.log2(word_probs)

                    perplexity = 2**(-1.0 * log_probs.mean())

                    if is_first_tuple:
                        print(perplexity, end=' ')
                        print(perplexity, end=' ', file=pplf)
                        is_first_tuple = False
                    else:
                        print(perplexity)
                        print(perplexity, file=pplf)
                        is_first_tuple = True
示例#10
0
def best_cell_and_baseline_hyperparameters(train_dataloader, validate_dataloader, test_dataloader, embedding):
    """Sweep one hyperparameter at a time for the baseline and the recurrent model.

    For every model type and every hyperparameter that model type supports,
    each candidate value replaces the corresponding entry of that model's
    default configuration while all other settings keep their defaults.  A
    fresh model is built from the resulting configuration, trained for
    ``args.epochs`` epochs (evaluating on the validation loader after each
    epoch), scored once on the test loader, and the configuration plus metrics
    and wall-clock time are collected.  All collected rows are finally handed
    to ``print_to_file``.

    Args:
        train_dataloader: Loader forwarded to the model's ``train`` function.
        validate_dataloader: Loader evaluated after every training epoch.
        test_dataloader: Loader used for the final reported metrics.
        embedding: Embedding object forwarded to ``train`` and ``evaluate``.

    Note:
        Reads the module-level ``args`` object for ``clip`` and ``epochs``.
    """
    configs = []

    # "//" is stored for settings the respective model constructor does not
    # receive; it only shows up in the printed/exported configuration.
    rnn_defaults = {
        "model": "LSTM",
        "hidden_size": 30,
        "num_layers": 3,
        "dropout": 0.9,
        "bidirectional": True,
        "fc1_width": "//",
        "fc2_width": "//",
        "vocab_size": -1,
        "lr": 0.0001,
        "optimizer": torch.optim.Adam,
    }

    baseline_defaults = {
        "model": "Baseline",
        "hidden_size": "//",
        "num_layers": "//",
        "dropout": "//",
        "bidirectional": "//",
        "fc1_width": 150,
        "fc2_width": 150,
        "vocab_size": -1,
        "lr": 0.0001,
        "optimizer": torch.optim.Adam,
    }

    # Candidate values, swept one hyperparameter at a time.
    # NOTE(review): "vocab_size" is only recorded in the configuration here;
    # nothing in this function rebuilds the data or embedding from it.
    hyperparameters = {
        "vocab_size": [50, 1000, 10000],
        "lr": [0.0001, 0.001, 0.01, 0.1],
        "dropout": [0, 0.2, 0.4, 0.6, 0.8, 1],
        "num_layers": [1, 3, 6],
        "hidden_size": [30, 100, 150, 200],
        "optimizer": [torch.optim.Adam, torch.optim.SGD, torch.optim.RMSprop],
    }

    # Which model types each hyperparameter is swept for.
    supports = {
        "vocab_size": [BaselineModel, RNN.RecurrentModel],
        "lr": [BaselineModel, RNN.RecurrentModel],
        "dropout": [RNN.RecurrentModel],
        "num_layers": [RNN.RecurrentModel],
        "hidden_size": [RNN.RecurrentModel],
        "optimizer": [BaselineModel, RNN.RecurrentModel],
    }

    initial_config = {
        "clip": args.clip,
        "epochs": args.epochs,
        "input_width": 300,
        "output_width": 1,
    }

    criterion = nn.BCEWithLogitsLoss()

    for model_type in (BaselineModel, RNN.RecurrentModel):
        is_recurrent = model_type == RNN.RecurrentModel

        for key, values in hyperparameters.items():
            # Skip this hyperparameter if the model does not support it.
            if model_type not in supports[key]:
                continue

            for value in values:
                start = time.time()

                # Assemble the full configuration *before* building the model,
                # so that architecture-level overrides (dropout, num_layers,
                # hidden_size) actually reach the constructor instead of only
                # being written into the report.
                config = dict(rnn_defaults if is_recurrent else baseline_defaults)
                config.update(initial_config)
                config[key] = value

                if is_recurrent:
                    train = RNN.train
                    evaluate = RNN.evaluate
                    model = RNN.RecurrentModel(config["model"], config["input_width"],
                                               config["hidden_size"],
                                               config["output_width"], config["num_layers"],
                                               config["bidirectional"], config["dropout"])
                else:
                    train = baseline.train
                    evaluate = baseline.evaluate
                    model = BaselineModel(config["input_width"], config["fc1_width"],
                                          config["fc2_width"], config["output_width"])

                print(config)

                optimizer = config["optimizer"](model.parameters(), lr=config["lr"])

                for epoch in range(args.epochs):
                    print(f'\nEpoch: {epoch}')
                    train(model, train_dataloader, optimizer, criterion, embedding, args.clip)
                    evaluate(model, validate_dataloader, criterion, embedding)

                accuracy, f1, confusion_matrix = evaluate(model, test_dataloader, criterion, embedding)
                config["accuracy"] = accuracy.item()
                config["f1"] = f1.item()
                config["TP"] = confusion_matrix[0, 0].item()
                config["FP"] = confusion_matrix[0, 1].item()
                config["FN"] = confusion_matrix[1, 0].item()
                config["TN"] = confusion_matrix[1, 1].item()

                config["time"] = time.time() - start
                # Store the optimizer as text so the row can be exported.
                config["optimizer"] = method_to_string(config["optimizer"])
                configs.append(config)

    print_to_file("5_final.xls", "RNN baseline hyperparameters", configs)
示例#11
0
def best_cell_and_baseline_with_and_without_pretrained(train_dataset, train_dataloader, validate_dataloader,
                                                       test_dataloader, clip):
    """Compare the baseline and the LSTM model with and without pretrained embeddings.

    The experiment runs twice: first with an embedding matrix built without a
    vector file, then with the file configured under
    ``util.config["glove_file_path"]``.  In each run a freshly initialised
    baseline model and LSTM model are trained for ``args.epochs`` epochs with
    Adam (lr=1e-4), evaluated on the validation loader after every epoch and
    scored once on the test loader.  The collected configurations and metrics
    are written out with ``print_to_file``.

    Args:
        train_dataset: Dataset whose ``text_vocab`` is used to build the
            embedding matrix.
        train_dataloader: Loader forwarded to the model's ``train`` function.
        validate_dataloader: Loader evaluated after every training epoch.
        test_dataloader: Loader used for the final reported metrics.
        clip: Gradient clipping setting forwarded to ``train``; either the
            value itself or an object exposing it as a ``clip`` attribute.

    Note:
        Reads the module-level ``args`` object for ``clip`` and ``epochs``.
    """
    configs = []

    # "//" is stored for settings the respective model constructor does not
    # receive; it only shows up in the printed/exported configuration.
    rnn_config = {
        "model": "LSTM",
        "hidden_size": 30,
        "num_layers": 3,
        "dropout": 0.9,
        "bidirectional": True,
        "fc1_width": "//",
        "fc2_width": "//",
    }

    baseline_config = {
        "model": "Baseline",
        "hidden_size": "//",
        "num_layers": "//",
        "dropout": "//",
        "bidirectional": "//",
        "fc1_width": 150,
        "fc2_width": 150,
    }

    initial_config = {
        "clip": args.clip,
        "epochs": args.epochs,
        "input_width": 300,
        "output_width": 1,
    }

    # Accept both a plain clipping value and a namespace carrying `.clip`,
    # instead of failing with AttributeError on the former.
    clip_value = getattr(clip, "clip", clip)

    criterion = nn.BCEWithLogitsLoss()

    for use_embedding in (False, True):
        file_path = util.config["glove_file_path"] if use_embedding else None
        embedding_matrix = util.embedding(train_dataset.text_vocab, file_path)
        # Freeze only when vectors were actually loaded from a file for this run.
        embedding = torch.nn.Embedding.from_pretrained(embedding_matrix, padding_idx=0,
                                                       freeze=file_path is not None)

        # Build new models for every embedding setting; reusing the instances
        # would let the second run start from weights trained in the first one
        # and make the comparison meaningless.
        lstm = RNN.RecurrentModel(rnn_config["model"], initial_config["input_width"],
                                  rnn_config["hidden_size"], initial_config["output_width"],
                                  rnn_config["num_layers"], rnn_config["bidirectional"],
                                  rnn_config["dropout"])
        base = BaselineModel(initial_config["input_width"], baseline_config["fc1_width"],
                             baseline_config["fc2_width"], initial_config["output_width"])

        for model in (base, lstm):
            start = time.time()

            if isinstance(model, RNN.RecurrentModel):
                config = dict(rnn_config)
                train = RNN.train
                evaluate = RNN.evaluate
            else:
                config = dict(baseline_config)
                train = baseline.train
                evaluate = baseline.evaluate

            config.update(initial_config)
            config["pretrained"] = use_embedding

            print(config)

            optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

            for epoch in range(args.epochs):
                print(f'----------------------------\nEpoch: {epoch}')
                train(model, train_dataloader, optimizer, criterion, embedding, clip_value)
                evaluate(model, validate_dataloader, criterion, embedding)

            accuracy, f1, confusion_matrix = evaluate(model, test_dataloader, criterion, embedding)
            config["accuracy"] = accuracy.item()
            config["f1"] = f1.item()
            config["TP"] = confusion_matrix[0, 0].item()
            config["FP"] = confusion_matrix[0, 1].item()
            config["FN"] = confusion_matrix[1, 0].item()
            config["TN"] = confusion_matrix[1, 1].item()

            config["time"] = time.time() - start
            configs.append(config)

    print_to_file("4a_pretrained.xls", "Best RNN and baseline", configs)
示例#12
0
def mainFunc(argv):
    """Restore a trained model from a checkpoint and write its predictions to a file.

    Parses the command line, builds a ``BaselineModel`` in inference mode
    (with attention enabled for the ``attention`` experiment), restores the
    given checkpoint and runs the model over the data returned by
    ``get_data_by_type('eval')``.  For every sentence the input, the ground
    truth and the prediction are written to the output file as words.

    Args:
        argv: Command line arguments without the program name.  Supported
            options: ``-n/--num_cores``, ``-x/--experiment``,
            ``-c/--checkpoint``, ``-o/--output`` and ``-h/--help``.

    Raises:
        SystemExit: With status 2 on malformed, invalid or missing required
            options (experiment, checkpoint, output); with a success status
            after printing the usage for ``-h``/``--help``.
    """
    def printUsage():
        print('predict.py -n <num_cores> -x <experiment> -o <output file> -c <checkpoint>')
        print('num_cores = Number of cores requested from the cluster. Set to -1 to leave unset')
        print('experiment = experiment setup that should be executed. e.g \'baseline\'')
        print('checkpoint = Path to the checkpoint to load parameters from. e.g. \'./logs/baseline-ep4-500\'')
        print('output = where to write the prediction outputs to. e.g \'./predictions.out\'')

    def usageError():
        # Print the usage text and abort with the conventional CLI error status.
        printUsage()
        sys.exit(2)

    def maptoword(sentence):
        # `index_2_word` is bound further down, before this helper is first called.
        return " ".join(index_2_word[idx] for idx in sentence)

    num_cores = -1
    experiment = ""
    checkpoint_filepath = ""
    output_filepath = ""

    # Command line argument handling. "h"/"help" must be declared here,
    # otherwise getopt rejects -h before the help branch can ever run.
    try:
        opts, _ = getopt.getopt(argv, "hn:x:c:o:",
                                ["help", "num_cores=", "experiment=", "checkpoint=", "output="])
    except getopt.GetoptError:
        usageError()

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            printUsage()
            sys.exit()
        elif opt in ("-n", "--num_cores"):
            try:
                num_cores = int(arg)
            except ValueError:
                usageError()
        elif opt in ("-x", "--experiment"):
            if arg not in ("baseline", "attention"):
                usageError()
            experiment = arg
        elif opt in ("-o", "--output"):
            if arg == "":
                usageError()
            output_filepath = arg
        elif opt in ("-c", "--checkpoint"):
            if arg == "":
                usageError()
            checkpoint_filepath = arg

    # All three are required; fail early with the usage text instead of
    # crashing later on a missing model, checkpoint or output path.
    if not (experiment and checkpoint_filepath and output_filepath):
        usageError()

    print("Executing experiment {} with {} CPU cores".format(experiment, num_cores))
    print("Loading checkpoint from {}".format(checkpoint_filepath))
    if num_cores != -1:
        # We set the op_parallelism_threads in the ConfigProto and pass it to the TensorFlow session
        configProto = tf.ConfigProto(inter_op_parallelism_threads=num_cores,
                                     intra_op_parallelism_threads=num_cores)
    else:
        configProto = tf.ConfigProto()

    print("Initializing model")
    # The two experiments differ only in whether attention is enabled.
    model = BaselineModel(vocab_size=conf.vocabulary_size,
                          embedding_size=conf.word_embedding_size,
                          bidirectional=conf.bidirectional_encoder,
                          attention=(experiment == "attention"),
                          dropout=conf.use_dropout,
                          num_layers=conf.num_layers,
                          is_training=False)

    # Materialize validation data
    validation_enc_inputs, validation_dec_inputs, word_2_index, index_2_word = get_data_by_type('eval')

    validation_input_lengths = {len(sentence) for sentence in validation_enc_inputs}
    with tf.Session(config=configProto) as sess:
        global_step = 1
        sent_count = 1
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, checkpoint_filepath)

        print("Constructing language model")
        lm_logits_dict = construct_lm_logits(sess, model, validation_input_lengths)

        print("Using network to predict sentences..")
        with open(output_filepath, 'w') as out:
            batches = bucket_by_sequence_length(validation_enc_inputs, validation_dec_inputs, conf.batch_size,
                                                sort_data=False, shuffle_batches=False, filter_long_sent=False)
            num_batches = ceil(len(validation_enc_inputs) / conf.batch_size)
            for data_batch, data_sentence_lengths, _, label_targets_batch, label_sentence_lengths in tqdm(
                    batches, total=num_batches):

                lm_logits_batch = construct_lm_logits_batch(lm_logits_dict, data_sentence_lengths)
                feed_dict = model.make_inference_inputs(data_batch, data_sentence_lengths, lm_logits_batch)

                predictions = sess.run(model.decoder_prediction_inference, feed_dict)

                # Cut every sequence at its end-of-sentence marker; the encoder
                # input additionally goes through undo_input_reversal before
                # being printed.
                reversed_truncated_data = truncate_after_eos(data_batch)
                truncated_data = undo_input_reversal(reversed_truncated_data)
                truncated_labels = truncate_after_eos(label_targets_batch)
                truncated_predictions = truncate_after_eos(predictions)
                for enc, target, pred in zip(map(maptoword, truncated_data), map(maptoword, truncated_labels), map(maptoword, truncated_predictions)):
                    print("{}. Input:        {}".format(sent_count, enc), file=out)
                    print("{}. Ground Truth: {}".format(sent_count, target), file=out)
                    print("{}. Prediction:   {}".format(sent_count, pred), file=out)
                    sent_count += 1

                global_step += 1