示例#1
0
                        dest='verbose')
    # NOTE(review): this fragment begins mid-way through argparse setup;
    # the parser construction and enclosing function are outside this view.
    parser.add_argument('--optim',
                        help='Optimizer algorithm',
                        default='adagrad',
                        choices=['adagrad', 'adadelta', 'adam'])

    args = parser.parse_args()

    # Configure logging from the CLI verbosity flag and record the exact
    # command line used for this run.
    utils.config_logger(args.verbose)
    logger = utils.get_logger('train')
    logger.info('Training with following options: %s' % ' '.join(sys.argv))
    #train_pairs = ioutils.read_corpus(args.train, args.lower, args.lang)
    #valid_pairs = ioutils.read_corpus(args.validation, args.lower, args.lang)
    # whether to generate embeddings for unknown, padding, null
    # NOTE(review): the positional True presumably enables generation of the
    # unknown/padding/null vectors mentioned above — confirm against the
    # signature of ioutils.load_embeddings.
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                    args.vocab,
                                                    True,
                                                    normalize=True)
    # test
    #print(word_dict)
    print(embeddings)  # debug print left in place
    logger.info('Converting words to indices')
    # find out which labels are there in the data
    # (more flexible to different datasets)
    """
    label_dict = utils.create_label_dict(train_pairs)
    train_data = utils.create_dataset(train_pairs, word_dict, label_dict)
    valid_data = utils.create_dataset(valid_pairs, word_dict, label_dict)
    """
    # SSQA-specific label/dataset loading replaces the generic path above.
    label_dict = utils.create_label_dict_SSQA()
    train_data, labels, sents1, sents2 = utils.create_dataset_SSQA(
        args.train, word_dict, label_dict)
        # Dump every parsed CLI argument as a tab-separated key/value line.
        # NOTE(review): `f` is opened above this fragment's visible range;
        # a `with` block would be safer than the explicit close() below.
        for i in vars(args):
            f.write(str(i) + "\t" + str(vars(args)[i]) + "\n")
        f.close()
    else:
        # Resuming from a previously saved model: save into its directory.
        args.save = args.load

    utils.config_logger(args.verbose)
    logger = utils.get_logger('train')
    logger.info('Reading training data')
    # read_corpus returns (pairs, max_length); only the training split
    # receives args.ratio here.
    train_pairs, train_max = ioutils.read_corpus(args.train, args.lower, args.lang, args.ratio)
    logger.info('Reading validation data')
    valid_pairs, valid_max = ioutils.read_corpus(args.validation, args.lower, args.lang)
    logger.info('Reading test data')
    test_pairs, test_max = ioutils.read_corpus(args.test, args.lower, args.lang)
    logger.info('Reading word embeddings')
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab)
    # NOTE(review): max_len=None presumably means "no length cap" —
    # confirm against utils.create_dataset.
    max_len = None
    #print(train_pairs)
    #embeddings = utils.normalize_embeddings(embeddings)
    logger.debug('Embeddings have shape {} (including unknown, padding and null)'
                 .format(embeddings.shape))

    logger.info('Converting words to indices')
    # find out which labels are there in the data (more flexible to different datasets)
    label_dict = utils.create_label_dict(train_pairs)
    train_data = utils.create_dataset(train_pairs, word_dict, label_dict, max_len, max_len)
    valid_data = utils.create_dataset(valid_pairs, word_dict, label_dict, max_len, max_len)
    test_data = utils.create_dataset(test_pairs, word_dict, label_dict, max_len, max_len)

    #print(train_data.sizes1)
    # Persist the extra (unknown/padding/null) vectors alongside the model.
    ioutils.write_extra_embeddings(embeddings, args.save)
示例#3
0
        # Optional extra training corpus is concatenated onto the main one.
        train_pairs += ioutils.read_corpus(args.additional_training,
                                           args.lower, args.lang)

    assert (not args.cont)  # Not implemented yet.

    # whether to generate embeddings for unknown, padding, null
    # NOTE(review): since args.cont is asserted False just above, this
    # expression effectively reduces to `args.warm != None`; `is not None`
    # would be the idiomatic comparison.
    is_really_cont = args.warm != None or (args.cont and os.path.exists(
        os.path.join(args.save, "model.meta")))
    warmup_model = args.warm

    if is_really_cont:
        logger.info('Found a model. Fine-tuning...')

        # Warm start: load the extra vectors saved with the warm-up model
        # (generate=False) instead of generating new ones.
        word_dict, embeddings = ioutils.load_embeddings(
            args.embeddings,
            args.vocab,
            generate=False,
            normalize=True,
            load_extra_from=warmup_model)
        params = ioutils.load_params(warmup_model)

    else:
        # Fresh run: generate the extra vectors and persist the run's
        # parameters so later warm starts / evaluation can reload them.
        word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                        args.vocab,
                                                        generate=True,
                                                        normalize=True)
        ioutils.write_params(args.save,
                             lowercase=args.lower,
                             language=args.lang,
                             model=args.model)
        ioutils.write_extra_embeddings(embeddings, args.save)
示例#4
0
                        dest='verbose')
    parser.add_argument('-e',
                        help='Print pairs and labels that got a wrong answer',
                        action='store_true',
                        dest='errors')
    args = parser.parse_args()

    utils.config_logger(verbose=args.verbose)
    # Restore the saved hyper-parameters, then the model graph itself,
    # into an interactive TF session for evaluation.
    params = ioutils.load_params(args.model)
    sess = tf.InteractiveSession()

    model_class = utils.get_model_class(params)
    model = model_class.load(args.model, sess)
    # NOTE(review): generate=False with load_extra_from presumably reuses
    # the unknown/padding/null vectors saved at training time so word
    # indices stay consistent — confirm against ioutils.load_embeddings.
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                    args.vocabulary,
                                                    generate=False,
                                                    load_extra_from=args.model,
                                                    normalize=True)
    model.initialize_embeddings(sess, embeddings)
    label_dict = ioutils.load_label_dict(args.model)
    print('Label dict[Y] : ', label_dict['Y'])  # debug output
    #    pairs = ioutils.read_corpus(args.dataset, params['lowercase'],
    #           params['language'])
    #dataset = utils.create_dataset(pairs, word_dict, label_dict)
    # SSQA-specific dataset loader (the generic path above is disabled).
    dataset, labels, sents1, sents2 = utils.create_dataset_SSQA(
        args.dataset, word_dict, label_dict)
    #for pair in pairs:
    #print(pair[0].encode('utf-8'))
    #print(pair[1].encode('utf-8'))
    #print(pair[2].encode('utf-8'))
    # NOTE(review): the positional True presumably requests per-item
    # answers — confirm against model.evaluate's signature.
    loss, acc, answers = model.evaluate(sess, dataset, True)
示例#5
0
                        action='store_true',
                        dest='verbose',
                        default=1)
    parser.add_argument('--optim',
                        help='Optimizer algorithm',
                        default='adagrad',
                        choices=['adagrad', 'adadelta', 'adam'])

    args = parser.parse_args()

    utils.config_logger(args.verbose)
    logger = utils.get_logger('train')
    logger.debug('Training with following options: %s' % ' '.join(sys.argv))
    # hwwang change normalize to false
    # NOTE(review): the positional False presumably disables generation of
    # the extra (unknown/padding/null) vectors — confirm against the
    # signature of ioutils.load_embeddings.
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings,
                                                    args.vocab,
                                                    False,
                                                    normalize=False)
    # Cap this process' GPU memory usage at 60% of the device.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)

    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    logger.info('Creating model')
    vocab_size = embeddings.shape[0]
    embedding_size = embeddings.shape[1]
    #wordweightdict = ioutils.load_wordnetweight(args.wordnetweight)
    #wordweightdict = {}

    # NOTE: before lemmatization there were no word pairs here
    # (translated from the original Chinese comment)
    train_pairs, _ = ioutils.read_corpus(args.train, args.lower, args.lang)
    valid_pairs, _ = ioutils.read_corpus(args.validation, args.lower,
                                         args.lang)
    test_pairs, _ = ioutils.read_corpus(args.test, args.lower, args.lang)