def test_model(loader, model_path):
    word_set, word2idx, vocab_size = load_vocab(path.vocab_path)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))

    model = torch.load(model_path)
    #test_by_evidences(model, loader, idx2word) # 0.58 0.62
    test_by_questions(model, loader, idx2word)  # 0.61 0.66


def main():
    train = load_data('data/train.jsonl')
    token2id = load_vocab('data/vocab.json')
    vocab_size = get_vocab_size(token2id)
    embed_size = 20
    hidden_size = 40
    num_epochs = 15

    model = PyTorchModel(vocab_size, embed_size, hidden_size)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    for _ in range(num_epochs):
        total_loss = 0
        with tqdm(train) as pbar:
            for i, instance in enumerate(pbar):
                source = torch.LongTensor(instance['source']).unsqueeze(0)
                target = torch.LongTensor(instance['target']).unsqueeze(0)
                loss = model(source, target)

                total_loss += loss.item()
                average_loss = total_loss / (i + 1)
                loss_str = f'{average_loss:.4f}'
                pbar.set_description(loss_str)

                tqdm.write(str(loss.item()))
                if torch.isnan(loss):
                    pbar.close()
                    exit()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
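
The helpers used at the top of main() above (load_vocab and get_vocab_size) are referenced but not shown. A minimal sketch of what they might look like for a JSON vocabulary file, purely as an assumption about this code:

import json

def load_vocab(vocab_path):
    # hypothetical helper: read a {token: id} mapping from a JSON file
    with open(vocab_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def get_vocab_size(token2id):
    # hypothetical helper: ids are assumed to be 0..N-1, so the size is the entry count
    return len(token2id)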
Example #3
def ptb_raw_data(data_path=None, vocab_path=None):
    """Load PTB raw data from data directory "data_path".

  Reads PTB text files, converts strings to integer ids,
  and performs mini-batching of the inputs.

  The PTB dataset comes from Tomas Mikolov's webpage:

  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz

  Args:
    data_path: string path to the directory where simple-examples.tgz has
      been extracted.

  Returns:
    tuple (train_data, valid_data, test_data, vocabulary)
    where each of the data objects can be passed to PTBIterator.
  """

    train_path = os.path.join(data_path, "mrc.train.txt")
    valid_path = os.path.join(data_path, "mrc.valid.txt")
    test_path = os.path.join(data_path, "mrc.test.txt")

    if vocab_path is None:
        word_to_id = _build_vocab(train_path)
    else:
        word_to_id = load_vocab(vocab_path)
    print('load {} words'.format(len(word_to_id)))
    train_data = _file_to_word_ids(train_path, word_to_id)
    valid_data = _file_to_word_ids(valid_path, word_to_id)
    test_data = _file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    return train_data, valid_data, test_data, vocabulary
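
The helpers _build_vocab and _file_to_word_ids are referenced above but not shown. A minimal sketch in the spirit of the standard PTB reader, assuming whitespace-tokenized text with newlines mapped to an <eos> token:

import collections

def _read_words(filename):
    # replace newlines with an explicit end-of-sentence token, then split on whitespace
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read().replace('\n', ' <eos> ').split()

def _build_vocab(filename):
    # most frequent words get the smallest ids; ties broken alphabetically
    counter = collections.Counter(_read_words(filename))
    words = sorted(counter, key=lambda w: (-counter[w], w))
    return {word: i for i, word in enumerate(words)}

def _file_to_word_ids(filename, word_to_id):
    # map tokens to ids, silently dropping out-of-vocabulary tokens
    return [word_to_id[w] for w in _read_words(filename) if w in word_to_id]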
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Input .npy training data", required=True)
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    args = parser.parse_args()

    if not args.seqlen:
        sequence_length = DEFAULT_SEQ_LEN
    else:
        sequence_length = int(args.seqlen)

    df = pd.read_csv("data/songdata.zip")
    path = Path("chars.pkl")
    chars = list()
    if path.is_file():
        chars = util.load_vocab(path)
        print("Loaded from file")
    else:
        vocab = set()
        for song in df["text"]:
            chars = set(song)
            vocab = vocab.union(chars)
        chars = list(vocab)
        util.write_vocab(path, chars)
        print("Generated from source")
        
    vocab_size = len(chars)
    print("Vocab size:", vocab_size)
    
    data = np.load(args.input)
    X = data[:, :-1]
    Y = data[:, -1]

    kfold = KFold(n_splits=4)
    scores = np.zeros((4,))
    for (i, (train_index, test_index)) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        checkpoint = ModelCheckpoint("weights/weights_char_k{}_{}.h5".format(i, "{epoch:01d}"),
            monitor='loss',
            verbose=1,
            mode='auto',
            period=1,
            save_weights_only=True)

        model = build_model(sequence_length, vocab_size)
        model.fit_generator(generate_batches(X_train, Y_train, BATCH_SIZE, vocab_size), samples_per_epoch=300, epochs=10, callbacks=[checkpoint])
        perp = perplexity_score(model, X_test, Y_test, vocab_size)
        print("Local perplexity:", perp)
        scores[i] = perp

        del X_train, X_test, Y_train, Y_test
        del model
        gc.collect()
    print("Perplexity: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="Input .npy training data",
                        required=True)
    parser.add_argument("-v", "--vocab", help="Training vocab", required=True)
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    args = parser.parse_args()

    if not args.seqlen:
        sequence_length = DEFAULT_SEQ_LEN
    else:
        sequence_length = int(args.seqlen)

    w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.bin.word2vec',
                                            binary=True)

    words = util.load_vocab(args.vocab)
    vocab_size = len(words)
    print("Vocab size:", vocab_size)
    print("W2V vocab size:", len(w2v.vocab))
    idx2word = {i: word for i, word in enumerate(words)}

    embedding_size = w2v.vector_size + util.EMBEDDING_EXT
    print("Embedding size:", embedding_size)

    data = np.load(args.input)
    X = data[:, :-1]
    Y = data[:, -1]

    kfold = KFold(n_splits=4)
    scores = np.zeros((4, ))
    for (i, (train_index, test_index)) in enumerate(kfold.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        checkpoint = ModelCheckpoint("weights/weights_word_k{}_{}.h5".format(
            i, "{epoch:01d}"),
                                     monitor='loss',
                                     verbose=1,
                                     mode='auto',
                                     period=1,
                                     save_weights_only=True)

        model = build_model(vocab_size, sequence_length, embedding_size)
        model.fit_generator(generate_batches(X_train, Y_train, BATCH_SIZE,
                                             embedding_size, idx2word, w2v),
                            samples_per_epoch=300,
                            epochs=4,
                            callbacks=[checkpoint])
        perp = perplexity_score(model, X_test, Y_test, idx2word, w2v)
        print("Local perplexity:", perp)
        scores[i] = perp
    print("Perplexity: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def train_featQA(train_loader, val_loader, param):  # 2 is not weight
    param.model_dir = '../model/baselineQA_' + str(
        datetime.now()).split('.')[0].split()[0] + '/'
    if not os.path.exists(param.model_dir):
        os.mkdir(param.model_dir)

    word_set, word2idx, vocab_size = load_vocab(path.vocab_path)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))

    model = baselineQA(vocab_size, param, 0).cuda()
    train(model, train_loader, val_loader, param, idx2word)
Example #7
    def init_embedding(self, embedding_dim):
        # use a pre-trained embedding layer; this trick is borrowed from machinelearningmastery.com
        vocab = load_vocab()
        word2idx = {w: i for i, w in enumerate(vocab)}
        raw_embedding = load_embedding(config.path_embedding)
        embedding_weight = get_weight_matrix(raw_embedding, word2idx)

        embedding = nn.Embedding(len(vocab), embedding_dim=embedding_dim)
        embedding.weight = nn.Parameter(
            torch.from_numpy(embedding_weight).float())

        return embedding
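
get_weight_matrix is assumed here rather than shown. A minimal sketch that builds a (len(word2idx), dim) matrix from a {word: vector} dict, leaving rows for unknown words at zero (the vector dimensionality is inferred and assumed to match embedding_dim):

import numpy as np

def get_weight_matrix(raw_embedding, word2idx):
    # hypothetical sketch: one row per vocabulary index; OOV rows stay all-zero
    dim = len(next(iter(raw_embedding.values())))
    weights = np.zeros((len(word2idx), dim), dtype=np.float32)
    for word, idx in word2idx.items():
        vector = raw_embedding.get(word)
        if vector is not None:
            weights[idx] = vector
    return weights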
Example #8
def main():
  # load vocab fit to http://www.nature.com/articles/srep00196wget
  vocab = util.load_vocab('../fit/nature_and_kaggle_vocab.txt')
  foods = [tup[0] for tup in vocab]
  recipes = util.load_recipes('../dat/train.json')
  recipes = [recipe for recipe in recipes if len(recipe[1]) < 30]
  # recipes = recipes[0:1000]  # debug on smaller dataset
  country2region = get_country2region()
  # match the kaggle data to the nature data vocabulary
  # parsed_recipes = util.parse_recipes(foods, country2region, recipes)
  parsed_recipes = util.parse_recipes_parallel(foods, country2region, recipes)
  util.write_recipes(parsed_recipes, 'kaggle_recipes.csv')


def main():
  # load vocab fit to http://www.nature.com/articles/srep00196wget
  vocab = util.load_vocab('../fit/nature_and_kaggle_vocab.txt')
  foods = [tup[0] for tup in vocab]
  recipes = util.load_recipes('./train.json')
  recipes = [recipe for recipe in recipes if len(recipe[1]) < 30]
  # recipes = recipes[0:1000]  # debug on smaller dataset
  country2region = get_country2region()
  # match the kaggle data to the nature data vocabulary
  # parsed_recipes = util.parse_recipes(foods, country2region, recipes)
  parsed_recipes = util.parse_recipes_parallel(foods, country2region, recipes)
  util.write_recipes(parsed_recipes, 'kaggle_recipes.csv')
Example #10
def test(args):
    """ 予測を行うメソッド
    """

    batchsize = args.batchsize  # batch size

    # load the vocabulary dictionaries
    src_vocab2id, src_id2vocab, vocab_size = util.load_vocab(args.model + ".srcvocab")

    # load the model
    model = NLM.load_spec(args.model + ".spec")

    # whether to use the GPU
    if args.use_gpu:
        cuda.check_cuda_available()
        cuda.get_device(1).use()
        model.to_gpu()

    xp = cuda.cupy if args.use_gpu else np  # cupy on GPU, numpy on CPU
    serializers.load_hdf5(args.model + ".weights", model)

    # Source sequence for test
    print 'loading source data for test...'
    # load the test dataset
    test_src_dataset = util.load_test_src_data(args.src, src_vocab2id)

    generated = 0
    N = len(test_src_dataset)  # number of test examples

    word_list = src_vocab2id.keys()

    # get the embedding for every word in the vocabulary
    word_id_list = Variable(xp.asarray([src_vocab2id[word] for word in word_list ], dtype=xp.int32))
    embedding_list = model.get_embedding(word_id_list)
    src_embed = embedding_list.data[word_list.index(args.src_word)]
    #print model.embed.W.data.shape

    print "src word:", args.src_word
    print src_embed
    #src_embed = model.embed.W.data[src_vocab2id[args.src_word]]

    trg_embed_list = {}
    for _word, _id in src_vocab2id.items():
        trg_embed = embedding_list.data[word_list.index(_word)]
        #trg_embed = model.embed.W.data[src_vocab2id[_word]]
        trg_embed_list[_word] = 1 - scipy.spatial.distance.cosine(src_embed, trg_embed)

    # show the top 10 most similar words
    for i, (word, sim) in enumerate(sorted(trg_embed_list.items(), key=lambda x:x[1], reverse=True)):
        print word, sim

        if i == 10:
            break
Example #11
    def from_paths(cls, weights_path, vocab_path, sequence_length):
        """
        Loads a character sampler from the specified paths.

        Args:
            weights_path: Path to the weights of the character-level language model
            vocab_path: Pickled character vocabulary file path
            sequence_length: Sequence length of the used model
        """
        chars = util.load_vocab(vocab_path)
        model = build_character_level_model(len(chars), sequence_length)
        model.load_weights(weights_path)
        return cls(model, chars, sequence_length)
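
The util.load_vocab / util.write_vocab pair used with the pickled chars.pkl vocabulary in the examples above is not shown. For a pickled list it could be as simple as the following sketch (an assumption, not the project's actual code):

import pickle

def load_vocab(path):
    # hypothetical helper: unpickle a previously saved list of characters/words
    with open(path, 'rb') as f:
        return pickle.load(f)

def write_vocab(path, vocab):
    # hypothetical helper: pickle the vocabulary list to disk
    with open(path, 'wb') as f:
        pickle.dump(vocab, f)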
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    parser.add_argument("-o",
                        "--output",
                        help="Output .npy path",
                        required=True)
    args = parser.parse_args()

    if not args.seqlen:
        seq_len = DEFAULT_SEQ_LEN
    else:
        seq_len = int(args.seqlen)

    df = pd.read_csv(SRC_PATH)
    path = Path("chars.pkl")
    chars = list()
    if path.is_file():
        chars = util.load_vocab(path)
        print("Loaded vocabulary from file")
    else:
        vocab = set()
        for song in df["text"]:
            chars = set(song)
            vocab = vocab.union(chars)
        chars = list(vocab)
        util.write_vocab(path, chars)
        print("Generated character vocabulary as chars.pkl")

    vocab_size = len(chars)
    print("Vocab size:", vocab_size)
    char2idx = {char: i for i, char in enumerate(chars)}

    print("Generating training samples...")
    buffer_size = BUFFER_INC
    buffer = np.zeros((buffer_size, seq_len + 1), dtype=np.int64)
    i = 0
    for song in tqdm(df['text']):
        for xs in build_samples(song, seq_len):
            buffer[i] = [char2idx[x] for x in xs]
            i += 1

            if i >= buffer_size:
                buffer_size += BUFFER_INC
                buffer.resize((buffer_size, seq_len + 1))
    buffer.resize(i, seq_len + 1)
    print("Saving to {}...".format(args.output))
    np.save(args.output, buffer)
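
build_samples is referenced above but not defined. A plausible sketch, assuming it slides a window of seq_len + 1 characters over the song text so that the last character of each window is the prediction target:

def build_samples(text, seq_len):
    # hypothetical sketch: yield overlapping character windows of length seq_len + 1
    for start in range(len(text) - seq_len):
        yield text[start:start + seq_len + 1]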
Example #13
    def from_paths(cls, weights_path, vocab_path, sequence_length):
        """
        Loads a word sampler from the specified paths.

        Args:
            weights_path: Path to the weights of the word-level language model
            vocab_path: Pickled word vocabulary file path
            sequence_length: Sequence length of the used model
        """

        w2v = KeyedVectors.load_word2vec_format('glove.6B.100d.bin.word2vec', binary=True)
        words = util.load_vocab(vocab_path)
        embedding_size = w2v.vector_size + util.EMBEDDING_EXT

        model = build_word_level_model(len(words), sequence_length, embedding_size)
        model.load_weights(weights_path)
        return cls(model, w2v, words, sequence_length, embedding_size)


def parse_scraped_site(in_path, out_path):
  """load vocab fit to nature and kaggle data
  from
  http://www.nature.com/articles/srep00196wget
  https://www.kaggle.com/c/whats-cooking/download/train.json.zip
  and match it to scraped allrecipes data
  """
  vocab = util.load_vocab('../fit/nature_and_kaggle_vocab.txt')
  foods = [tup[0] for tup in vocab]
  ingredients_lists = load_ingredients_lists(in_path)
  # ingredients_lists = ingredients_lists[0:1000]  # for debugging
  ingredients_lists = [util.filter_stopwords(l) for l in ingredients_lists]
  parsed_ingredients = util.parse_ingredients_parallel(
      foods, ingredients_lists)
  parsed_recipes = [("Unknown", ingredients) for
                    ingredients in parsed_ingredients]
  util.write_recipes(parsed_recipes, out_path)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        help="Input vocabulary",
                        required=True)
    parser.add_argument("-s", "--seqlen", help="Sequence length")
    parser.add_argument("-o",
                        "--output",
                        help="Output .npy path",
                        required=True)
    args = parser.parse_args()

    if not args.seqlen:
        seq_len = DEFAULT_SEQ_LEN
    else:
        seq_len = int(args.seqlen)

    print("Reading songs...")
    songs = None
    with open(SRC_PATH, "r") as f:
        songs = [line.rstrip().split(" ") for line in f]

    print("Loading vocab...")
    words = util.load_vocab(args.input)
    word2idx = {word: i for i, word in enumerate(words)}
    buffer_size = BUFFER_INC

    print("Generating ngrams...")
    buffer = np.zeros((buffer_size, seq_len + 1), dtype=np.int64)
    i = 0
    for song in tqdm(songs):
        for xs, y in ngramify(song, seq_len, word2idx):
            xs = [word2idx[x] for x in xs]
            y = word2idx[y]
            xs.append(y)
            buffer[i] = xs
            i += 1

            if i >= buffer_size:
                buffer_size += BUFFER_INC
                buffer.resize((buffer_size, seq_len + 1))
    buffer.resize(i, seq_len + 1)
    print("Saving to {}...".format(args.output))
    np.save(args.output, buffer)
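
Likewise, ngramify is assumed rather than shown. The usage above (it yields word strings that are then mapped through word2idx) suggests something like this sketch, which skips any window containing an out-of-vocabulary token:

def ngramify(tokens, seq_len, word2idx):
    # hypothetical sketch: yield (context, next_word) pairs over a sliding window,
    # dropping windows that contain a word missing from the vocabulary
    for start in range(len(tokens) - seq_len):
        window = tokens[start:start + seq_len + 1]
        if all(w in word2idx for w in window):
            yield window[:-1], window[-1]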
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--threshold", help="Threshold to drop words")
    parser.add_argument("-o",
                        "--output",
                        help="Output .pkl path",
                        required=True)
    args = parser.parse_args()

    if not args.threshold:
        threshold = DEFAULT_THRESH
    else:
        threshold = float(args.threshold)
    print("Threshold:", threshold)

    print("Calculating word frequencies...")
    freqs = dict()
    with open(SRC_PATH, "r") as f:
        for line in f:
            for token in line.rstrip().split(" "):
                if token not in freqs:
                    freqs[token] = 1
                else:
                    freqs[token] += 1

    total_words = len(freqs.keys())
    discard = set()
    for word in freqs.keys():
        z = freqs[word] / total_words
        p = (math.sqrt(z / threshold) + 1) * (threshold / z)
        if random() <= p:
            discard.add(word)

    print("Total words:", total_words)
    print("Discarded words:", len(discard))
    print("Target vocab size:", total_words - len(discard))

    words = util.load_vocab(SRC_VOCAB_PATH)
    new_words = list(set(words).difference(discard))
    util.write_vocab(args.output, new_words)
    print("Vocab written to:", args.output)
Example #17
                           grad_norm, param_norm, iter_time, mean_length,
                           std_length))

            ## Checkpoint
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess,
                             checkpoint_path,
                             global_step=model.global_step)

            valid_costs, valid_lengths = [], []
            for source_tokens, source_mask, target_tokens, target_mask in PairIter(
                    x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
                cost, _ = model.test(sess, source_tokens, source_mask,
                                     target_tokens, target_mask)
                valid_costs.append(cost * target_mask.shape[1])
                valid_lengths.append(np.sum(target_mask[1:, :]))
            valid_cost = sum(valid_costs) / float(sum(valid_lengths))

            print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

            previous_losses.append(valid_cost)
            if len(previous_losses) > 2 and valid_cost > max(
                    previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            sys.stdout.flush()


if __name__ == '__main__':
    np.random.seed(FLAGS.seed)
    word_idx_map, idx_word_map = load_vocab('')
Example #18
File: main.py  Project: mana-ysh/deep-crf
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print args
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check save_dir exist
    if not os.path.isdir(save_dir):
        err_msg = 'There is no dir : {}\n'.format(save_dir)
        err_msg += '##############################\n'
        err_msg += '## Please run the following: \n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])

    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    efficient_gpu = False
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)
        efficient_gpu = args.get('efficient_gpu', False)

    def to_gpu(x):
        if args['gpu'] >= 0:
            return chainer.cuda.to_gpu(x)
        return x

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']
    input_idx = map(int, args['input_idx'].split(','))
    output_idx = map(int, args['output_idx'].split(','))
    word_input_idx = input_idx[0]  # NOTE: word_idx is first column!
    additional_input_idx = input_idx[1:]
    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file,
                                               delimiter=delimiter)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training sizes: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file,
                                             delimiter=u' ')

    # sentences_train = sentences_train[:100]

    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter)
    if test_file:
        sentences_test = util.read_conll_file(test_file, delimiter=delimiter)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown POS tags
    # TODO: compute unk words
    vocab_adds = []
    if is_train:
        sentences_words_train = [[w_obj[word_input_idx] for w_obj in sentence]
                                 for sentence in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)

        # Additional setup
        for ad_feat_id in additional_input_idx:
            sentences_additional_train = [[
                feat_obj[ad_feat_id] for feat_obj in sentence
            ] for sentence in sentences_train]
            vocab_add = util.build_vocab(sentences_additional_train)
            vocab_adds.append(vocab_add)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    if args.get('word_emb_file', False):
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_emb_vocab_type = args.get('word_emb_vocab_type')

        def assert_word_emb_shape(shape1, shape2):
            err_msg = '''Pre-trained embedding size is not equal to `--n_word_emb` ({} != {})'''
            if shape1 != shape2:
                err_msg = err_msg.format(str(shape1), str(shape2))
                raise ValueError(err_msg)

        def assert_no_emb(word_vecs):
            err_msg = '''There is no-embeddings! Please check your file `--word_emb_file`'''
            if word_vecs.shape[0] == 0:
                raise ValueError(err_msg)

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(
                emb_file)
            vocab = vocab_glove
        elif word_emb_vocab_type == 'replace_only':
            word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
            assert_no_emb(word_vecs)

        elif word_emb_vocab_type == 'additional':
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(
                emb_file)
            additional_vecs = []
            for word, word_idx in sorted(vocab_glove.items(),
                                         key=lambda x: x[1]):
                if word not in vocab:
                    vocab[word] = len(vocab)
                    additional_vecs.append(word_vecs[word_idx])
            additional_vecs = np.array(additional_vecs, dtype=np.float32)

    if args.get('vocab_file', False):
        vocab_file = args['vocab_file']
        vocab = util.load_vocab(vocab_file)

    if args.get('vocab_char_file', False):
        vocab_char_file = args['vocab_char_file']
        vocab_char = util.load_vocab(vocab_char_file)

    vocab_tags_inv = dict((v, k) for k, v in vocab_tags.items())
    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]

    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    tmp_xp = xp
    if efficient_gpu:
        tmp_xp = np  # use CPU (numpy)

    def parse_to_word_ids(sentences, word_input_idx, vocab):
        return util.parse_to_word_ids(sentences,
                                      xp=tmp_xp,
                                      vocab=vocab,
                                      UNK_IDX=UNK_IDX,
                                      idx=word_input_idx)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences,
                                      xp=tmp_xp,
                                      vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX,
                                      idx=word_input_idx)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences,
                                     xp=tmp_xp,
                                     vocab=vocab_tags,
                                     UNK_IDX=-1,
                                     idx=-1)

    x_train = parse_to_word_ids(sentences_train, word_input_idx, vocab)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)
    x_train_additionals = [
        parse_to_word_ids(sentences_train, ad_feat_id, vocab_adds[i])
        for i, ad_feat_id in enumerate(additional_input_idx)
    ]

    x_dev = parse_to_word_ids(sentences_dev, word_input_idx, vocab)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)
    x_dev_additionals = [
        parse_to_word_ids(sentences_dev, ad_feat_id, vocab_adds[i])
        for i, ad_feat_id in enumerate(additional_input_idx)
    ]

    y_dev_cpu = [[w[-1] for w in sentence] for sentence in sentences_dev]
    # tag_names = []
    tag_names = list(
        set([
            tag[2:] if len(tag) >= 2 else tag[0] for tag in vocab_tags.keys()
        ]))

    x_test = parse_to_word_ids(sentences_test, word_input_idx, vocab)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)
    x_test_additionals = [
        parse_to_word_ids(sentences_test, ad_feat_id, vocab_adds[i])
        for i, ad_feat_id in enumerate(additional_input_idx)
    ]

    cnt_train_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev  :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab     :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate  (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate  (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))
    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab        :' + save_vocab)
    logging.info('save_vocab_char   :' + save_vocab_char)
    logging.info('save_tags_vocab   :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)

    init_emb = None

    if is_train:
        util.write_vocab(save_vocab, vocab)
        util.write_vocab(save_vocab_char, vocab_char)
        util.write_vocab(save_tags_vocab, vocab_tags)
        util.write_vocab(save_train_config, args)

    n_vocab_add = [len(_vadd) for _vadd in vocab_adds]

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab),
                         n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'],
                         init_emb=init_emb,
                         char_input_dim=args['n_char_emb'],
                         char_hidden_dim=args['n_char_hidden'],
                         n_label=len(vocab_tags),
                         n_add_feature_dim=args['n_add_feature_emb'],
                         n_add_feature=len(n_vocab_add),
                         n_vocab_add=n_vocab_add,
                         use_cudnn=args['use_cudnn'])
    my_cudnn(args['use_cudnn'])

    if args.get('word_emb_file', False):

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            assert_word_emb_shape(word_vecs.shape[1],
                                  net.word_embed.W.shape[1])
            net.word_embed.W.data = word_vecs[:]
        elif word_emb_vocab_type == 'replace_only':
            assert_no_emb(word_vecs)
            assert_word_emb_shape(word_vecs.shape[1],
                                  net.word_embed.W.shape[1])
            net.word_embed.W.data[word_ids] = word_vecs[:]

        elif word_emb_vocab_type == 'additional':
            assert_word_emb_shape(word_vecs.shape[1],
                                  net.word_embed.W.shape[1])
            v_size = additional_vecs.shape[0]
            net.word_embed.W.data[-v_size:] = additional_vecs[:]

    if args.get('return_model', False):
        return net

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data, x_train_additionals=[]):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[
                    to_gpu(_) for _ in x_ad[index:index + batchsize]
                ] for x_ad in x_train_additionals]

            output = net(x_data=x,
                         x_char_data=x_char,
                         x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)

        _, predict_tags = zip(*predict_lists)
        predicted_results = []
        for predict in predict_tags:
            predicted = [
                vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)
            ]
            predicted_results.append(predicted)

        return predict_lists, sum_loss, predicted_results

    if args['model_filename']:
        model_filename = args['model_filename']
        serializers.load_hdf5(model_filename, net)

    if is_test:
        # predict
        # model_filename = args['model_filename']
        # model_filename = save_dir + model_filename
        # serializers.load_hdf5(model_filename, net)
        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train

        if dev_file:
            predict_dev, loss_dev, predict_dev_tags = eval_loop(
                x_dev, x_char_dev, y_dev)
            gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
            result, phrase_info = util.conll_eval(gold_predict_pairs,
                                                  flag=False,
                                                  tag_class=tag_names)
            all_result = result['All_Result']
            print 'all_result:', all_result

        predict_pairs, _, _tmp = eval_loop(x_predict, x_char_predict,
                                           y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [
                vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)
            ]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    prev_dev_accuracy = 0.0
    prev_dev_f = 0.0
    for epoch in xrange(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[
                    to_gpu(x_ad[add_i])
                    for add_i in perm[index:index + batchsize]
                ] for x_ad in x_train_additionals]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            output = net(x_data=x,
                         x_char_data=x_char,
                         x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info('  loss     :' + str(sum_loss))
        logging.info('  accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev, predict_dev_tags = eval_loop(
            x_dev, x_char_dev, y_dev, x_dev_additionals)

        gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
        result, phrase_info = util.conll_eval(gold_predict_pairs,
                                              flag=False,
                                              tag_class=tag_names)
        all_result = result['All_Result']

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info('  loss     :' + str(loss_dev))
        logging.info('  accuracy :' + str(dev_accuracy))
        logging.info('  f_measure :' + str(all_result[-1]))

        dev_f = all_result[-1]

        if prev_dev_f < dev_f:
            logging.info(' [update best model on dev set!]')
            dev_list = [prev_dev_f, dev_f]
            dev_str = '       ' + ' => '.join(map(str, dev_list))
            logging.info(dev_str)
            prev_dev_f = dev_f

            # Save model
            model_filename = save_name + '_epoch' + str(epoch)
            serializers.save_hdf5(model_filename + '.model', net)
            serializers.save_hdf5(model_filename + '.state', opt)
Example #19
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print args
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check save_dir exist
    if not os.path.isdir(save_dir):
        err_msg = 'There is no dir : {}\n'.format(save_dir)
        err_msg += '##############################\n'
        err_msg += '## Please run the following: \n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])

    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']
    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file,
                                               delimiter=delimiter,
                                               input_idx=0,
                                               output_idx=-1)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training sizes: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file,
                                             delimiter=u' ')

    # sentences_train = sentences_train[:100]

    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file,
                                             delimiter=delimiter,
                                             input_idx=0,
                                             output_idx=-1)
    if test_file:
        sentences_test = util.read_conll_file(test_file,
                                              delimiter=delimiter,
                                              input_idx=0,
                                              output_idx=-1)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown POS tags
    # TODO: compute unk words
    if is_train:
        sentences_words_train = [w_obj[0] for w_obj in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]

    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    def parse_to_word_ids(sentences):
        return util.parse_to_word_ids(sentences,
                                      xp=xp,
                                      vocab=vocab,
                                      UNK_IDX=UNK_IDX,
                                      idx=0)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences,
                                      xp=xp,
                                      vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX,
                                      idx=0)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences,
                                     xp=xp,
                                     vocab=vocab_tags,
                                     UNK_IDX=-1,
                                     idx=-1)

    # if is_train:
    x_train = parse_to_word_ids(sentences_train)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)

    # elif is_test:
    #     x_predict = parse_to_word_ids(sentences_predict)
    #     x_char_predict = parse_to_char_ids(sentences_predict)
    #     y_predict = parse_to_tag_ids(sentences_predict)

    x_dev = parse_to_word_ids(sentences_dev)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)

    x_test = parse_to_word_ids(sentences_test)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)

    cnt_train_unk = sum([xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev  :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab     :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate  (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate  (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))
    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab        :' + save_vocab)
    logging.info('save_vocab_char   :' + save_vocab_char)
    logging.info('save_tags_vocab   :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)
    util.write_vocab(save_vocab, vocab)
    util.write_vocab(save_vocab_char, vocab_char)
    util.write_vocab(save_tags_vocab, vocab_tags)
    util.write_vocab(save_train_config, args)

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab),
                         n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'],
                         init_emb=None,
                         n_label=len(vocab_tags))

    if args['word_emb_file']:
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
        net.word_embed.W.data[word_ids] = word_vecs

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            output = net(x_data=x, x_char_data=x_char)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)
        return predict_lists, sum_loss

    if is_test:
        # predict
        model_filename = args['model_filename']
        model_filename = save_dir + model_filename
        serializers.load_hdf5(model_filename, net)

        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train
        predict_pairs, _ = eval_loop(x_predict, x_char_predict, y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [
                vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)
            ]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    for epoch in xrange(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            output = net(x_data=x, x_char_data=x_char)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info('  loss     :' + str(sum_loss))
        logging.info('  accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev = eval_loop(x_dev, x_char_dev, y_dev)

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info('  loss     :' + str(loss_dev))
        logging.info('  accuracy :' + str(dev_accuracy))

        # Save model
        model_filename = save_name + '_epoch' + str(epoch)
        serializers.save_hdf5(model_filename + '.model', net)
        serializers.save_hdf5(model_filename + '.state', opt)
Example #20
            filtered, expanded, unique_coverage / len(filtered_vocab),
            token_coverage / sum(cnt for k, cnt in filtered_vocab)))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, required=True)
    # parser.add_argument('--vocab_size', type=int, default=30000)
    parser.add_argument('--smaller', action="store_true")
    args = parser.parse_args()

    dataset = args.dataset
    # vocab_size = args.vocab_size
    smaller = args.smaller

    vocab, vocab_filtered = load_vocab(dataset, smaller)
    # # load dataset
    # if dataset.lower() == "reddit":
    #     dataset_path = "./data/Reddit/train-{0}.txt"
    # elif dataset.lower() == "twitter":
    #     dataset_path = "./data/Twitter/train-{0}.txt"
    # print("Loading dataset...")
    # src = load_file(dataset_path.format("src"))
    # tgt = load_file(dataset_path.format("tgt"))
    # data = src + tgt
    # print("number of sentence pairs: ", len(src))

    # # vocab
    # print("Building vocab...")
    # vocab = Counter([w for l in data for w in l])
    # print("vocab size: {0}".format(len(vocab)))
Example #21
    gamma = 0.1
    n_epochs = 10
    clip = 1
    teacher_forcing_ration = 1
    coverage_loss_lambda = 1.0
    eps = 0.0000001

    type = 'predict'

    if type == 'train':
        data_directory = os.path.join(parent_directory, 'data') + '\\'

        # if a saved vocabulary exists, load it
        vocab = None
        if os.path.exists(vocab_file):
            vocab = load_vocab(vocab_file)

        # load the training data
        source, train_data = build_field_dataset_vocab(data_directory,
                                                       chat_source_name,
                                                       chat_target_name, vocab)

        train_iterator, val_iterator = get_dataset(source, train_data,
                                                   batch_size)

        # save the source field's vocabulary
        if vocab is None:
            save_vocab(source.vocab, vocab_file)

        model, optimizer, scheduler = build_model(
            source, encoder_embedding_dim, decoder_embedding_dim, hidden_dim,
    return top_n

def print_top_n(top_n):
    for k, ((words, i, candidate), prob) in enumerate(top_n):
        inserted = words[:i] + ['***%s***' % candidate] + words[i:]
        prediction = ' '.join(inserted)
        print "%d) P = %.3f: %s" % (k, -prob, prediction)

def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('model',
        help='KenLM n-gram model file (ARPA or binary)')
    parser.add_argument('vocab', type=argparse.FileType('r'),
        help='Vocab file')
    parser.add_argument('-n', type=int, default=5,
        help='Number of best sentences to report')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    
    print >>sys.stderr, "Loading vocab"
    vocab = load_vocab(args.vocab)
    print >>sys.stderr, "%d words in vocab" % len(vocab)
    print >>sys.stderr, "Loading language model"
    model = kenlm.LanguageModel(args.model)
    
    print >>sys.stderr, "Processing sentences"
    for line in sys.stdin:
        top_n = find_missing_word(model, vocab, line, args.n)
        print_top_n(top_n)
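
find_missing_word is only called above. Given that print_top_n expects ((words, i, candidate), prob) tuples and prints -prob, one plausible sketch is a brute-force search that tries every vocabulary word at every insertion point and keeps the n variants the KenLM model scores highest (slow, but it matches the interface used here; this is an assumption, not the script's actual implementation):

def find_missing_word(model, vocab, line, n):
    # hypothetical sketch: score every (position, candidate) insertion with the LM
    words = line.split()
    scored = []
    for i in range(len(words) + 1):
        for candidate in vocab:
            inserted = words[:i] + [candidate] + words[i:]
            score = model.score(' '.join(inserted))  # log10 probability, i.e. negative
            scored.append(((words, i, candidate), score))
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored[:n]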
Example #23
import copy
import os
import random
from queue import Queue

import torch
from torch.autograd import Variable
import kenlm

import config
from dataset import Poemsets
from model import PoetryGenerator
import util

vocab = util.load_vocab()
commons = vocab[:2500]  # most frequent words
word2idx, idx2word = util.load_word2idx_idx2word(vocab)
icommons = [word2idx[w] for w in commons]
pingzes, yuns = util.read_pingshuiyun(use_index=True)
word2vec = util.load_word2vec()
LM = os.path.join(os.path.dirname(__file__), '..', 'data',
                  'rnnpg_data_emnlp-2014', 'partitions_in_Table_2', 'poemlm',
                  'qts.klm')
kmodel = kenlm.Model(LM)

poemset = Poemsets(
    os.path.join(config.dir_rnnpg, "partitions_in_Table_2", "rnnpg",
                 "qtrain_7"))
pg = PoetryGenerator(vocab_size=len(vocab),
                     embedding_dim=256,
 args = opts().parse_args()
 print "Loading golden sentences"
 golden = map(tokenize_words, args.golden)
 print "Loading locations of removed words"
 golden_loc = np.asarray(map(int, args.i_removed))
 print "Loading predictions"
 predictions = map(Prediction.parse, args.predicted)
 assert len(golden) == len(golden_loc)
 if len(predictions) < len(golden):
     n = len(predictions)
     golden = golden[:n]
     golden_loc = golden_loc[:n]
     print "Assuming first %d sentences" % n
 
 print "Loading syntactic n-gram counts"
 sngs = load_vocab(args.syntactic_ngrams)
 print "Loaded %d syntactic ngrams" % len(sngs)
 sngp = estimate_probabilities(sngs)
 print "Loading bad syntactic ngrams"
 bad_ngrams = set(pickle.load(args.bad_syntactic_ngrams))
 
 print "Computing cost for each choice"
 d = cost_per_choice(golden, golden_loc, predictions)
 print "Identifying optimal choices"
 y = np.argmin(d, axis=1)
 best = [di[yi] for di, yi in izip(d,y)]
 dx = np.mean(best)
 error = np.std(best) / np.sqrt(len(best))
 print "Best achievable Levenshtein distance: %.3f +/- %.3f" % (dx, error)
 
 unk = set(('<s>','</s>','<unk>',UNKNOWN))
from util import load_vocab
from online_util import get_inputs, get_answers, get_tuple_answers
from baiduSpider import get_evidences

STOP_TAG = "#OOV#"


class Hyperparameters:
    vocab_path = '../char_data/vocabulary.txt'
    random_path = '../char_data/training.h5'
    #charQA_path = '../model/charQA_2017-08-11/f1-0.5583_0.34799_2'
    charQA_path = '../model/lossQA_2017-08-14/f1-0.5698_0.26918_5'


param = Hyperparameters()
word_set, word2idx, word_set_size = load_vocab(param.vocab_path)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))


def random_sample():
    file = h5py.File(param.random_path)
    nb_samples = len(file['question'][:])

    index = random.randint(0, nb_samples - 1)
    question = file['question'][index]
    question = ''.join([idx2word[q] for q in question if q != 0])
    return question


class baselineQA(object):
    def __init__(self):
Example #26
    torch.cuda.manual_seed(opt.seed)

# device
device_type = "cuda" if opt.cuda else "cpu"
device_ids = None
if opt.local_rank is not None:
    device_type += ":" + str(opt.local_rank)
    device_ids = [opt.local_rank]
device = torch.device(device_type)

# tensorboardX
writer = SummaryWriter()

# load vocabulary for source and target
src_vocab, trg_vocab = {}, {}
src_vocab["stoi"] = load_vocab(opt.src_vocab)
trg_vocab["stoi"] = load_vocab(opt.trg_vocab)
src_vocab["itos"] = invert_vocab(src_vocab["stoi"])
trg_vocab["itos"] = invert_vocab(trg_vocab["stoi"])
UNK = "<unk>"
SOS = "<sos>"
EOS = "<eos>"
PAD = "<pad>"
opt.enc_pad = src_vocab["stoi"][PAD]
opt.dec_sos = trg_vocab["stoi"][SOS]
opt.dec_eos = trg_vocab["stoi"][EOS]
opt.dec_pad = trg_vocab["stoi"][PAD]
opt.enc_num_token = len(src_vocab["stoi"])
opt.dec_num_token = len(trg_vocab["stoi"])

# load dataset for training and validation
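
invert_vocab is used above to build the itos side of each vocabulary; assuming stoi maps token strings to integer ids, it is presumably just the inverse mapping, e.g.:

def invert_vocab(stoi):
    # hypothetical helper: flip a {token: id} dict into {id: token}
    return {idx: token for token, idx in stoi.items()}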
Example #27
    tf.app.flags.DEFINE_integer("label_size", 2, "Size of the label.")
    tf.app.flags.DEFINE_float("learning_rate", 0.001, "Learning rate.")
    tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.95, "Learning rate decays by this much.")
    tf.app.flags.DEFINE_float("max_gradient_norm", 5.0, "Clip gradients to this norm.")
    tf.app.flags.DEFINE_float("dropout", 0.0, "Fraction of units randomly dropped on non-recurrent connections.")
    tf.app.flags.DEFINE_integer("batch_size", 50, "Batch size to use during training.")
    tf.app.flags.DEFINE_integer("epochs", 1, "Number of epochs to train.")
    tf.app.flags.DEFINE_integer("size", 512, "Size of each model layer.")
    tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
    tf.app.flags.DEFINE_string("train_dir", "/tmp", "Training weights are saved in it.")
    tf.app.flags.DEFINE_integer("print_every", 1, "How many iterations to do per print.")
    tf.app.flags.DEFINE_integer('seed', 1234, 'random seed')
    tf.app.flags.DEFINE_string('mode', 'sq2sq', 'Choose [sq2sq, att, gen]')
    tf.app.flags.DEFINE_string('gpu', '3', 'Choose GPU ID: [0,1,2,3]')
    tf.app.flags.DEFINE_string('embed', 'w2v', 'Choose embedding: [w2v, glove50, glove100, glove200, glove300]')

    word_idx_map, idx_word_map = load_vocab(VOCAB_PATH)
    vocab_size = len(idx_word_map)

    loader = StoryLoader(STORY_DATA_PATH,
                        batch_size=50, src_seq_len=65,
                        tgt_seq_len=20, mode='merged')

    if FLAGS.embed == 'w2v':
        embedding = loader.get_w2v_embed().astype('float32')

    model = StoryModel(vocab_size, FLAGS.label_size, FLAGS.size, FLAGS.num_layers,
                                   FLAGS.batch_size, FLAGS.learning_rate,
                                   FLAGS.learning_rate_decay_factor,
                                   FLAGS.dropout, embedding, FLAGS.src_steps, FLAGS.tgt_steps,
                                   FLAGS.mode, FLAGS.max_gradient_norm, forward_only=False)
Example #28
          exp_norm = 0.99*exp_norm + 0.01*grad_norm

        cost = cost / mean_length

        if current_step % FLAGS.print_every == 0:
          print('epoch %d, iter %d, cost %f, exp_cost %f, grad norm %f, param norm %f, batch time %f, length mean/std %f/%f' %
                (epoch, current_step, cost, exp_cost / exp_length, grad_norm, param_norm, iter_time, mean_length, std_length))

      ## Checkpoint
      checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
      model.saver.save(sess, checkpoint_path, global_step=model.global_step)

      valid_costs, valid_lengths = [], []
      for source_tokens, source_mask, target_tokens, target_mask in PairIter(x_dev, y_dev, FLAGS.batch_size, FLAGS.num_layers):
        cost, _ = model.test(sess, source_tokens, source_mask, target_tokens, target_mask)
        valid_costs.append(cost * target_mask.shape[1])
        valid_lengths.append(np.sum(target_mask[1:, :]))
      valid_cost = sum(valid_costs) / float(sum(valid_lengths))

      print("Epoch %d Validation cost: %f" % (epoch, valid_cost))

      previous_losses.append(valid_cost)
      if len(previous_losses) > 2 and valid_cost > max(previous_losses[-3:]):
        sess.run(model.learning_rate_decay_op)
      sys.stdout.flush()


if __name__ == '__main__':
    np.random.seed(FLAGS.seed)
    word_idx_map, idx_word_map = load_vocab('')
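
valid_cost above is an average per-target-token loss. If that loss is a cross-entropy measured in nats (the snippet does not say), the corresponding validation perplexity is just its exponential; a quick worked example:

import math

valid_cost = 4.2                   # hypothetical average per-token cross-entropy, in nats
perplexity = math.exp(valid_cost)  # exp(4.2) is roughly 66.7
print("Validation perplexity: %.1f" % perplexity)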
示例#29
0
File: main.py  Project: mana-ysh/deep-crf
def run(data_file, is_train=False, **args):
    is_test = not is_train
    batchsize = args['batchsize']
    model_name = args['model_name']
    optimizer_name = args['optimizer']
    save_dir = args['save_dir']
    print args
    if save_dir[-1] != '/':
        save_dir = save_dir + '/'

    # TODO: check that save_dir exists
    if not os.path.isdir(save_dir):
        err_msg = 'There is no directory: {}\n'.format(save_dir)
        err_msg += '##############################\n'
        err_msg += '## Please run the following: \n'
        err_msg += '## $ mkdir {}\n'.format(save_dir)
        err_msg += '##############################\n'
        raise ValueError(err_msg)

    save_name = args['save_name']
    if save_name == '':
        save_name = '_'.join([model_name, optimizer_name])

    save_name = save_dir + save_name

    xp = cuda.cupy if args['gpu'] >= 0 else np
    efficient_gpu = False
    if args['gpu'] >= 0:
        cuda.get_device(args['gpu']).use()
        xp.random.seed(1234)
        efficient_gpu = args.get('efficient_gpu', False)

    def to_gpu(x):
        if args['gpu'] >= 0:
            return chainer.cuda.to_gpu(x)
        return x

    # load files
    dev_file = args['dev_file']
    test_file = args['test_file']
    delimiter = args['delimiter']
    input_idx = map(int, args['input_idx'].split(','))
    output_idx = map(int, args['output_idx'].split(','))
    word_input_idx = input_idx[0]  # NOTE: word_idx is first column!
    additional_input_idx = input_idx[1:]
    sentences_train = []
    if is_train:
        sentences_train = util.read_conll_file(filename=data_file,
                                               delimiter=delimiter)
        if len(sentences_train) == 0:
            s = str(len(sentences_train))
            err_msg = 'Invalid training size: {} sentences. '.format(s)
            raise ValueError(err_msg)
    else:
        # Predict
        sentences_train = util.read_raw_file(filename=data_file,
                                             delimiter=u' ')

    # sentences_train = sentences_train[:100]

    sentences_dev = []
    sentences_test = []
    if dev_file:
        sentences_dev = util.read_conll_file(dev_file, delimiter=delimiter)
    if test_file:
        sentences_test = util.read_conll_file(test_file, delimiter=delimiter)

    save_vocab = save_name + '.vocab'
    save_vocab_char = save_name + '.vocab_char'
    save_tags_vocab = save_name + '.vocab_tag'
    save_train_config = save_name + '.train_config'

    # TODO: check unknown POS tags
    # TODO: compute unk words
    vocab_adds = []
    if is_train:
        sentences_words_train = [[w_obj[word_input_idx] for w_obj in sentence]
                                 for sentence in sentences_train]
        vocab = util.build_vocab(sentences_words_train)
        vocab_char = util.build_vocab(util.flatten(sentences_words_train))
        vocab_tags = util.build_tag_vocab(sentences_train)

        # Additional setup
        for ad_feat_id in additional_input_idx:
            sentences_additional_train = [[feat_obj[ad_feat_id] for feat_obj in sentence]
                                          for sentence in sentences_train]
            vocab_add = util.build_vocab(sentences_additional_train)
            vocab_adds.append(vocab_add)
    elif is_test:
        vocab = util.load_vocab(save_vocab)
        vocab_char = util.load_vocab(save_vocab_char)
        vocab_tags = util.load_vocab(save_tags_vocab)

    if args.get('word_emb_file', False):
        # set Pre-trained embeddings
        # emb_file = './emb/glove.6B.100d.txt'
        emb_file = args['word_emb_file']
        word_emb_vocab_type = args.get('word_emb_vocab_type')

        def assert_word_emb_shape(shape1, shape2):
            err_msg = '''Pre-trained embedding size is not equal to `--n_word_emb` ({} != {})'''
            if shape1 != shape2:
                err_msg = err_msg.format(str(shape1), str(shape2))
                raise ValueError(err_msg)

        def assert_no_emb(word_vecs):
            err_msg = '''No embeddings were loaded! Please check the file passed to `--word_emb_file`'''
            if word_vecs.shape[0] == 0:
                raise ValueError(err_msg)

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(emb_file)
            vocab = vocab_glove
        elif word_emb_vocab_type == 'replace_only':
            word_ids, word_vecs = util.load_glove_embedding(emb_file, vocab)
            assert_no_emb(word_vecs)

        elif word_emb_vocab_type == 'additional':
            word_vecs, vocab_glove = util.load_glove_embedding_include_vocab(emb_file)
            additional_vecs = []
            for word, word_idx in sorted(vocab_glove.items(), key=lambda x: x[1]):
                if word not in vocab:
                    vocab[word] = len(vocab)
                    additional_vecs.append(word_vecs[word_idx])
            additional_vecs = np.array(additional_vecs, dtype=np.float32)

    if args.get('vocab_file', False):
        vocab_file = args['vocab_file']
        vocab = util.load_vocab(vocab_file)

    if args.get('vocab_char_file', False):
        vocab_char_file = args['vocab_char_file']
        vocab_char = util.load_vocab(vocab_char_file)

    vocab_tags_inv = dict((v, k) for k, v in vocab_tags.items())
    PAD_IDX = vocab[PADDING]
    UNK_IDX = vocab[UNKWORD]

    CHAR_PAD_IDX = vocab_char[PADDING]
    CHAR_UNK_IDX = vocab_char[UNKWORD]

    tmp_xp = xp
    if efficient_gpu:
        tmp_xp = np  # use CPU (numpy)

    def parse_to_word_ids(sentences, word_input_idx, vocab):
        return util.parse_to_word_ids(sentences, xp=tmp_xp, vocab=vocab,
                                      UNK_IDX=UNK_IDX, idx=word_input_idx)

    def parse_to_char_ids(sentences):
        return util.parse_to_char_ids(sentences, xp=tmp_xp, vocab_char=vocab_char,
                                      UNK_IDX=CHAR_UNK_IDX, idx=word_input_idx)

    def parse_to_tag_ids(sentences):
        return util.parse_to_tag_ids(sentences, xp=tmp_xp, vocab=vocab_tags,
                                     UNK_IDX=-1, idx=-1)

    x_train = parse_to_word_ids(sentences_train, word_input_idx, vocab)
    x_char_train = parse_to_char_ids(sentences_train)
    y_train = parse_to_tag_ids(sentences_train)
    x_train_additionals = [parse_to_word_ids(sentences_train, ad_feat_id, vocab_adds[i])
                           for i, ad_feat_id in enumerate(additional_input_idx)]

    x_dev = parse_to_word_ids(sentences_dev, word_input_idx, vocab)
    x_char_dev = parse_to_char_ids(sentences_dev)
    y_dev = parse_to_tag_ids(sentences_dev)
    x_dev_additionals = [parse_to_word_ids(sentences_dev, ad_feat_id, vocab_adds[i])
                         for i, ad_feat_id in enumerate(additional_input_idx)]

    y_dev_cpu = [[w[-1] for w in sentence]
                 for sentence in sentences_dev]
    # tag_names = []
    tag_names = list(set([tag[2:] if len(tag) >= 2 else tag[0] for tag in vocab_tags.keys()]))

    x_test = parse_to_word_ids(sentences_test, word_input_idx, vocab)
    x_char_test = parse_to_char_ids(sentences_test)
    y_test = parse_to_tag_ids(sentences_test)
    x_test_additionals = [parse_to_word_ids(sentences_test, ad_feat_id, vocab_adds[i])
                          for i, ad_feat_id in enumerate(additional_input_idx)]

    cnt_train_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_train])
    cnt_train_word = sum([d.size for d in x_train])
    unk_train_unk_rate = float(cnt_train_unk) / cnt_train_word

    cnt_dev_unk = sum([tmp_xp.sum(d == UNK_IDX) for d in x_dev])
    cnt_dev_word = sum([d.size for d in x_dev])
    unk_dev_unk_rate = float(cnt_dev_unk) / max(cnt_dev_word, 1)

    logging.info('train:' + str(len(x_train)))
    logging.info('dev  :' + str(len(x_dev)))
    logging.info('test :' + str(len(x_test)))
    logging.info('vocab     :' + str(len(vocab)))
    logging.info('vocab_tags:' + str(len(vocab_tags)))
    logging.info('unk count (train):' + str(cnt_train_unk))
    logging.info('unk rate  (train):' + str(unk_train_unk_rate))
    logging.info('cnt all words (train):' + str(cnt_train_word))
    logging.info('unk count (dev):' + str(cnt_dev_unk))
    logging.info('unk rate  (dev):' + str(unk_dev_unk_rate))
    logging.info('cnt all words (dev):' + str(cnt_dev_word))
    # show model config
    logging.info('######################')
    logging.info('## Model Config')
    logging.info('model_name:' + str(model_name))
    logging.info('batchsize:' + str(batchsize))
    logging.info('optimizer:' + str(optimizer_name))
    # Save model config
    logging.info('######################')
    logging.info('## Model Save Config')
    logging.info('save_dir :' + str(save_dir))

    # save vocab
    logging.info('save_vocab        :' + save_vocab)
    logging.info('save_vocab_char   :' + save_vocab_char)
    logging.info('save_tags_vocab   :' + save_tags_vocab)
    logging.info('save_train_config :' + save_train_config)

    init_emb = None

    if is_train:
        util.write_vocab(save_vocab, vocab)
        util.write_vocab(save_vocab_char, vocab_char)
        util.write_vocab(save_tags_vocab, vocab_tags)
        util.write_vocab(save_train_config, args)

    n_vocab_add = [len(_vadd) for _vadd in vocab_adds]

    net = BiLSTM_CNN_CRF(n_vocab=len(vocab), n_char_vocab=len(vocab_char),
                         emb_dim=args['n_word_emb'],
                         hidden_dim=args['n_hidden'],
                         n_layers=args['n_layer'], init_emb=init_emb,
                         char_input_dim=args['n_char_emb'],
                         char_hidden_dim=args['n_char_hidden'],
                         n_label=len(vocab_tags),
                         n_add_feature_dim=args['n_add_feature_emb'],
                         n_add_feature=len(n_vocab_add),
                         n_vocab_add=n_vocab_add,
                         use_cudnn=args['use_cudnn'])
    my_cudnn(args['use_cudnn'])

    if args.get('word_emb_file', False):

        if word_emb_vocab_type == 'replace_all':
            # replace all vocab by Pre-trained embeddings
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            net.word_embed.W.data = word_vecs[:]
        elif word_emb_vocab_type == 'replace_only':
            assert_no_emb(word_vecs)
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            net.word_embed.W.data[word_ids] = word_vecs[:]

        elif word_emb_vocab_type == 'additional':
            assert_word_emb_shape(word_vecs.shape[1], net.word_embed.W.shape[1])
            v_size = additional_vecs.shape[0]
            net.word_embed.W.data[-v_size:] = additional_vecs[:]

    if args.get('return_model', False):
        return net

    if args['gpu'] >= 0:
        net.to_gpu()

    init_alpha = args['init_lr']
    if optimizer_name == 'adam':
        opt = optimizers.Adam(alpha=init_alpha, beta1=0.9, beta2=0.9)
    elif optimizer_name == 'adadelta':
        opt = optimizers.AdaDelta()
    if optimizer_name == 'sgd_mom':
        opt = optimizers.MomentumSGD(lr=init_alpha, momentum=0.9)
    if optimizer_name == 'sgd':
        opt = optimizers.SGD(lr=init_alpha)

    opt.setup(net)
    opt.add_hook(chainer.optimizer.GradientClipping(5.0))

    def eval_loop(x_data, x_char_data, y_data, x_train_additionals=[]):
        # dev or test
        net.set_train(train=False)
        iteration_list = range(0, len(x_data), batchsize)
        # perm = np.random.permutation(len(x_data))
        sum_loss = 0.0
        predict_lists = []
        for i_index, index in enumerate(iteration_list):
            x = x_data[index:index + batchsize]
            x_char = x_char_data[index:index + batchsize]
            target_y = y_data[index:index + batchsize]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[to_gpu(_) for _ in x_ad[index:index + batchsize]]
                                for x_ad in x_train_additionals]

            output = net(x_data=x, x_char_data=x_char, x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            sum_loss += loss.data
            predict_lists.extend(predict)

        _, predict_tags = zip(*predict_lists)
        predicted_results = []
        for predict in predict_tags:
            predicted = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)]
            predicted_results.append(predicted)

        return predict_lists, sum_loss, predicted_results

    if args['model_filename']:
        model_filename = args['model_filename']
        serializers.load_hdf5(model_filename, net)

    if is_test:
        # predict
        # model_filename = args['model_filename']
        # model_filename = save_dir + model_filename
        # serializers.load_hdf5(model_filename, net)
        vocab_tags_inv = dict([(v, k) for k, v in vocab_tags.items()])
        x_predict = x_train
        x_char_predict = x_char_train
        y_predict = y_train

        if dev_file:
            predict_dev, loss_dev, predict_dev_tags = eval_loop(x_dev, x_char_dev, y_dev)
            gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
            result, phrase_info = util.conll_eval(
                gold_predict_pairs, flag=False, tag_class=tag_names)
            all_result = result['All_Result']
            print 'all_result:', all_result

        predict_pairs, _, _tmp = eval_loop(x_predict, x_char_predict, y_predict)
        _, predict_tags = zip(*predict_pairs)
        predicted_output = args['predicted_output']
        predicted_results = []
        for predict in predict_tags:
            predicted = [vocab_tags_inv[tag_idx] for tag_idx in to_cpu(predict)]
            predicted_results.append(predicted)

        f = open(predicted_output, 'w')
        for predicted in predicted_results:
            for tag in predicted:
                f.write(tag + '\n')
            f.write('\n')
        f.close()

        return False

    tmax = args['max_iter']
    t = 0.0
    prev_dev_accuracy = 0.0
    prev_dev_f = 0.0
    for epoch in xrange(args['max_iter']):

        # train
        net.set_train(train=True)
        iteration_list = range(0, len(x_train), batchsize)
        perm = np.random.permutation(len(x_train))
        sum_loss = 0.0
        predict_train = []
        for i_index, index in enumerate(iteration_list):
            data = [(x_train[i], x_char_train[i], y_train[i])
                    for i in perm[index:index + batchsize]]
            x, x_char, target_y = zip(*data)

            x_additional = []
            if len(x_train_additionals):
                x_additional = [[to_gpu(x_ad[add_i]) for add_i in perm[index:index + batchsize]]
                                for x_ad in x_train_additionals]

            if efficient_gpu:
                x = [to_gpu(_) for _ in x]
                x_char = [[to_gpu(_) for _ in words] for words in x_char]
                target_y = [to_gpu(_) for _ in target_y]

            output = net(x_data=x, x_char_data=x_char, x_additional=x_additional)
            predict, loss = net.predict(output, target_y)

            # loss
            sum_loss += loss.data

            # update
            net.zerograds()
            loss.backward()
            opt.update()

            predict_train.extend(predict)

        # Evaluation
        train_accuracy = util.eval_accuracy(predict_train)

        logging.info('epoch:' + str(epoch))
        logging.info(' [train]')
        logging.info('  loss     :' + str(sum_loss))
        logging.info('  accuracy :' + str(train_accuracy))

        # Dev
        predict_dev, loss_dev, predict_dev_tags = eval_loop(
            x_dev, x_char_dev, y_dev, x_dev_additionals)

        gold_predict_pairs = [y_dev_cpu, predict_dev_tags]
        result, phrase_info = util.conll_eval(gold_predict_pairs, flag=False, tag_class=tag_names)
        all_result = result['All_Result']

        # Evaluation
        dev_accuracy = util.eval_accuracy(predict_dev)
        logging.info(' [dev]')
        logging.info('  loss     :' + str(loss_dev))
        logging.info('  accuracy :' + str(dev_accuracy))
        logging.info('  f_measure :' + str(all_result[-1]))

        dev_f = all_result[-1]

        if prev_dev_f < dev_f:
            logging.info(' [update best model on dev set!]')
            dev_list = [prev_dev_f, dev_f]
            dev_str = '       ' + ' => '.join(map(str, dev_list))
            logging.info(dev_str)
            prev_dev_f = dev_f

            # Save model
            model_filename = save_name + '_epoch' + str(epoch)
            serializers.save_hdf5(model_filename + '.model', net)
            serializers.save_hdf5(model_filename + '.state', opt)
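
The run() function above leans on several helpers from deep-crf's util module (build_vocab, parse_to_word_ids, and friends) that are not shown here. Purely as an illustrative sketch under assumed behavior, not the project's actual implementations, a vocabulary builder and the word-id parser could look like this:

import numpy as np

PADDING = '<pad>'   # assumed padding symbol, matching the vocab[PADDING] lookup above
UNKWORD = '<unk>'   # assumed unknown-word symbol, matching the vocab[UNKWORD] lookup above

def build_vocab(sentences):
    # Reserve ids for padding/unknown, then assign ids in order of first appearance.
    vocab = {PADDING: 0, UNKWORD: 1}
    for tokens in sentences:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)
    return vocab

def parse_to_word_ids(sentences, xp=np, vocab=None, UNK_IDX=1, idx=0):
    # One int32 array per sentence; column `idx` of each token tuple holds the word,
    # and out-of-vocabulary words fall back to UNK_IDX.
    return [xp.asarray([vocab.get(w_obj[idx], UNK_IDX) for w_obj in sentence],
                       dtype=np.int32)
            for sentence in sentences]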
示例#30
0
    # build the model according to params
    if not args.load_model:
        logger, opts = setup_exp(args)
        logger.info(args)
    else:
        logger = get_logger(args.expdir)
        with open(pjoin(args.load_model, 'opts.json'), 'r') as fin:
            loaded_opts = json.load(fin)
            for k in loaded_opts:
                if k not in ['expdir', 'load_model', 'seed', 'save_err']:
                    setattr(args, k, loaded_opts[k])
        logger.info(args)

    logger.info('loading data...')

    word_idx_map, idx_word_map = load_vocab(VOCAB_PATH)
    vocab_size = len(idx_word_map)

    if not args.pretrain:
        # we train on validation/test sets
        loader = StoryLoader(STORY_DATA_PATH,
                             batch_size=args.batch_size,
                             src_seq_len=65,
                             tgt_seq_len=20,
                             train_frac=0.45,
                             valid_frac=0.05,
                             mode='merged')
    else:
        # we only train on 40% of validation for the target encoder
        loader = StoryLoader(STORY_DATA_PATH,
                             batch_size=args.batch_size,
示例#31
0
import sys, argparse
from collections import defaultdict
from util import tokenize_words, load_vocab

def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('vocab', type=argparse.FileType('r'),
        help='File with vocabulary')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    
    print("Loading vocab", file=sys.stderr)
    vocab = load_vocab(args.vocab)
    print("Determining best-guess case for each word", file=sys.stderr)
    lower_to_case = {}
    for word, freq in vocab.items():
        lowercase = word.lower()
        if lowercase in lower_to_case:
            prev_freq = lower_to_case[lowercase][1]
            if freq > prev_freq:
                lower_to_case[lowercase] = (word, freq)
        else:
            lower_to_case[lowercase] = (word, freq)
    del vocab
    for k in list(lower_to_case.keys()):
        lower_to_case[k] = lower_to_case[k][0]
        
    print("Processing predictions", file=sys.stderr)
示例#32
0
#!/usr/bin/env python
'''Replace words with their word2vec class'''

import sys, argparse
from util import tokenize_words, load_vocab, UNKNOWN


def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('classes',
                        type=argparse.FileType('r'),
                        help='File with word2vec classes')
    return parser


if __name__ == "__main__":
    args = opts().parse_args()

    print("Loading word2vec classes", file=sys.stderr)
    vocab = load_vocab(args.classes)

    for i, line in enumerate(sys.stdin):
        words = [vocab.get(w, UNKNOWN) for w in tokenize_words(line)]
        print(' '.join(map(str, words)))

        if i % 100000 == 0:
            print(i, file=sys.stderr)
示例#33
0
    args = opts().parse_args()
    print("Loading golden sentences")
    golden = list(map(tokenize_words, args.golden))
    print("Loading locations of removed words")
    golden_loc = np.asarray(list(map(int, args.i_removed)))
    print("Loading predictions")
    predictions = list(map(Prediction.parse, args.predicted))
    assert len(golden) == len(golden_loc)
    if len(predictions) < len(golden):
        n = len(predictions)
        golden = golden[:n]
        golden_loc = golden_loc[:n]
        print("Assuming first %d sentences" % n)

    print("Loading syntactic n-gram counts")
    sngs = load_vocab(args.syntactic_ngrams)
    print("Loaded %d syntactic ngrams" % len(sngs))
    sngp = estimate_probabilities(sngs)
    print("Loading bad syntactic ngrams")
    bad_ngrams = set(pickle.load(args.bad_syntactic_ngrams))

    print("Computing cost for each choice")
    d = cost_per_choice(golden, golden_loc, predictions)
    print("Identifying optimal choices")
    y = np.argmin(d, axis=1)
    best = [di[yi] for di, yi in zip(d, y)]
    dx = np.mean(best)
    error = np.std(best) / np.sqrt(len(best))
    print("Best achievable Levenshtein distance: %.3f +/- %.3f" % (dx, error))

    unk = set(('<s>', '</s>', '<unk>', UNKNOWN))
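
estimate_probabilities is not shown above. Given that sngs comes from load_vocab and is treated as a count table, a plausible reading (illustrative sketch only, not the original code) is a simple relative-frequency estimate:

def estimate_probabilities(counts):
    # Turn raw n-gram counts into relative frequencies that sum to 1.
    total = float(sum(counts.values()))
    return {ngram: count / total for ngram, count in counts.items()}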
示例#34
0
        embedding_weight = get_weight_matrix(raw_embedding, word2idx)

        embedding = nn.Embedding(len(vocab), embedding_dim=embedding_dim)
        embedding.weight = nn.Parameter(
            torch.from_numpy(embedding_weight).float())

        return embedding


if __name__ == '__main__':
    print("TOY EXAMPLE, JUST FOR TEST!!!")

    import numpy as np
    import torch.optim as optim

    vocab = load_vocab()
    word2idx, idx2word = load_word2idx_idx2word(vocab)
    poetry = "鹤 湖 东 去 水 茫 茫	一 面 风 泾 接 魏 塘	看 取 松 江 布 帆 至	鲈 鱼 切 玉 劝 郎 尝"
    sentences = [s.split() for s in poetry.split("\t")]
    isentences = [[word2idx[w] for w in s] for s in sentences]
    print(sentences)
    print(isentences)

    batch_size = 1
    epochs = 10  # with long enough training, the program can "memorize" some of this information

    # optimizer parameters
    lr = 0.01
    decay_factor = 0.00001
    betas = (0.9, 0.999)
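
get_weight_matrix above is not shown. As a rough sketch only, assuming raw_embedding maps words to pre-trained vectors (the 100-dimensional default is just a placeholder), it would copy each known word's vector into the row given by word2idx, leaving unknown words as zero vectors:

import numpy as np

def get_weight_matrix(raw_embedding, word2idx, embedding_dim=100):
    # Row i holds the pre-trained vector of the word whose index is i.
    weight = np.zeros((len(word2idx), embedding_dim), dtype=np.float32)
    for word, idx in word2idx.items():
        vector = raw_embedding.get(word)
        if vector is not None:
            weight[idx] = vector
    return weight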
示例#35
0
    train_set = NameTaggingDataset(os.path.join(
        args.input, dataset, '{}train.tsv'.format(args.prefix)),
                                   conll_parser,
                                   gpu=use_gpu)
    dev_set = NameTaggingDataset(os.path.join(args.input, dataset,
                                              '{}dev.tsv'.format(args.prefix)),
                                 conll_parser,
                                 gpu=use_gpu)
    test_set = NameTaggingDataset(os.path.join(
        args.input, dataset, '{}test.tsv'.format(args.prefix)),
                                  conll_parser,
                                  gpu=use_gpu)

    # embedding vocab
    if args.embed_vocab:
        embed_vocab = load_vocab(args.embed_vocab)
    else:
        embed_vocab = build_embedding_vocab(args.embed)

    # vocabulary
    token_vocab = load_vocab(
        os.path.join(args.input, dataset,
                     '{}token.vocab.tsv'.format(args.prefix)))
    char_vocab = load_vocab(
        os.path.join(args.input, dataset,
                     '{}char.vocab.tsv'.format(args.prefix)))
    label_vocab = load_vocab(
        os.path.join(args.input, dataset,
                     '{}label.vocab.tsv'.format(args.prefix)))
    label_itos = {i: l for l, i in label_vocab.items()}
    train_token_counter = train_set.token_counter
示例#36
0
    config.read(os.path.join(parent_directory, 'resource') + '/config.cfg')
    section = config.sections()[0]

    parser = argparse.ArgumentParser(description='seq2seq_attention_chatbot')
    parser.add_argument('-type',
                        default='train',
                        help='train or predict with seq2seq!',
                        type=str)
    args = parser.parse_args()
    if args.type == 'train':
        data_directory = os.path.join(parent_directory, 'data') + '\\'

        # Load the vocabulary if it already exists
        vocab = None
        if os.path.exists(config.get(section, 'vocab')):
            vocab = load_vocab(config.get(section, 'vocab'))

        # Load the training data
        source, train_iterator, val_iterator = build_field_dataset_vocab(
            data_directory, config.get(section, 'chat_source_name'),
            config.get(section, 'chat_target_name'), vocab)
        # Save the source vocabulary
        if vocab is None:
            save_vocab(source.vocab, config.get(section, 'vocab'))

        model, optimizer, scheduler, criterion = build_model(
            source, config.getint(section, 'encoder_embedding_dim'),
            config.getint(section, 'decoder_embedding_dim'),
            config.getint(section, 'hidden_dim'),
            config.getint(section, 'n_layers'),
            config.getfloat(section, 'encoder_dropout'),
示例#37
0
parser.add_argument('--info', type=str, help='info of the model')

opt = parser.parse_args()

# set the random seed manually
torch.manual_seed(opt.seed)

opt.cuda = opt.cuda and torch.cuda.is_available()
if opt.cuda:
    torch.cuda.manual_seed(opt.seed)

device = torch.device('cuda' if opt.cuda else 'cpu')

# load vocabulary for source and target
src_vocab, trg_vocab = {}, {}
src_vocab['stoi'] = load_vocab(opt.src_vocab)
trg_vocab['stoi'] = load_vocab(opt.trg_vocab)
src_vocab['itos'] = invert_vocab(src_vocab['stoi'])
trg_vocab['itos'] = invert_vocab(trg_vocab['stoi'])
UNK = '<unk>'
SOS = '<sos>'
EOS = '<eos>'
PAD = '<pad>'
opt.enc_pad = src_vocab['stoi'][PAD]
opt.dec_sos = trg_vocab['stoi'][SOS]
opt.dec_eos = trg_vocab['stoi'][EOS]
opt.dec_pad = trg_vocab['stoi'][PAD]
opt.enc_ntok = len(src_vocab['stoi'])
opt.dec_ntok = len(trg_vocab['stoi'])

# load dataset for testing
#!/usr/bin/env python

'''Replace words with their word2vec class'''

import sys, argparse
from util import tokenize_words, load_vocab, UNKNOWN

def opts():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('classes', type=argparse.FileType('r'),
        help='File with word2vec classes')
    return parser

if __name__ == "__main__":
    args = opts().parse_args()
    
    print >>sys.stderr, "Loading word2vec classes"
    vocab = load_vocab(args.classes)
        
    for i, line in enumerate(sys.stdin):
        words = [vocab.get(w, UNKNOWN) for w in tokenize_words(line)]
        print ' '.join(map(str, words))
        
        if i % 100000 == 0:
            print >>sys.stderr, i