Example #1
def main():
    args = parse_args()

    if args.dynet_seed:
        random.seed(args.dynet_seed)
        np.random.seed(args.dynet_seed)

    src_vocab = Vocabulary('<unk>', eos_symbol='</s>')
    tgt_vocab = Vocabulary('<unk>', sos_symbol='<s>', eos_symbol='</s>')
    train = list(
        read_bitext(src_vocab, tgt_vocab, args.train_src, args.train_tgt))
    src_vocab.freeze()
    tgt_vocab.freeze()
    dev = list(read_bitext(src_vocab, tgt_vocab, args.dev_src, args.dev_tgt))
    # init model
    model = Seq2SeqAtt(src_vocab, tgt_vocab, args.src_embed_dim,
                       args.tgt_embed_dim, args.enc_nlayers,
                       args.enc_hidden_dim, args.dec_nlayers,
                       args.dec_hidden_dim, args.attention_dim,
                       args.label_smoothing)
    if args.saved_model:
        model.load_model(args.saved_model)
    if args.only_decode:
        print("Reading test data...")
        test = list(
            read_bitext(src_vocab, tgt_vocab, args.test_src, args.test_tgt))
        model.translate(test, args.beam_size, args.max_output_len,
                        args.length_norm, args.output_file, args.relative,
                        args.absolute, args.local, args.candidate)
        print("Done")
    else:
        training_procedure = BasicTrainingProcedure(
            model, dy.SimpleSGDTrainer(model.pc))
        training_procedure.train(args.epochs, train, dev, args.batch_size,
                                 args.batch_size, args.max_output_len)
Example #2
def load_word_data(questions_df, image_captions, exclude_word_list):
    vocab = Vocabulary()
    answers = Vocabulary(first_word="RELEVANT")
    specific_answers = Vocabulary()
    question_seq_length = 1
    caption_seq_length = 1

    print "Generating vocabulary and answer indices..."
    new_questions = []
    for _, row in questions_df.iterrows():
        question_words = row['question'].split(' ')

        if len(question_words) > question_seq_length:
            question_seq_length = len(question_words)

        all_words = list(question_words)  # copy so extending with caption words below does not alias question_words

        image_file = row['image_file']
        if image_file in image_captions:
            caption = image_captions[image_file]
            caption_words = caption.split(' ')
            if len(caption_words) > caption_seq_length:
                caption_seq_length = len(caption_words)
            all_words += caption_words

        for word in all_words:
            if len(word) > 0 and word not in exclude_word_list:
                vocab.add_word(word)
        # if row['relevant'] == 0:
        answers.add_word(row['answer'])
        specific_answers.add_word(row['specific_answer'])

    print('\tVocab count: [%d]' % len(vocab))
    print('\tAnswers count: [%d]' % len(answers))
    print('\tQuestion sequence length: [%d]' % question_seq_length)
    print('\tCaption sequence length: [%d]' % caption_seq_length)

    print("Loading word vectors...")
    word_to_vector = load_word_vectors(word_vectors_file, vocab)

    print('Creating embedding matrix...')
    embedding_matrix = np.zeros((len(vocab), embedding_dim))

    words_not_found = []
    for word, i in vocab.word_index.items():
        if word not in word_to_vector:
            words_not_found.append(word)
            continue
        embedding_vector = word_to_vector[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    if len(words_not_found) > 0:
        print "Words not found:", "\n\t", words_not_found
        for word in words_not_found:
            del vocab.index_word[vocab.word_index[word]]

    return vocab, answers, specific_answers, embedding_matrix, word_to_vector, question_seq_length, caption_seq_length
Example #3
    def __init__(self, data_dir, mode, vocab_size):

        self.df = pd.read_csv(os.path.join(data_dir, mode + '.csv'))

        self.sentences = self.df['text'].values
        self.labels = self.df['label'].values

        # Initialize dataset Vocabulary object and build our vocabulary
        self.sentences_vocab = Vocabulary(vocab_size)
        self.labels_vocab = Vocabulary(vocab_size)

        self.sentences_vocab.build_vocabulary(self.sentences)
        self.labels_vocab.build_vocabulary(self.labels, add_unk=False)
Example #4
    def toShakespeare(self):
        """Given a line of text, return that text in the indicated style.
        
        Args:
          modern_text: (string) The input.
          
        Returns:
          string: The translated text, if generated.
        """

        args = load_arguments()
        vocab = Vocabulary(self.vocab_path, args.embedding, args.dim_emb)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            model = Model(args, vocab)
            model.saver.restore(sess, args.model)

            if args.beam > 1:
                decoder = beam_search.Decoder(sess, args, vocab, model)
            else:
                decoder = greedy_decoding.Decoder(sess, args, vocab, model)

            # decode with whichever decoder was selected, so `out` is always set
            batch = get_batch([self.modern_text], [1], vocab.word2id)
            ori, tsf = decoder.rewrite(batch)

            out = ' '.join(w for w in tsf[0])

        return out
Example #5
def main():

    args = _parse_args()

    assert not(os.path.exists(args.model)), f"specified file already exists: {args.model}"

    with io.open(args.corpus, mode="r") as corpus:
        v = Vocabulary(table_size=int(2E7))
        v.create(corpus, [(args.n_vocab, args.n_min_freq, args.n_min_freq)])

    print(f"finished. saving models: {args.model}")
    v.save(args.model)

    # sanity check
    print("done. now execute sanity check...")
    print(f"n_vocab: {len(v)}, total_freq:{sum(v.counts)}")

    s = "Knox County Health Department is following national Centers for Disease Control and Prevention Protocol to contain infection."
    print(f"sentence: {s}")
    s_tokenized = "/".join(v.tokenize(s, remove_oov=False))
    print(f"tokenized: {s_tokenized}")
    print(f"random sampling...")
    n_sample = 100
    x = v.random_ids(n_sample)
    w, f = np.unique(list(map(v.id2word, x)), return_counts=True)
    for idx in np.argsort(f)[::-1]:
        print(f"{w[idx]} -> {f[idx]}")

    print("finished. good-bye.")
Example #6
def train():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)
    train_data_path = train_config['train_data_path']
    test_data_path = train_config['test_data_path']
    vocab_path = train_config['vocab_path']

    train_input_data, train_input_label = load_corpus(
        file_path=train_data_path, make_vocab=True, vocab_path=vocab_path)
    val_input_data, val_input_label = load_corpus(file_path=test_data_path,
                                                  make_vocab=False)

    vocab = Vocabulary(vocab_path)

    model = Spacing(vocab_len=len(vocab))

    print(model)

    trainer = Trainer(model=model,
                      vocab=vocab,
                      train_data=train_input_data,
                      train_label=train_input_label,
                      val_data=val_input_data,
                      val_label=val_input_label,
                      config=train_config)
    trainer.train(total_epoch=10, validation_epoch=1)
Example #7
def ngrams(prefix):
    """
    Find n-grams and make a vocabulary from the parsed corpus
    """
    with BZ2File(prefix + 'corpus.bz2', 'r') as corpus:
        vocab = Vocabulary(build_table=False)
        vocab.create(corpus, [(75000, 350), (25000, 350), (10000, 350)])
        vocab.save(prefix + 'vocab.gz')
Example #8
    def read_vocabs(self, datafile, corpus_name):
        lines = open(datafile, encoding="utf-8").read().strip().split('\n')

        pairs = [[self.normalize_string(s) for s in line.split('\t')]
                 for line in lines]
        vocab = Vocabulary(corpus_name)

        return vocab, pairs
Example #9
def prepare_data():
    lines = open(config.TXT_DATA).read().strip().split('\n')
    pairs = [[snippet for snippet in line.split("$")] for line in lines]

    source_vocab = Vocabulary(config.SOURCE)
    target_vocab = Vocabulary(config.TARGET)

    for pair in pairs:
        source_vocab.add_sentence(pair[0])
        target_vocab.add_sentence(pair[1])

    random.shuffle(pairs)

    eval_pairs = pairs[:int(len(pairs) * config.EVAL_PERCENTAGE)]
    train_pairs = pairs[int(len(pairs) * config.EVAL_PERCENTAGE):]

    return source_vocab, target_vocab, train_pairs, eval_pairs
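The split on "$" above implies a plain-text corpus with one source/target pair per line. A hedged sketch of that layout and of driving the function (the example lines and the 0.1 split value are made up; TXT_DATA, SOURCE, TARGET, and EVAL_PERCENTAGE come from the snippet's own config module):

# config.TXT_DATA might look like this, one "source$target" pair per line:
#     how are you$comment allez vous
#     good morning$bonjour
# config.SOURCE / config.TARGET name the two vocabularies, and
# config.EVAL_PERCENTAGE (e.g. 0.1) sets the size of the evaluation split.
source_vocab, target_vocab, train_pairs, eval_pairs = prepare_data()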
Example #10
    def __init__(self, data_path, vocab=Vocabulary(), predict=False):
        """
        Creates an object that gets data from a file.
        """
        super(Data, self).__init__(data_path, vocab)

        if not predict:
            self._train_test_split()
Example #11
def main():
    args = get_arguments()
    SETTING = Dict(
        yaml.safe_load(
            open(os.path.join('arguments', args.arg + '.yaml'),
                 encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    # image transformer
    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
        ])

    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path, img_dir='val2017', ann_dir='annotations/captions_val2017.json', transform=transform)
    val_loader = DataLoader(val_dset, batch_size=SETTING.batch_size, shuffle=False, num_workers=SETTING.n_cpu, collate_fn=collater)

    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size, SETTING.rnn_type)

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(args.checkpoint), flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time()-begin)), flush=True)

    savedir = os.path.join("out", args.config_name)
    if not os.path.exists(savedir):
        os.makedirs(savedir, 0o777)

    image = dset.embedded["image"]
    caption = dset.embedded["caption"]
    n_i = image.shape[0]
    n_c = caption.shape[0]
    embeddings = np.concatenate([image, caption], axis=0)  # stacked image and caption embeddings (avoids shadowing all())

    emb_file = os.path.join(savedir, "embedding_{}.npy".format(n_i))
    save_file = os.path.join(savedir, "{}.npy".format(SETTING.method))
    vis_file = os.path.join(savedir, "{}.png".format(SETTING.method))
    np.save(emb_file, embeddings)
    print("saved embeddings to {}".format(emb_file), flush=True)
    dimension_reduction(emb_file, save_file, method=SETTING.method)
    plot_embeddings(save_file, n_i, vis_file, method=SETTING.method)
Example #12
def main():

    args = get_arguments()
    SETTING = Dict(
        yaml.safe_load(
            open(os.path.join('arguments', args.arg + '.yaml'),
                 encoding='utf8')))
    print(args)
    args.device = list(map(str, args.device))
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(args.device)

    transform = transforms.Compose([
        transforms.Resize((SETTING.imsize, SETTING.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDset(root=SETTING.root_path,
                            img_dir='val2017',
                            ann_dir='annotations/captions_val2017.json',
                            transform=transform)
    val_loader = DataLoader(val_dset,
                            batch_size=SETTING.batch_size,
                            shuffle=False,
                            num_workers=SETTING.n_cpu,
                            collate_fn=collater)

    vocab = Vocabulary(max_len=SETTING.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(SETTING.out_size, SETTING.cnn_type)
    capenc = CaptionEncoder(len(vocab), SETTING.emb_size, SETTING.out_size,
                            SETTING.rnn_type)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert SETTING.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        SETTING.checkpoint),
          flush=True)
    ckpt = torch.load(SETTING.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, args.image_path, imenc, transform)
    retrieve_c2i(dset, val_dset, args.output_dir, args.caption, capenc, vocab)
Example #13
    def __init__(self, data_path, vocab=Vocabulary()):
        self.vocab = vocab

        data = get_requests_from_file(data_path)
        print("Downloaded {} samples".format(len(data)))

        map_result = list(map(self._process_request, data))  # materialize: a map iterator can only be consumed once
        self.data = [x[0] for x in map_result]
        self.lengths = [x[1] for x in map_result]
        assert len(self.data) == len(self.lengths)
Example #14
def load_or_create_vocab(trainDataset=None, testDataset=None):
    Texts = list(trainDataset.anns.values()) + list(testDataset.anns.values())
    if os.path.exists(VOCAB_FILE):
        print("loading vocab")
        vocab = torch.load(VOCAB_FILE)
        print("vocab loaded")
        return vocab
    else:
        vocab = Vocabulary()
        vocab.create_from_texts(Texts)
        return vocab
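A minimal way to drive load_or_create_vocab, assuming only what the function itself touches: a module-level VOCAB_FILE path and dataset objects whose anns attribute maps ids to caption strings. The _TinyDset stand-in below is hypothetical:

class _TinyDset:
    # Hypothetical stand-in exposing the `anns` dict the loader reads.
    def __init__(self, anns):
        self.anns = anns

train_dset = _TinyDset({1: "a cat sits on a mat"})
test_dset = _TinyDset({2: "a dog runs in a yard"})
vocab = load_or_create_vocab(train_dset, test_dset)  # loads VOCAB_FILE if present, else builds it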
Example #15
    def __init__(self, csv_path, image_path, transform=None, batch_size=4):
        self.captionsfile = pd.read_csv(csv_path)
        self.image_path = image_path
        self.transform = transform
        self.vocab = Vocabulary(vocab_threshold=2)
        self.batch_size = batch_size
        all_tokens = [
            nltk.tokenize.word_tokenize(
                str(self.captionsfile.iloc[index, 2]).lower())
            for index in range(len(self.captionsfile))
        ]
        self.caption_lengths = [len(token) for token in all_tokens]
Example #16
def prepare_data(args):
    '''
    Do all the work needed to prepare the data.
    :param args:
    :return:
    '''
    trainset = REDataset(args.trainset_path, double_data=args.is_double_training_data)
    testset = REDataset(args.testset_path)

    # make vocab
    vocab = Vocabulary(word_num=args.vocab_word_num)
    corpus = []
    for example in trainset:
        corpus += example[0]

    if args.vocab_include_testset:
        for example in testset:
            corpus += example[0]

    vocab.add_from_corpus(corpus)

    # make label encoder
    all_labels = []
    for example in trainset:
        all_labels.append(example[1])
    label_encoder = LabelEncoder(all_labels)

    batch_maker = BatchMaker(vocab, label_encoder, max_length=args.max_length)
    traindata_loader = DataLoader(trainset, batch_size=args.batch_size, shuffle=args.shuffle,
                                  num_workers=args.num_workers, collate_fn=batch_maker.batch_packer)
    testdata_loader = DataLoader(testset, batch_size=args.test_batch_size, shuffle=args.test_shuffle,
                                 num_workers=args.num_workers, collate_fn=batch_maker.batch_packer)

    logger.info('trainset length: %d' % len(trainset))
    logger.info('testset length: %d' % len(testset))
    logger.info('vocabulary length: %d'%len(vocab))
    logger.info('labels num: %d'%len(label_encoder))

    return (traindata_loader, testdata_loader, trainset, testset, vocab, label_encoder, batch_maker)

# dataset = REDataset(TRAINSET_PATH)
# corpus = []
# for example in dataset:
#     corpus += example[0]
#
# vocab = Vocabulary(word_num=3000)
# vocab.add_from_corpus(corpus)
# sent = dataset[0][0]
# print(sent)
# print(vocab.encode(sent))
# print(vocab.decode(vocab.encode(sent)))
# print(dataset[0][-1])
Example #17
def build_vocab(words):
    ''' Build vocabulary and use it to format labels. '''
    vocab = Vocabulary(words)

    # Map each word to a one-hot vector over the vocabulary.
    output_vector = []
    for word in words:
        zeros = np.zeros(len(vocab), dtype=np.float32)
        zeros[vocab[word]] = 1.0

        output_vector.append(zeros)

    return vocab, output_vector
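A quick sanity check for the function above; it assumes nothing beyond what the snippet already uses, namely len(vocab) and vocab[word] index lookup:

words = ["to", "be", "or", "not", "to", "be"]
vocab, vectors = build_vocab(words)

assert len(vectors) == len(words)         # one one-hot row per input token
assert vectors[0].shape == (len(vocab),)  # each row spans the whole vocabulary
assert vectors[0][vocab["to"]] == 1.0     # only the token's own index is hot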
Example #18
    def __init__(self,
                 filenames,
                 char_vocab=None,
                 feat_vocab=None,
                 pos_vocab=None,
                 pos_sp=True,
                 train=True,
                 covered=False):
        super().__init__()
        if isinstance(filenames, list):
            self.filenames = filenames
        elif isinstance(filenames, str):
            self.filenames = [filenames]
        else:
            raise ValueError
        self.train = train
        if char_vocab is None or feat_vocab is None or pos_vocab is None:
            assert char_vocab is None and feat_vocab is None and pos_vocab is None  # should be None at the same time
        if char_vocab is None:  # if None, create new vocabs
            self.char_vocab = Vocabulary(unk=True,
                                         pad=True,
                                         bos=True,
                                         eos=True)
            self.feat_vocab = Vocabulary(unk=True)
            self.pos_vocab = Vocabulary(unk=True)
            self.m_char_vocab = self.char_vocab
        else:  # else, load existing vocabs
            self.char_vocab = char_vocab
            self.feat_vocab = feat_vocab
            self.pos_vocab = pos_vocab
            self.m_char_vocab = Vocabulary.from_vocab(char_vocab)

        self.raw_data = []
        self.data = []
        self.organized_data = []
        self.data_sizes = []
        self.pos_sp = pos_sp
        self.covered = covered
        self.build_dataset()
Example #19
def inference():
    with open('train_config.json') as train_config_file:
        train_config = json.load(train_config_file)

    vocab_path = train_config['vocab_path']
    model_save_path = train_config['model_save_path']

    epoch = None
    with open(os.path.join(model_save_path, 'checkpoint.txt')) as f:
        epoch = f.readlines()[0].split(':')[1]
        print(f'Weight is loaded from best checkpoint epoch {epoch}')

    vocab = Vocabulary(vocab_path)

    model = Spacing(vocab_len=len(vocab)).eval()

    trainer = Trainer(model=model,
                      vocab=vocab,
                      config=train_config)
    trainer.load(epoch)

    while True:
        text = input('Enter input text : ')
        words = text.split()
        data = []

        for word in words:
            chars = [char for char in word]
            data.append(chars)
        sorted_data = sorted(data, key=lambda e: len(e), reverse=True)
        idx = sorted(range(len(data)), key=lambda e: len(data[e]), reverse=True)
        batch_data, batch_label, lengths = trainer.make_input_tensor(sorted_data, None)

        outputs, _ = trainer.model.forward(batch_data, lengths)
        outputs = torch.round(outputs)

        results = []
        for output, data in zip(outputs, sorted_data):
            result = ''
            for output_char, char in zip(output, data):
                if output_char == 1:
                    result += (char + ' ')
                else:
                    result += char
            results.append(result)

        sorted_result = ''
        for i in range(len(idx)):
            sorted_result += results[idx.index(i)]

        print(sorted_result)
Example #20
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize(args.imsize_pre),
        transforms.CenterCrop(args.imsize),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    if args.dataset == "coco":
        val_dset = CocoDataset(
            root=args.root_path,
            split="val",
            transform=transform,
        )
    val_loader = DataLoader(
        val_dset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.n_cpu,
        collate_fn=collater,
    )

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    model = SPVSE(
        len(vocab),
        args.emb_size,
        args.out_size,
        args.max_len,
        args.cnn_type,
        args.rnn_type,
        pad_idx=vocab.padidx,
        bos_idx=vocab.bosidx,
    )

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    model = model.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint),
          flush=True)
    ckpt = torch.load(args.checkpoint, map_location=device)
    model.load_state_dict(ckpt["model_state"])
    _ = validate(1000, val_loader, model, vocab, args)
Example #21
def build_vocab(datafile, threshold):
    counter = Counter()

    with open(datafile, 'r') as f:
        data = json.load(f)

    for caption in tqdm(list(map(lambda x: x['caption'], data))):
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

    tokens = [token for token, count in counter.items() if count >= threshold]
    vocab = Vocabulary()
    vocab.add_tokens(tokens)
    return vocab
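A hedged usage sketch; the JSON layout (a list of records with a "caption" field) is inferred from the list/map access above, and the file name is made up:

import json

with open("captions_sample.json", "w") as f:
    json.dump([{"caption": "A dog runs."}, {"caption": "A dog sleeps."}], f)

vocab = build_vocab("captions_sample.json", threshold=1)  # threshold=1 keeps every token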
Example #22
def creatVocab(datalist, is_tags):
    vocab = Vocabulary() if is_tags else TokenVocabulary()
    word_counts = Counter(chain(*datalist))
    valid_words = list(word_counts)
    valid_words = sorted(valid_words,
                         key=lambda x: word_counts[x],
                         reverse=True)
    valid_words += ['<pad>']
    for token in valid_words:
        vocab.add_token(token)
    if not is_tags:
        unk_index = vocab.add_token('<unk>')
        vocab.set_unk_index(unk_index)
    return vocab
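A short usage sketch, assuming Vocabulary and TokenVocabulary expose the add_token/set_unk_index methods the snippet already calls; the token lists are made-up data:

sentences = [["the", "cat", "sat"], ["the", "dog", "ran"]]
tags = [["DET", "NOUN", "VERB"], ["DET", "NOUN", "VERB"]]

token_vocab = creatVocab(sentences, is_tags=False)  # frequency-sorted tokens + '<pad>' + '<unk>'
tag_vocab = creatVocab(tags, is_tags=True)          # frequency-sorted tags + '<pad>' only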
Example #23
    def __init__(self, data_path, train=False, longest_sequence_length=None):

        data0 = load_sent(data_path + '.0')
        data1 = load_sent(data_path + '.1')
        print(
            f'\n------------------------ Building a Dataset ------------------------'
        )
        print(f'#sents of {data_path}.0 file 0: {len(data0)}'
              )  # list of list of tokenized words
        print(f'#sents of {data_path}.1 file 1: {len(data1)}'
              )  # list of list of tokenized words

        self.data_all = data0 + data1
        self.style_list = [0 for i in data0] + [
            1 for i in data1
        ]  # data0 is all neg, data1 is all pos

        # sorting all the data according to their seq lengths in descending order
        zip_item = zip(self.data_all, self.style_list)
        sorted_item = sorted(zip_item, key=lambda p: len(p[0]), reverse=True)
        tuple_item = zip(*sorted_item)
        self.data_all, self.style_list = [list(t) for t in tuple_item]

        print(f'len(self.data_all)  : {len(self.data_all)}')
        print(f'len(self.style_list): {len(self.style_list)}')

        if train:
            print('\ntrain: True')
            if not os.path.isfile(cfg.vocab):
                print(f'{cfg.vocab} does not exist')
                print('Building Vocab...')
                build_vocab(data0 + data1, cfg.vocab)
            else:
                print(f'{cfg.vocab} already exists')

        self.vocab = Vocabulary(cfg.vocab, cfg.embedding_file, cfg.embed_dim)
        print('\nvocabulary size:', self.vocab.size)
        print(
            f'vocabulary embedding matrix shape: {self.vocab.embedding.shape}')
        # print(type(self.vocab.embedding)) # np array

        self.longest_sequence_length = longest_sequence_length

        if longest_sequence_length is None:
            self.update_the_max_length()

        print(f'self.longest_sequence_length: {self.longest_sequence_length}')
        print(
            f'--------------------------------------------------------------------'
        )
Example #24
File: eval.py  Project: yiskw713/VSE
def main():
    args = parse_args()

    transform = transforms.Compose([
        transforms.Resize((args.imsize, args.imsize)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    if args.dataset == 'coco':
        val_dset = CocoDataset(root=args.root_path,
                               imgdir='val2017',
                               jsonfile='annotations/captions_val2017.json',
                               transform=transform,
                               mode='all')
    val_loader = DataLoader(val_dset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.n_cpu,
                            collate_fn=collater_eval)

    vocab = Vocabulary(max_len=args.max_len)
    vocab.load_vocab(args.vocab_path)

    imenc = ImageEncoder(args.out_size, args.cnn_type)
    capenc = CaptionEncoder(len(vocab), args.emb_size, args.out_size,
                            args.rnn_type)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    imenc = imenc.to(device)
    capenc = capenc.to(device)

    assert args.checkpoint is not None
    print("loading model and optimizer checkpoint from {} ...".format(
        args.checkpoint),
          flush=True)
    ckpt = torch.load(args.checkpoint)
    imenc.load_state_dict(ckpt["encoder_state"])
    capenc.load_state_dict(ckpt["decoder_state"])

    begin = time.time()
    dset = EmbedDataset(val_loader, imenc, capenc, vocab, args)
    print("database created | {} ".format(sec2str(time.time() - begin)),
          flush=True)

    retrieve_i2c(dset, val_dset, imenc, vocab, args)
    retrieve_c2i(dset, val_dset, capenc, vocab, args)
Example #25
def transform_text(text):
    tf.compat.v1.disable_eager_execution()
    args = load_arguments()
    ah = vars(args)
    ah['vocab'] = '../model/yelp.vocab'
    ah['model'] = '../model/model'
    ah['load_model'] = True
    ah['beam'] = 8
    ah['batch_size'] = 1
    inp = [text]

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.compat.v1.Session(config=config) as sess:
        model = create_model(sess, args, vocab)
        decoder = beam_search.Decoder(sess, args, vocab, model)
        '''test_losses = transfer(model, decoder, sess, args, vocab,
                               test0, test1, args.output)'''

        batches, order0, order1 = get_batches(inp, inp, vocab.word2id,
                                              args.batch_size)

        data0_tsf, data1_tsf = [], []
        losses = Accumulator(len(batches), ['loss', 'rec', 'adv', 'd0', 'd1'])

        # rec, tsf = decoder.rewrite(inp)

        # print(rec)
        # print(tsf)
        for batch in batches:
            rec, tsf = decoder.rewrite(batch)
            half = batch['size'] // 2
            print("rec:")
            print(rec)
            print("tsf:")
            print(tsf)
            data0_tsf += tsf[:half]
            data1_tsf += tsf[half:]
        n0, n1 = len(inp), len(inp)
        data0_tsf = reorder(order0, data0_tsf)[:n0]
        data1_tsf = reorder(order1, data1_tsf)[:n1]
        print(data0_tsf)
        print(data1_tsf)
Example #26
def run_evaluation(corpus_dir, save_dir, datafile, config_file):
    config = Config.from_json_file(config_file)
    vocab = Vocabulary("words")

    # set checkpoint to load from; set to None if starting from scratch
    load_filename = os.path.join(
        save_dir, config.model_name, config.corpus_name,
        '{}-{}_{}'.format(config.encoder_n_layers, config.decoder_n_layers,
                          config.hidden_size), 'last_checkpoint.tar')

    # if loading on the same machine the model trained on
    checkpoint = torch.load(load_filename)
    # if loading a model trained on gpu to cpu
    # checkpoint = torch.load(load_filename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint["en"]
    decoder_sd = checkpoint["de"]
    encoder_optimizer_sd = checkpoint["en_opt"]
    decoder_optimizer_sd = checkpoint["de_opt"]
    embedding_sd = checkpoint["embedding"]
    vocab.__dict__ = checkpoint["voc_dict"]

    print("Building encoder and decoder ...")
    # initialize word embeddings
    embedding = nn.Embedding(vocab.num_words, config.hidden_size)
    embedding.load_state_dict(embedding_sd)

    # initialize encoder and decoder models
    encoder = EncoderRNN(config.hidden_size, embedding,
                         config.encoder_n_layers, config.dropout)
    decoder = LuongAttnDecoderRNN(config.attn_model, embedding,
                                  config.hidden_size, vocab.num_words,
                                  config.decoder_n_layers, config.dropout)

    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

    # Set dropout layers to eval mode

    encoder.eval()
    decoder.eval()

    # Initialize search module
    searcher = GreedySearchDecoder(encoder, decoder)

    # Begin chatting (uncomment and run the following line to begin)
    evaluate_input(encoder, decoder, searcher, vocab)
Example #27
def create_vocab(qas, threshold=4):
    counter = Counter()
    for qa in qas:
        question = qa['question']
        answer = qa['answer']
        qtokens = nltk.tokenize.word_tokenize(question.lower())
        atokens = nltk.tokenize.word_tokenize(answer.lower())
        counter.update(qtokens)
        counter.update(atokens)

    # If a word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Adds the words to the vocabulary.
    vocab = Vocabulary()
    for word in words:
        vocab.add_word(word)
    return vocab
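A minimal sketch of calling create_vocab; it relies only on the qa['question'] and qa['answer'] fields the loop reads (nltk tokenization as above, made-up data):

qas = [
    {"question": "What color is the sky?", "answer": "blue"},
    {"question": "What color is the grass?", "answer": "green"},
]
vocab = create_vocab(qas, threshold=1)  # keep every token that appears at least once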
Example #28
def vectorize(question1, question2, is_duplicate):
    from vocab import Vocabulary

    v = Vocabulary()


    # Vectorize the data.
    input_texts = []
    target_texts = []
    input_characters = set()
    target_characters = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    for line in lines[: min(num_samples, len(lines) - 1)]:
        input_text, target_text = line.split('\t')
        # We use "tab" as the "start sequence" character
        # for the targets, and "\n" as "end sequence" character.
        target_text = '\t' + target_text + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)

    input_characters = sorted(list(input_characters))
    target_characters = sorted(list(target_characters))
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max([len(txt) for txt in input_texts])
    max_decoder_seq_length = max([len(txt) for txt in target_texts])

    print('Number of samples:', len(input_texts))
    print('Number of unique input tokens:', num_encoder_tokens)
    print('Number of unique output tokens:', num_decoder_tokens)
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)

    input_token_index = dict(
        [(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict(
        [(char, i) for i, char in enumerate(target_characters)])
Example #29
def generate(decoder,
             prime_str='A',
             predict_len=100,
             temperature=0.8,
             cuda=False,
             voc=None):
    if voc is None:
        voc = Vocabulary('voc')
        file, file_len = read_file('tiny.txt')  # Reads file as giant string

        for w in file:
            voc.add_word(w)

    hidden = decoder.init_hidden(1)
    prime_input = Variable(voc.word_tensor(prime_str).unsqueeze(0))
    if cuda:
        hidden = hidden.cuda()
        prime_input = prime_input.cuda()
    predicted = prime_str

    # Use priming string to "build up" hidden state
    for p in range(len(prime_str) - 1):
        _, hidden = decoder(prime_input[:, p], hidden)

    inp = prime_input[:, -1]

    for p in range(predict_len):
        output, hidden = decoder(inp, hidden)
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = torch.multinomial(output_dist, 1)[0]
        top_index = top_i.item()
        # Add predicted character to string and use as next input
        predicted_word = voc.to_word(top_index)
        predicted += predicted_word
        inp = torch.tensor([top_index])
        if cuda:
            inp = inp.cuda()

    return predicted
Example #30
    if args.train or args.latent_train:
        chosen = args.train if len(args.train) > len(args.latent_train) else \
          args.latent_train
        # train0 = load_sent(chosen + '.0', args.max_train_size)
        # train1 = load_sent(chosen + '.1', args.max_train_size)

        train0 = load_sent(chosen + 'formal', args.max_train_size)
        train1 = load_sent(chosen + 'informal', args.max_train_size)

        print('#sents of training file 0:', len(train0))
        print('#sents of training file 1:', len(train1))

        if not os.path.isfile(args.vocab):
            build_vocab(train0 + train1, args.vocab)

    vocab = Vocabulary(args.vocab, args.embedding, args.dim_emb)
    print('vocabulary size:', vocab.size)

    if args.dev or args.latent_dev:
        chosen = args.dev if len(args.dev) > len(args.latent_dev) else \
          args.latent_dev
        dev0 = load_sent(chosen + 'formal')
        dev1 = load_sent(chosen + 'informal')

    if args.test or args.latent_test:
        chosen = args.test if len(args.test) > len(args.latent_test) else \
          args.latent_test
        test0 = load_sent(chosen + 'formal')
        test1 = load_sent(chosen + 'informal')

    # get config object and set dynamic memory allocation