Example #1
def text_precess(train_text_loc,
                 test_text_loc=None,
                 oracle_file=None) -> Tuple[int, int, dict, dict]:
    """
    Get sequence length and dict size plus word_index dict and index_word.
    This is done because it is long to compute it in large corpora \n
    :param train_text_loc: train file
    :param test_text_loc: test file
    :return: sequence length of the longest sentences, dict size (how many different words), dict from word to index
    """

    json_dict_wi_file = train_text_loc[:-4] + "_json_dict_wi.txt"
    json_dict_iw_file = train_text_loc[:-4] + "_json_dict_iw.txt"
    tokens_file = train_text_loc[:-4] + "_tokens.txt"

    try:
        word_index_dict = load_json(json_dict_wi_file)
        index_word_dict = load_json(json_dict_iw_file)
        (train_tokens, test_tokens) = load_pickle(tokens_file)

    except FileNotFoundError:
        # compute dictionaries and save them
        train_tokens = get_tokenized(train_text_loc)
        if test_text_loc is None:
            test_tokens = list()
        else:
            test_tokens = get_tokenized(test_text_loc)
        word_set = get_word_list(train_tokens + test_tokens)
        word_index_dict, index_word_dict = get_dict(word_set)
        write_json(json_dict_wi_file, word_index_dict)
        write_json(json_dict_iw_file, index_word_dict)
        write_pickle(tokens_file, train_tokens, test_tokens)

    if test_text_loc is None:
        sequence_len = len(max(train_tokens, key=len))
    else:
        sequence_len = max(len(max(train_tokens, key=len)),
                           len(max(test_tokens, key=len)))

    if oracle_file:
        try:
            # If the oracle file already exists, leave it untouched.
            with open(oracle_file, 'r'):
                pass
        except FileNotFoundError:
            with open(oracle_file, 'w') as outfile:
                outfile.write(
                    text_to_code(
                        train_tokens,  # + test_tokens
                        word_index_dict,
                        sequence_len))

    return sequence_len, len(
        word_index_dict) + 1, word_index_dict, index_word_dict
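
A minimal usage sketch with hypothetical file paths (the real call site appears in Example #7 below):

seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(
    'data/image_coco.txt',                       # training corpus
    'data/testdata/test_coco.txt',               # test corpus
    oracle_file='samples/oracle_image_coco.txt')
# seq_len: length of the longest sentence across both corpora;
# vocab_size: number of distinct words + 1.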
Example #2
    def get_sentences(self):
        generated_sentences = []
        data = load_json(self.json_file)
        for el in data['sentences']:
            generated_sentences.append(el['generated_sentence'])

        return generated_sentences
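
Examples #2 through #6 all read the same JSON layout. A sketch of the expected structure as a Python literal, with made-up sentence values (the field names are taken from the code; `real_starting` is used by `get_score` in Example #4):

example = {
    "sentences": [
        {"generated_sentence": "a dog runs on the beach",
         "real_starting": "a dog is running along the shore"},
    ]
}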
Example #3
    def get_reference(self):
        reference = list()
        json_obj = load_json(self.test_data)
        for hypothesis in json_obj['sentences']:
            text = nltk.word_tokenize(hypothesis['generated_sentence'])
            reference.append(text)

        return reference[:self.sample_size]
Example #4
    def get_score(self):
        ngram = self.gram
        bleu = list()
        # Uniform weights over 1..n-grams (standard BLEU-n weighting).
        weight = tuple(1. / ngram for _ in range(ngram))
        json_sentences = load_json(self.json_file)
        for ind, sentences in enumerate(json_sentences['sentences']):
            if ind >= self.sample_size:
                break
            generated = sentences['generated_sentence']
            ground_truth = sentences['real_starting']
            bleu.append(calc_bleu(ground_truth, generated, weight))

        return sum(bleu) / len(bleu)
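
calc_bleu is an external helper not shown here. A minimal sketch of a compatible implementation, assuming it wraps NLTK's sentence-level BLEU (the smoothing choice is a guess):

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calc_bleu(reference, hypothesis, weight):
    # Sentence-level BLEU with the uniform n-gram weights built above.
    return sentence_bleu(reference, hypothesis, weight,
                         smoothing_function=SmoothingFunction().method1)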
Example #5
    def computeDistanceJaccard(self):
        jaccard_values = []

        data = load_json(self.json_file)
        for el in data['sentences']:
            generated_sentence = el['generated_sentence']
            values = []
            for real_sent in self.all_sentences:
                values.append(distJaccard(generated_sentence, real_sent))

            jaccard_values.append(1 - max(values))

        return np.mean(jaccard_values)
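
distJaccard is also an external helper. Since the code takes 1 - max(values), it presumably returns a similarity, so each generated sentence is scored by its distance to the closest real sentence. A minimal token-set sketch under that assumption:

def distJaccard(sentence_a, sentence_b):
    # Jaccard similarity between the two sentences' word sets.
    a, b = set(sentence_a.split()), set(sentence_b.split())
    return len(a & b) / len(a | b) if (a | b) else 0.0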
Example #6
    def get_bleu(self):
        ngram = self.gram
        bleu = list()
        reference = self.get_reference()
        weight = tuple(1. / ngram for _ in range(ngram))
        json_obj = load_json(self.test_data)
        json_cropped = json_obj['sentences'][:self.sample_size]
        # Score the generated sentences in parallel; ProcessPoolExecutor can
        # be swapped for ThreadPoolExecutor if process startup is a concern.
        with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
            for bleu_res in executor.map(procedure, json_cropped,
                                         repeat(reference), repeat(weight)):
                bleu.append(bleu_res)
        return sum(bleu) / len(bleu)
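
procedure (the map function above) and repeat (from itertools) are assumed imports; procedure must be a module-level function so ProcessPoolExecutor can pickle it. A minimal sketch that mirrors the per-sentence scoring in get_score and the tokenization in get_reference:

from itertools import repeat

import nltk

def procedure(sentence, reference, weight):
    # Tokenize one generated sentence and score it against the shared
    # reference list.
    hypothesis = nltk.word_tokenize(sentence['generated_sentence'])
    return calc_bleu(reference, hypothesis, weight)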
Example #7
def main():
    args = parser.parse_args()
    pp.pprint(vars(args))
    config = vars(args)

    # train with different datasets
    if args.dataset == 'oracle':
        oracle_model = OracleLstm(num_vocabulary=args.vocab_size,
                                  batch_size=args.batch_size,
                                  emb_dim=args.gen_emb_dim,
                                  hidden_dim=args.hidden_dim,
                                  sequence_length=args.seq_len,
                                  start_token=args.start_token)
        oracle_loader = OracleDataLoader(args.batch_size, args.seq_len)
        gen_loader = OracleDataLoader(args.batch_size, args.seq_len)

        generator = models.get_generator(args.g_architecture,
                                         vocab_size=args.vocab_size,
                                         batch_size=args.batch_size,
                                         seq_len=args.seq_len,
                                         gen_emb_dim=args.gen_emb_dim,
                                         mem_slots=args.mem_slots,
                                         head_size=args.head_size,
                                         num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim,
                                         start_token=args.start_token)
        discriminator = models.get_discriminator(args.d_architecture,
                                                 batch_size=args.batch_size,
                                                 seq_len=args.seq_len,
                                                 vocab_size=args.vocab_size,
                                                 dis_emb_dim=args.dis_emb_dim,
                                                 num_rep=args.num_rep,
                                                 sn=args.sn)
        oracle_train(generator, discriminator, oracle_model, oracle_loader,
                     gen_loader, config)

    elif args.dataset in ['image_coco', 'emnlp_news']:
        # custom dataset selected
        data_file = resources_path(args.data_dir,
                                   '{}.txt'.format(args.dataset))
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir,
                                   'oracle_{}.txt'.format(args.dataset))

        data_dir = resources_path(config['data_dir'])
        if args.dataset == 'image_coco':
            test_file = os.path.join(data_dir, 'testdata/test_coco.txt')
        elif args.dataset == 'emnlp_news':
            test_file = os.path.join(data_dir, 'testdata/test_emnlp.txt')
        else:
            raise NotImplementedError('Unknown dataset!')

        if args.dataset == 'emnlp_news':
            data_file, lda_file = create_subsample_data_file(data_file)
        else:
            lda_file = data_file

        seq_len, vocab_size, word_index_dict, index_word_dict = text_precess(
            data_file, test_file, oracle_file=oracle_file)
        config['seq_len'] = seq_len
        config['vocab_size'] = vocab_size
        print('seq_len: %d, vocab_size: %d' % (seq_len, vocab_size))

        config['topic_loss_weight'] = args.topic_loss_weight

        if config['LSTM']:
            if config['topic']:
                topic_number = config['topic_number']
                oracle_loader = RealDataTopicLoader(args.batch_size,
                                                    args.seq_len)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = topic_number
                oracle_loader.set_dictionaries(word_index_dict,
                                               index_word_dict)

                generator = models.get_generator(
                    args.g_architecture,
                    vocab_size=vocab_size,
                    batch_size=args.batch_size,
                    seq_len=seq_len,
                    gen_emb_dim=args.gen_emb_dim,
                    mem_slots=args.mem_slots,
                    head_size=args.head_size,
                    num_heads=args.num_heads,
                    hidden_dim=args.hidden_dim,
                    start_token=args.start_token,
                    TopicInMemory=args.topic_in_memory,
                    NoTopic=args.no_topic)

                from real.real_gan.real_topic_train_NoDiscr import real_topic_train_NoDiscr
                real_topic_train_NoDiscr(generator, oracle_loader, config,
                                         args)
            else:
                generator = models.get_generator(args.g_architecture,
                                                 vocab_size=vocab_size,
                                                 batch_size=args.batch_size,
                                                 seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim,
                                                 mem_slots=args.mem_slots,
                                                 head_size=args.head_size,
                                                 num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim,
                                                 start_token=args.start_token)

                oracle_loader = RealDataLoader(args.batch_size, args.seq_len)
                oracle_loader.set_dictionaries(word_index_dict,
                                               index_word_dict)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = config['topic_number']

                from real.real_gan.real_train_NoDiscr import real_train_NoDiscr
                real_train_NoDiscr(generator, oracle_loader, config, args)
        else:
            if config['topic']:
                topic_number = config['topic_number']
                oracle_loader = RealDataTopicLoader(args.batch_size,
                                                    args.seq_len)
                oracle_loader.set_dataset(args.dataset)
                oracle_loader.set_files(data_file, lda_file)
                oracle_loader.topic_num = topic_number
                oracle_loader.set_dictionaries(word_index_dict,
                                               index_word_dict)

                generator = models.get_generator(
                    args.g_architecture,
                    vocab_size=vocab_size,
                    batch_size=args.batch_size,
                    seq_len=seq_len,
                    gen_emb_dim=args.gen_emb_dim,
                    mem_slots=args.mem_slots,
                    head_size=args.head_size,
                    num_heads=args.num_heads,
                    hidden_dim=args.hidden_dim,
                    start_token=args.start_token,
                    TopicInMemory=args.topic_in_memory,
                    NoTopic=args.no_topic)

                discriminator = models.get_discriminator(
                    args.d_architecture,
                    batch_size=args.batch_size,
                    seq_len=seq_len,
                    vocab_size=vocab_size,
                    dis_emb_dim=args.dis_emb_dim,
                    num_rep=args.num_rep,
                    sn=args.sn)

                if not args.no_topic:
                    topic_discriminator = models.get_topic_discriminator(
                        args.topic_architecture,
                        batch_size=args.batch_size,
                        seq_len=seq_len,
                        vocab_size=vocab_size,
                        dis_emb_dim=args.dis_emb_dim,
                        num_rep=args.num_rep,
                        sn=args.sn,
                        discriminator=discriminator)
                else:
                    topic_discriminator = None
                from real.real_gan.real_topic_train import real_topic_train
                real_topic_train(generator, discriminator, topic_discriminator,
                                 oracle_loader, config, args)
            else:
                generator = models.get_generator(args.g_architecture,
                                                 vocab_size=vocab_size,
                                                 batch_size=args.batch_size,
                                                 seq_len=seq_len,
                                                 gen_emb_dim=args.gen_emb_dim,
                                                 mem_slots=args.mem_slots,
                                                 head_size=args.head_size,
                                                 num_heads=args.num_heads,
                                                 hidden_dim=args.hidden_dim,
                                                 start_token=args.start_token)

                discriminator = models.get_discriminator(
                    args.d_architecture,
                    batch_size=args.batch_size,
                    seq_len=seq_len,
                    vocab_size=vocab_size,
                    dis_emb_dim=args.dis_emb_dim,
                    num_rep=args.num_rep,
                    sn=args.sn)

                oracle_loader = RealDataLoader(args.batch_size, args.seq_len)

                from real.real_gan.real_train import real_train
                real_train(generator, discriminator, oracle_loader, config,
                           args)

    elif args.dataset in ['Amazon_Attribute']:
        # custom dataset selected
        data_dir = resources_path(config['data_dir'], "Amazon_Attribute")
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir,
                                   'oracle_{}.txt'.format(args.dataset))
        train_file = os.path.join(data_dir, 'train.csv')
        dev_file = os.path.join(data_dir, 'dev.csv')
        test_file = os.path.join(data_dir, 'test.csv')

        # create_tokens_files(data_files=[train_file, dev_file, test_file])
        config_file = load_json(os.path.join(data_dir, 'config.json'))
        config = {**config, **config_file}  # merge dictionaries

        from real.real_gan.loaders.amazon_loader import RealDataAmazonLoader
        oracle_loader = RealDataAmazonLoader(args.batch_size, args.seq_len)
        oracle_loader.create_batches(
            data_file=[train_file, dev_file, test_file])
        oracle_loader.model_index_word_dict = load_json(
            join(data_dir, 'index_word_dict.json'))
        oracle_loader.model_word_index_dict = load_json(
            join(data_dir, 'word_index_dict.json'))

        generator = models.get_generator("amazon_attribute",
                                         vocab_size=config['vocabulary_size'],
                                         batch_size=args.batch_size,
                                         seq_len=config['seq_len'],
                                         gen_emb_dim=args.gen_emb_dim,
                                         mem_slots=args.mem_slots,
                                         head_size=args.head_size,
                                         num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim,
                                         start_token=args.start_token,
                                         user_num=config['user_num'],
                                         product_num=config['product_num'],
                                         rating_num=5)

        discriminator = models.get_discriminator(
            "amazon_attribute",
            batch_size=args.batch_size,
            seq_len=config['seq_len'],
            vocab_size=config['vocabulary_size'],
            dis_emb_dim=args.dis_emb_dim,
            num_rep=args.num_rep,
            sn=args.sn)

        from real.real_gan.amazon_attribute_train import amazon_attribute_train
        amazon_attribute_train(generator, discriminator, oracle_loader, config,
                               args)
    elif args.dataset in ['CustomerReviews', 'imdb']:
        from real.real_gan.loaders.custom_reviews_loader import RealDataCustomerReviewsLoader
        from real.real_gan.customer_reviews_train import customer_reviews_train
        # custom dataset selected
        if args.dataset == 'CustomerReviews':
            data_dir = resources_path(config['data_dir'], "MovieReviews", "cr")
        elif args.dataset == 'imdb':
            data_dir = resources_path(config['data_dir'], "MovieReviews",
                                      'movie', 'sstb')
        else:
            raise ValueError
        sample_dir = resources_path(config['sample_dir'])
        oracle_file = os.path.join(sample_dir,
                                   'oracle_{}.txt'.format(args.dataset))
        train_file = os.path.join(data_dir, 'train.csv')

        # create_tokens_files(data_files=[train_file, dev_file, test_file])
        config_file = load_json(os.path.join(data_dir, 'config.json'))
        config = {**config, **config_file}  # merge dictionaries

        oracle_loader = RealDataCustomerReviewsLoader(args.batch_size,
                                                      args.seq_len)
        oracle_loader.create_batches(data_file=[train_file])
        oracle_loader.model_index_word_dict = load_json(
            join(data_dir, 'index_word_dict.json'))
        oracle_loader.model_word_index_dict = load_json(
            join(data_dir, 'word_index_dict.json'))

        generator = models.get_generator("CustomerReviews",
                                         vocab_size=config['vocabulary_size'],
                                         batch_size=args.batch_size,
                                         start_token=args.start_token,
                                         seq_len=config['seq_len'],
                                         gen_emb_dim=args.gen_emb_dim,
                                         mem_slots=args.mem_slots,
                                         head_size=args.head_size,
                                         num_heads=args.num_heads,
                                         hidden_dim=args.hidden_dim,
                                         sentiment_num=config['sentiment_num'])

        discriminator_positive = models.get_discriminator(
            "CustomerReviews",
            scope="discriminator_positive",
            batch_size=args.batch_size,
            seq_len=config['seq_len'],
            vocab_size=config['vocabulary_size'],
            dis_emb_dim=args.dis_emb_dim,
            num_rep=args.num_rep,
            sn=args.sn)

        discriminator_negative = models.get_discriminator(
            "CustomerReviews",
            scope="discriminator_negative",
            batch_size=args.batch_size,
            seq_len=config['seq_len'],
            vocab_size=config['vocabulary_size'],
            dis_emb_dim=args.dis_emb_dim,
            num_rep=args.num_rep,
            sn=args.sn)

        customer_reviews_train(generator, discriminator_positive,
                               discriminator_negative, oracle_loader, config,
                               args)
    else:
        raise NotImplementedError('{}: unknown dataset!'.format(args.dataset))

    print("RUN FINISHED")
    return