Example #1
def find_gun(idx):
    dl = Data_loader(labeled_only=True)
    if idx is None:
        for idx in range(100, 200):
            print(idx, dl.convert2unicode([idx]))
    else:
        print(idx, dl.convert2unicode([idx]))
Example #2
def get_pair_to_tids():
    print('Initializing Data Loader...')
    dl = Data_loader()
    test_ids = [tweet['tweet_id'] for tweet in dl.test_data()]
    pair2tids = {}
    for record in dl.all_data():
        if record['tweet_id'] not in test_ids:
            involved = set()
            involved.add(record['user_post'])
            if 'user_retweet' in record:
                involved.add(record['user_retweet'])
            if 'user_mentions' in record:
                for user in record['user_mentions']:
                    involved.add(user)
            involved = sorted(list(involved))

            for i, u1 in enumerate(involved):
                for u2 in involved[i + 1:]:
                    pair_id = str(u1) + '_' + str(u2)
                    if pair_id in pair2tids:
                        pair2tids[pair_id].append(record['tweet_id'])
                    else:
                        pair2tids[pair_id] = [record['tweet_id']]

    return pair2tids
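A minimal usage sketch (the user ids 3 and 17 below are hypothetical); keys are built by joining the sorted user ids with '_', smaller id first, exactly as in the loop above:

pair2tids = get_pair_to_tids()
pair_id = '3_17'  # hypothetical pair; format is str(u1) + '_' + str(u2) with u1 < u2
print(len(pair2tids.get(pair_id, [])), 'non-test tweets involve this user pair')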
Example #3
def make_word_emb_for_nn(extension):
    size = 300
    window = 5
    min_count = 5
    epochs = 20
    w2v_file = '../data/{0}_w2v_s{1}_w{2}_mc{3}_ep{4}.bin'.format(
        extension, size, window, min_count, epochs)
    wv = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
    print('Number of embeddings in {}: {}'.format(w2v_file, len(wv.vocab)))

    unicode2idx_pkl = 'unicode2idx_' + extension + '.pkl'
    unicode2idx = pickle.load(open(unicode2idx_pkl, 'rb'))  # complete vocab
    print('Size of complete vocab:', len(unicode2idx))

    dl = Data_loader(labeled_only=True)
    vocab_size = 40000
    dim = 300
    embeds = np.zeros((vocab_size, dim), dtype=np.float)
    embeds[1] = np.random.uniform(-0.25, 0.25, dim)
    not_in_vocab = 0
    not_in_w2v = 0
    unknown_idx = set()
    avg_vocab = np.zeros(dim)
    known_vocab = 0
    for dl_idx in range(2, vocab_size):
        unicode = dl.convert2unicode([dl_idx]).encode('utf-8')
        if unicode in unicode2idx:
            ext_idx = unicode2idx[unicode]
            if str(ext_idx) in wv.vocab:
                known_vocab += 1
                embeds[dl_idx] = wv[str(ext_idx)]
                avg_vocab += wv[str(ext_idx)]
            else:
                #this word is in the training corpus of the pretrained embedding but is thrown away
                #because its frequency does not reach min_count = 5
                not_in_w2v += 1
                unknown_idx.add(dl_idx)
                #embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
        else:
            #this word is not even in the training corpus of the pretrained embedding
            not_in_vocab += 1
            unknown_idx.add(dl_idx)
            #embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)

    # assign unknown words the average embedding of the known words
    avg_vocab /= known_vocab
    for unk_idx in unknown_idx:
        embeds[unk_idx] = avg_vocab

    print(not_in_vocab, 'not in vocab')
    print(not_in_w2v, 'not in word2vec (min_count=5)')
    missed = not_in_vocab + not_in_w2v
    print('Total: got {} embeddings, missed {}, out of {}'.format(
        vocab_size - missed, missed, vocab_size))

    save_file = 'word_emb_' + extension + '.np'
    np.savetxt(save_file, embeds)  #embeds is final embedding by idx
    print('Saved embeddings in', save_file)
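A hedged sketch of reading the saved matrix back; 'word' is a placeholder for whatever extension string was passed to make_word_emb_for_nn:

import numpy as np
extension = 'word'  # placeholder; must match the extension used when saving
embeds = np.loadtxt('word_emb_' + extension + '.np')
print(embeds.shape)  # expected (40000, 300)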
Example #4
    def __init__(self, dataset='labeled'):
        bilm_args = pkl.load(
            open('../experiments/ELMo_weights/4-23-9pm.param', 'rb'))
        bilm_args['experiment_path'] = 'ELMo_weights/4-23-9pm'
        self.bilm = create_bilm_from_args(bilm_args)
        self.dataset = dataset
        if dataset == 'labeled':
            self.dl = Data_loader(labeled_only=True, option='both')
        else:
            self.dl = Data_loader(labeled_only=False, option='both')
Example #5
File: seq_gan.py Project: zixzeus/seqgan
    def __init__(self):
        #########################################################################################
        #  Generator  Hyper-parameters
        #########################################################################################
        self.PRE_EMB_DIM = 32
        self.PRE_HIDDEN_DIM = 32
        self.SEQ_LENGTH = 64
        self.PRE_START_TOKEN = 0

        # self.PRE_EMB_DIM = 16
        # self.PRE_HIDDEN_DIM = 32
        # self.SEQ_LENGTH = 64

        self.PRE_EPOCH_NUM = 1

        self.PRE_TRAIN_ITER = 1  # generator
        self.PRE_SEED = 88

        self.batch_size = 16
        ##########################################################################################

        self.TOTAL_BATCH = 300
        # TOTAL_BATCH = 800

        #########################################################################################
        #  Discriminator  Hyper-parameters
        #########################################################################################
        self.dis_embedding_dim = 64
        self.dis_filter_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]
        self.dis_num_filters = [
            100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160
        ]
        self.dis_dropout_keep_prob = 0.75
        self.dis_l2_reg_lambda = 0.2

        # Training parameters
        # self.dis_num_epochs = 20
        self.dis_num_epochs = 1

        # dis_alter_epoch = 50
        self.dis_alter_epoch = 25

        self.positive_file = 'save/midi_trans.pkl'
        self.negative_file = 'target_generate/pretrain_small.pkl'
        # eval_file = 'target_generate/midi_trans_eval.pkl'

        self.generated_num = 40

        self.melody_size = 68
        self.RL_update_rate = 0.8

        self.data_loader = Data_loader()

        self.positive_x, self.positive_y = self.data_loader.load_data(
            self.positive_file, self.batch_size)
Example #6
def visualize_labeled_dataset():
    print('Initializing Data Loader')
    dl = Data_loader()

    tr, val, tst = dl.cv_data(fold_idx=0)
    labeled_tweets = tr + val + tst
    labeled_tweets = [(x['tweet_id'], x['label']) for x in labeled_tweets]
    print('Number of labeled tweets:', len(labeled_tweets))

    # plot_tweets(labeled_tweets, emb_type='splex', rep_mode='sum', include_sub=False, force_TSNE=True)
    plot_tweets(labeled_tweets, emb_type='w2v', rep_mode='avg')
Example #7
    def __init__(self, mode):
        assert mode in ['train', 'eval']

        if mode == 'train':
            dl = Data_loader(labeled_only=True, option='both')
            self.train_test_val_data = dl.cv_data(0)[0] + dl.cv_data(
                0)[1] + dl.cv_data(0)[2]
            self.train()
        else:
            model_dict = pkl.load(open("../data/logistic_regression.pkl",
                                       'rb'))
            self.thresholds = model_dict['thresholds']
            self.classifiers = model_dict['models']
Example #8
class Texture_dataset_val(Dataset):
    def __init__(self, data_size, textures_path, max_region=10):
        self.data_size = data_size
        self.data = Data_loader(textures_path, 1, max_region)
        self.preload = []
        for i in range(self.data_size):
            x, y, x_ref = self.data.get_batch_data()
            x = x[0]
            y = y[0]
            x_ref = x_ref[0]
            x = np.swapaxes(x, 1, 2)
            x = np.swapaxes(x, 0, 1)
            y = np.swapaxes(y, 1, 2)
            y = np.swapaxes(y, 0, 1)
            x_ref = np.swapaxes(x_ref, 1, 2)
            x_ref = np.swapaxes(x_ref, 0, 1)
            x, y, x_ref = x.astype('float32'), y.astype(
                'float32'), x_ref.astype('float32')
            self.preload.append((x, y, x_ref))

    def __len__(self):
        return self.data_size

    def __getitem__(self, idx):
        return self.preload[idx]
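A hedged usage sketch wrapping the preloaded dataset in a standard PyTorch DataLoader; 'textures/' is a placeholder path:

from torch.utils.data import DataLoader

val_set = Texture_dataset_val(data_size=64, textures_path='textures/')
val_loader = DataLoader(val_set, batch_size=8, shuffle=False)
x, y, x_ref = next(iter(val_loader))  # batched channel-first float32 tensors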
Example #9
def add_context_level_inputs(all_inputs, labeled_tweets, emb_to_sizes):
    print('Pre-popping cl:', len(all_inputs))
    no_cl_inputs = {}
    for input_name, arr in all_inputs.items():  # avoid shadowing the numpy alias np
        if not input_name.endswith('cl'):
            no_cl_inputs[input_name] = arr
    all_inputs = no_cl_inputs
    print('Post-popping cl:', len(all_inputs))

    sorted_tids = sorted([tweet['tweet_id'] for tweet in labeled_tweets])
    print('Initializing complete Data Loader...')
    complete_dl = Data_loader()
    tweet_dict, user_ct_tweets, id_to_location = None, None, None
    for emb_type in emb_to_sizes:
        for size in emb_to_sizes[emb_type]:
            cl = init_context(emb_type,
                              size,
                              complete_dl,
                              tweet_dict=tweet_dict,
                              user_ct_tweets=user_ct_tweets,
                              id_to_location=id_to_location)
            combine_modes = ['avg'] if emb_type == 'w2v' else ['sum']
            sorted_reps = [
                cl.get_representation(tid, modes=combine_modes)
                for tid in sorted_tids
            ]
            sorted_reps = StandardScaler().fit_transform(sorted_reps)
            all_inputs['{}_{}_cl'.format(emb_type, str(size))] = dict(
                (sorted_tids[i], sorted_reps[i])
                for i in range(len(sorted_tids)))
            if tweet_dict is None:
                tweet_dict, user_ct_tweets, id_to_location = cl.get_params()
    print('Post-adding cl:', len(all_inputs))
Example #10
def main(args):
    # params for data loader
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    print('Len of all data:', len(all_data))
    test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))

    mode = args['mode']
    assert (mode == 'w2v' or mode == 'svd' or mode == 'd2v')
    if mode == 'w2v':
        sentences = []
        for tweet in all_data:
            # need indices split
            if tweet['tweet_id'] not in test_ids and tweet[
                    'tweet_id'] not in ensemble_ids:
                sentences.append([str(x) for x in tweet['int_arr']])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_w2v_embs(sentences, option)
    elif mode == 'svd':
        sentences = []
        for i, tweet in enumerate(all_data):
            # need indices joined
            if tweet['tweet_id'] not in test_ids and tweet[
                    'tweet_id'] not in ensemble_ids:
                sentences.append(' '.join([str(x) for x in tweet['int_arr']]))
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_svd_embs(sentences, option)
    else:  # mode == d2v
        sentences = []
        tags = []
        for tweet in all_data:
            if tweet['tweet_id'] not in test_ids and tweet[
                    'tweet_id'] not in ensemble_ids:
                # need indices split and use id's as tags
                sentences.append([str(x) for x in tweet['int_arr']])
                tags.append([str(tweet['tweet_id'])])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        print('Check tag0:', tags[0])
        generate_d2v_embs(sentences, tags, option)
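A hedged usage sketch; the 'word' option mirrors the other examples here, and get_ensemble_tids plus the generate_*_embs helpers are assumed to be defined alongside main:

main({'option': 'word', 'mode': 'w2v'})  # or mode='svd' / 'd2v'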
Example #11
File: LIME.py Project: yandachen/GI_2019
    def __init__(self,
                 model_predict,
                 model_threshold,
                 output_dir,
                 input_format,
                 tweet_records,
                 truth_label,
                 pad_elmo=False,
                 unigram_observe_ids=None):
        #model_predict is a function that takes X and evaluates the score, abstracted to keep LIME decoupled from model
        #architecture, input format and use of context features.
        self.dl = Data_loader(labeled_only=True, option='both')
        self.model_predict = model_predict
        self.model_threshold = model_threshold
        self.output_dir = output_dir
        self.input_format = input_format
        self.pad_elmo = pad_elmo
        self.unigram_observe_ids = unigram_observe_ids

        self.tweet_records, self.truth_label = tweet_records, truth_label
        self.scores = self.model_predict(self.tweet_records).flatten()
        self.label_prediction = [
            1 if self.scores[idx] >= self.model_threshold else 0
            for idx in range(len(self.scores))
        ]
        idx_considered = [
            idx for idx in range(len(self.label_prediction))
            if self.label_prediction[idx] == 1
        ]
        self.tweet_id_considered = [
            self.tweet_records['tweet_id'][idx] for idx in idx_considered
        ]
        included_tweet_records = {}

        for key in self.tweet_records.keys():
            if key == 'word_content_input_elmo' and pad_elmo is False:
                included_tweet_records[key] = [
                    self.tweet_records[key][idx] for idx in idx_considered
                ]
            else:
                included_tweet_records[key] = np.array(
                    [self.tweet_records[key][idx] for idx in idx_considered])

        self.tweet_records = included_tweet_records
        self.scores = np.array([self.scores[idx] for idx in idx_considered])
Example #12
def make_word_emb_for_nn(extension):
    size = 300
    window = 5
    min_count = 5
    epochs = 20
    w2v_file = '../data/{0}_w2v_s{1}_w{2}_mc{3}_ep{4}.bin'.format(
        extension, size, window, min_count, epochs)
    wv = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
    print('Number of embeddings in {}: {}'.format(w2v_file, len(wv.vocab)))

    unicode2idx_pkl = 'unicode2idx_' + extension + '.pkl'
    unicode2idx = pickle.load(open(unicode2idx_pkl, 'rb'))  # complete vocab
    print('Size of complete vocab:', len(unicode2idx))

    dl = Data_loader(labeled_only=True)
    vocab_size = 40000
    dim = 300
    embeds = np.zeros((vocab_size, dim), dtype=np.float)
    embeds[1] = np.random.uniform(-0.25, 0.25, dim)
    not_in_vocab = 0
    not_in_w2v = 0
    for dl_idx in range(2, vocab_size):
        unicode = dl.convert2unicode([dl_idx]).encode('utf-8')
        if unicode in unicode2idx:
            ext_idx = unicode2idx[unicode]
            if str(ext_idx) in wv.vocab:
                embeds[dl_idx] = wv[str(ext_idx)]
            else:
                not_in_w2v += 1
                embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
        else:
            not_in_vocab += 1
            embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
    print(not_in_vocab, 'not in vocab')
    print(not_in_w2v, 'not in word2vec (min_count=5)')
    missed = not_in_vocab + not_in_w2v
    print('Total: got {} embeddings, missed {}, out of {}'.format(
        vocab_size - missed, missed, vocab_size))

    save_file = 'word_emb_' + extension + '.np'
    np.savetxt(save_file, embeds)
    print('Saved embeddings in', save_file)
Example #13
def main(_):
    pp.pprint(flags.FLAGS.__flags)
    with tf.Session() as sess:
        data_loader = Data_loader(FLAGS.embedding_file, FLAGS.embedding_size)
        q_network = Q_network(sess, FLAGS.embedding_size, FLAGS.step_size,
                              FLAGS.target_frequency, FLAGS.hidden_units,
                              FLAGS.final_units, FLAGS.greedy_ratio,
                              data_loader)
        replay = Replay(q_network, FLAGS.minibatch_size, FLAGS.replay_size)
        model = DQL(FLAGS.budget, data_loader, q_network, replay)
        model.run()
Example #14
    def load(self):
        transform = transforms.Compose([
            transforms.Resize(28),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5]),
        ])
        Ominiglot = Data_loader(path=self.path,
                                train=True,
                                transform=transform)
        characters = Ominiglot.characters
        return characters
Example #15
    def __init__(self, batch_size, image_size, lr, epoch):

        self.input_images = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
        self.pos_images = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
        self.neg_images = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
        # sparse label
        self.input_cate = tf.placeholder(tf.int32, [None])
        self.input_attr = tf.placeholder(tf.float32, [None, 1000])
        self.num_cate = 50
        self.num_attr = 1000
        self.dropout_keep_prob = tf.placeholder(tf.float32)
        self.g_step = tf.Variable(0)
        self.lr = tf.train.exponential_decay(lr, self.g_step, 50000, 0.98)
        self.batch_size = batch_size
        self.image_size = image_size
        self.max_epoch = epoch
        self.d_loader = Data_loader(root, cate_path, attr_path, partition_path, self.batch_size, image_size)

        self.trainX, self.train_pos, self.train_neg, self.trainY1, self.trainY2 = self.d_loader.get_queue(1000, 'train', epoch, True)
        self.valX, self.valY1, self.valY2 = self.d_loader.get_queue(1000, 'val', None, False)
        self.testX, self.testY1, self.testY2 = self.d_loader.get_queue(1000, 'test', None, False)
Example #16
def check_splex_top_k(mode, k=100, print_top=True):
    assert(mode == 'loss' or mode == 'agg' or mode == 'sub')
    splex = pickle.load(open('../data/splex_minmax_svd_word_s300_seeds_hc.pkl', 'rb'))
    if mode == 'loss':
        mode_idx = 0
    elif mode == 'agg':
        mode_idx = 1
    else:
        mode_idx = 2

    tuples = [(w, splex[w][mode_idx]) for w in splex]  # (word index, score); avoid shadowing the k argument
    tuples = sorted(tuples, key=lambda x: x[1], reverse=True)

    if print_top:
        dl = Data_loader(labeled_only=True)
        row_format = '{:<7}' * 2 + '{:<15}' * 2
        print(row_format.format('Rank', 'Index', 'Unicode', 'SPLex {} Score (minmax scaling)'.format(mode.capitalize())))
        for rank, (idx, score) in enumerate(tuples[:k]):
            print(row_format.format(rank, idx, dl.convert2unicode([int(idx)]), round(score, 5)))

    return tuples[:k]
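A hedged usage sketch: print and keep the 50 highest-scoring aggression entries.

top_agg = check_splex_top_k('agg', k=50, print_top=True)  # list of (word index, score) pairs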
Example #17
def add_inputs():
    save_file = 'all_inputs.pkl'
    all_inputs = pickle.load(open(save_file, 'rb'))

    print('Initializing labeled Data Loader...')
    labeled_dl = Data_loader(labeled_only=True)
    labeled_tweets = labeled_dl.all_data()

    # TIME INPUT
    # add_time_input(all_inputs)
    # print('Added time input, shape =', np.array(list(all_inputs['time'].values())).shape)

    # TWEET-LEVEL INPUTS
    # add_tweet_level_input(all_inputs, labeled_tweets, emb_type='splex')
    # print('Added splex_tl input, shape =', np.array(list(all_inputs['splex_tl'].values())).shape)

    # CONTEXT-LEVEL INPUTS
    # emb_to_sizes = {'w2v': [30, 60], 'splex': [2, 30]}
    # add_context_level_inputs(all_inputs, labeled_tweets, emb_to_sizes=emb_to_sizes)
    # print('Added context inputs')
    # print('w2v shape =', np.array(list(all_inputs['30_w2v_cl'].values())).shape)
    # print('splex shape =', np.array(list(all_inputs['2_splex_cl'].values())).shape)

    # USER INPUTS
    # add_user_inputs(all_inputs, labeled_tweets, num_users=300)
    # add_user_inputs(all_inputs, labeled_tweets, num_users=50)
    # print('Added user inputs: 50 users shape =', np.array(list(all_inputs['50_post_user_index'].values())).shape)

    # # PAIRWISE INPUT
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=1)
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=2)
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=3)
    # print('Added pairwise inputs')
    # print('splex shape =', np.array(list(all_inputs['pairwise_c1_splex'].values())).shape)
    # print('w2v shape =', np.array(list(all_inputs['pairwise_c1_w2v'].values())).shape)

    pickle.dump(all_inputs, open(save_file, 'wb'))
    print('Saved', save_file)
Example #18
def make_user_embeds(num_users):
    dim = 300
    embeds = np.random.rand(num_users, dim)

    print('Initializing Data Loader...')
    dl = Data_loader()
    tl = init_tl('w2v')
    test_ids = [tweet['tweet_id'] for tweet in dl.test_data()]
    pretrained_count = 0
    for user_idx in range(
            2, num_users
    ):  # reserve 0 for padding (i.e. no user), 1 for unknown user
        tweet_dicts = dl.tweets_by_user(
            user_idx)  # all tweets WRITTEN by this user
        if tweet_dicts is not None and len(tweet_dicts) > 0:
            tweet_count = 0
            all_tweets_sum = np.zeros(dim, dtype=np.float)
            for tweet_dict in tweet_dicts:
                tid = tweet_dict['tweet_id']
                if tid not in test_ids:
                    tweet_count += 1
                    tweet_avg = tl.get_representation(tid, mode='avg')
                    all_tweets_sum += tweet_avg
            if tweet_count > 0:
                pretrained_count += 1
                all_tweets_avg = all_tweets_sum / tweet_count
                embeds[user_idx] = all_tweets_avg
    print('Found tweets for {} out of {} users'.format(pretrained_count,
                                                       num_users - 2))

    embeds = StandardScaler().fit_transform(embeds)  # mean 0, variance 1
    embeds[0] = np.zeros(dim)  # make sure padding is all 0's

    save_file = str(num_users) + '_user_emb.np'
    np.savetxt(save_file, embeds)
    print('Saved embeddings in', save_file)
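A hedged usage sketch: build embeddings for 300 users, then reload the saved matrix (row 0 remains the all-zero padding vector):

import numpy as np

make_user_embeds(300)
user_embs = np.loadtxt('300_user_emb.np')
print(user_embs.shape)  # expected (300, 300)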
Example #19
def generate(args):

    data_loader = Data_loader(batch_size=1,
                              bias_init=args.bias_init,
                              train=False)

    model = Model(wemb_dim=args.wemb_dim,
                  hid_dim=args.hid_dim,
                  seq_len=data_loader.maxlen + 1,
                  learning_rate=args.learning_rate,
                  batch_size=1,
                  num_batches=data_loader.num_batches,
                  num_words=data_loader.num_words,
                  biivector=data_loader.biivector,
                  use_gru=args.use_gru,
                  inference=True)
    model.build()

    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    if args.model_path is not None:
        print('Using model: {}'.format(args.model_path))
        saver.restore(sess, args.model_path)
    else:
        latest_ckpt = tf.train.latest_checkpoint(args.logdir)
        print(
            'Did not provide model path, using latest: {}'.format(latest_ckpt))
        saver.restore(sess, latest_ckpt)

    feat = extract_single(sess, args.img_path, cnn='vgg')

    feed_dict = {
        model.ctx_ph: feat.reshape(-1, model.ctx_dim[0], model.ctx_dim[1])
    }
    captions_ix = sess.run(model.output_argmax, feed_dict=feed_dict)
    captions_wd = [data_loader.ixtoword[x] for x in captions_ix]
    try:
        captions_wd = ' '.join(captions_wd[:captions_wd.index('.')])
    except ValueError:
        captions_wd = ' '.join(captions_wd)
    print(captions_wd)
    print('Sentence generated.')
Example #20
class Texture_dataset_train(Dataset):
    def __init__(self, data_size, textures_path, max_region=10):
        self.data_size = data_size
        self.data = Data_loader(textures_path, 1, max_region)

    def __len__(self):
        return self.data_size

    def __getitem__(self, idx):
        x, y, x_ref = self.data.get_batch_data()
        x = x[0]
        y = y[0]
        x_ref = x_ref[0]
        x = np.swapaxes(x, 1, 2)
        x = np.swapaxes(x, 0, 1)
        y = np.swapaxes(y, 1, 2)
        y = np.swapaxes(y, 0, 1)
        x_ref = np.swapaxes(x_ref, 1, 2)
        x_ref = np.swapaxes(x_ref, 0, 1)
        x, y, x_ref = x.astype('float32'), y.astype('float32'), x_ref.astype(
            'float32')
        return x, y, x_ref
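A hedged usage sketch mirroring the validation dataset above ('textures/' is again a placeholder path); here each sample is synthesized on demand in __getitem__ rather than preloaded:

from torch.utils.data import DataLoader

train_set = Texture_dataset_train(data_size=1000, textures_path='textures/')
train_loader = DataLoader(train_set, batch_size=8, shuffle=True)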
Example #21
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

from data_loader import Data_loader

# Init data_loader
data_loader = Data_loader()
Xs_lsi, Ys_lsi, Xa_lsi, Ya_lsi = data_loader.load_LSI()

X, y = [], []
p = 2 * data_loader.num_topics

# Fetch all duplicate data
print('\n Duplicate data \n')
for q1, q2 in zip(Xs_lsi, Ys_lsi):
    try:
        q12 = np.concatenate((q1, q2), axis=0)
        q12 = q12.reshape(p)
        X.append(q12)
        y.append(1)
    except Exception:
        # skip question pairs whose LSI vectors cannot be concatenated and reshaped
        pass
        #print(q1.shape)
        #print(q2.shape)

# Fetch all NON duplicate data
print('\n Non Duplicate data \n')
for q1, q2 in zip(Xa_lsi, Ya_lsi):
Example #22
class Adversarial_generator():
    def __init__(self, dataset='labeled'):
        bilm_args = pkl.load(
            open('../experiments/ELMo_weights/4-23-9pm.param', 'rb'))
        bilm_args['experiment_path'] = 'ELMo_weights/4-23-9pm'
        self.bilm = create_bilm_from_args(bilm_args)
        self.dataset = dataset
        if dataset == 'labeled':
            self.dl = Data_loader(labeled_only=True, option='both')
        else:
            self.dl = Data_loader(labeled_only=False, option='both')

    def compute_log_prob(self, sentences_int_arr):
        tokens = self.bilm.dg.transform_sentences(sentences_int_arr)
        loss = self.bilm.compute_loss_on_data(tokens)
        return -loss

    def sanity_check(self):
        # For each pair of adjacent tweets, swap the words at every position and check that both tweets'
        # log probabilities decrease most of the time
        tweet_ids = list(self.dl.data['data'].keys())
        count_prob_decrease = 0  # number of times the revised sentence has lower probability than original sentence
        count_prob_increase = 0  # number of times the revised sentence has higher probability than original sentence
        prob_increase_samples = {}
        prob_increase_samples['original'] = []
        prob_increase_samples['revised'] = []
        prob_increase_samples['original score'] = []
        prob_increase_samples['revised score'] = []

        for idx in range(len(tweet_ids) - 1):
            tweet_id1 = tweet_ids[idx]
            tweet_id2 = tweet_ids[idx + 1]

            sentence1 = trim(
                self.dl.data['data'][tweet_id1]['word_padded_int_arr'])
            sentence2 = trim(
                self.dl.data['data'][tweet_id2]['word_padded_int_arr'])

            log_prob_sentence1 = self.compute_log_prob([sentence1])
            log_prob_sentence2 = self.compute_log_prob([sentence2])
            for word_idx in range(min(len(sentence1), len(sentence2))):
                # swap the two sentences word on this position
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                log_prob_revised_sentence1 = self.compute_log_prob([sentence1])
                log_prob_revised_sentence2 = self.compute_log_prob([sentence2])
                if log_prob_revised_sentence1 <= log_prob_sentence1:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence1))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence1)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence1)

                if log_prob_revised_sentence2 <= log_prob_sentence2:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence2))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence2)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence2)

                # recover the original sentence
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                if log_prob_revised_sentence1 > log_prob_sentence1:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence1))
                if log_prob_revised_sentence2 > log_prob_sentence2:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence2))

            if idx % 10 == 0:
                print("decrease: ", count_prob_decrease)
                print("increase: ", count_prob_increase)
            if idx > 100:
                break
        print("Probability decrease: ", count_prob_decrease)
        print("Probability increase: ", count_prob_increase)
        pd.DataFrame.from_dict(prob_increase_samples).to_csv(
            "../showable/ELMo_sanity_check.csv", index=False)

    def create_natural_sentences(self, mode, token, tweet_dicts):
        assert mode in ['insert', 'replace']
        token_id = self.dl.token2property[token.encode("utf-8")]['id']
        sentence_outputs = {}
        keys = [
            'original_sentence', 'generated_sentence', 'original_prob',
            'generated_prob', 'original_int_arr', 'generated_int_arr',
            'tweet_id'
        ]
        for key in keys:
            sentence_outputs[key] = []

        for tweet_id in tweet_dicts.keys():
            sentence = tweet_dicts[tweet_id]['word_padded_int_arr']
            num_words = sum([x != 0 for x in sentence])

            if mode == 'insert':
                if num_words == 50:  #already max length, cannot add more words
                    continue
                idx_range = range(num_words + 1)
            else:
                idx_range = range(num_words)

            sentence_outputs['original_int_arr'].append(np.array(sentence))
            original_sentence_unicode = self.dl.convert2unicode(trim(sentence))
            sentence_outputs['original_sentence'].append(
                original_sentence_unicode)
            original_sentence_prob = self.compute_log_prob([trim(sentence)])
            sentence_outputs['original_prob'].append(original_sentence_prob)
            sentence_outputs['tweet_id'].append(tweet_id)

            max_generated_prob = -np.inf
            most_natural_generated_sentence = None

            for pos in idx_range:
                if mode == 'insert':
                    generated_sentence = insert_element(
                        sentence, pos, token_id)
                else:
                    generated_sentence = np.array(sentence)
                    generated_sentence[pos] = token_id

                new_sentence_prob = self.compute_log_prob(
                    [trim(generated_sentence)])
                if new_sentence_prob > max_generated_prob:
                    max_generated_prob = new_sentence_prob
                    most_natural_generated_sentence = generated_sentence

            most_natural_revised_sentence_unicode = self.dl.convert2unicode(
                trim(most_natural_generated_sentence))
            sentence_outputs['generated_sentence'].append(
                most_natural_revised_sentence_unicode)
            sentence_outputs['generated_prob'].append(max_generated_prob)
            sentence_outputs['generated_int_arr'].append(
                np.array(most_natural_generated_sentence))

            if len(sentence_outputs['generated_int_arr']) % 100 == 0:
                print(len(sentence_outputs['generated_int_arr']))
                pkl.dump(
                    sentence_outputs,
                    open(
                        "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                        (mode, token, self.dataset), 'wb'))

        # order the records from largest to smallest probability increase
        prob_diff = np.array(sentence_outputs['generated_prob']) - np.array(
            sentence_outputs['original_prob'])
        sorted_idx = np.argsort(prob_diff)[::-1]
        for key in sentence_outputs.keys():
            sentence_outputs[key] = [
                sentence_outputs[key][idx] for idx in sorted_idx
            ]
        sentence_outputs['prob_change'] = np.array(
            sentence_outputs['generated_prob']) - np.array(
                sentence_outputs['original_prob'])
        pd.DataFrame.from_dict(sentence_outputs).to_csv(
            "../showable/%s_%s_natural_sentence_%s.csv" %
            (mode, token, self.dataset),
            index=False)
        pkl.dump(
            sentence_outputs,
            open(
                "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                (mode, token, self.dataset), 'wb'))

    def generate_natural_tweets(self, mode, token):
        tweet_dicts = self.dl.data['data']
        self.create_natural_sentences(mode, token, tweet_dicts)

    def evaluate_logistic_regression_prediction(self, mode):
        assert mode in ['score', 'binary']

        lr = Logistic_regr(mode='eval')
        generated_sentences = pkl.load(
            open("../data/insert_a_natural_sentence.pkl", 'rb'))
        original_int_arrs = generated_sentences['original_int_arr']
        generated_int_arrs = generated_sentences['generated_int_arr']

        if mode == 'score':
            original_agg_scores, original_loss_scores = lr.predict(
                original_int_arrs, mode="score")
            generated_agg_scores, generated_loss_scores = lr.predict(
                generated_int_arrs, mode="score")
            return original_agg_scores, original_loss_scores, generated_agg_scores, generated_loss_scores
        else:
            original_agg_labels, original_loss_labels = lr.predict(
                original_int_arrs, mode="binary")
            generated_agg_labels, generated_loss_labels = lr.predict(
                generated_int_arrs, mode="binary")
            new_agg_positive_tweet_ids = []
            for idx in range(len(original_agg_labels)):
                if original_agg_labels[idx] == 0 and generated_agg_labels[
                        idx] == 1:
                    new_agg_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            new_loss_positive_tweet_ids = []
            for idx in range(len(original_loss_labels)):
                if original_loss_labels[idx] == 0 and generated_loss_labels[
                        idx] == 1:
                    new_loss_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            return new_agg_positive_tweet_ids, new_loss_positive_tweet_ids

    def evaluate_model_prediction(self,
                                  token,
                                  model_id,
                                  run_idx,
                                  fold_idx,
                                  class_idx,
                                  mode='binary',
                                  top_num=800):
        generated_sentences = pkl.load(
            open(
                "../adversarial_data/insert_%s_natural_sentence_labeled.pkl" %
                token, 'rb'))
        original_int_arrs = generated_sentences['original_int_arr'][:top_num]
        revised_int_arrs = generated_sentences['generated_int_arr'][:top_num]
        tweet_ids = generated_sentences['tweet_id'][:top_num]

        all_tweets = self.dl.all_data()
        original_tweets = []
        generated_tweets = []

        tweetid2tweetidx = {}
        for idx in range(len(all_tweets)):
            tweetid2tweetidx[all_tweets[idx]['tweet_id']] = idx

        for idx in range(len(original_int_arrs)):
            tweet = all_tweets[tweetid2tweetidx[tweet_ids[idx]]]
            original_tweets.append(tweet)
            generated_tweet = deepcopy(tweet)
            assert np.all(generated_tweet['word_padded_int_arr'] ==
                          original_int_arrs[idx])
            generated_tweet['word_padded_int_arr'] = revised_int_arrs[idx]
            generated_tweet['word_int_arr'] = trim(
                generated_tweet['word_padded_int_arr'])
            generated_tweets.append(generated_tweet)

        generated_elmo_dir = None
        original_elmo_dir = None
        if model_id in (3, 4, 6, 7):  #DS ELMo
            generated_elmo_dir = "../adversarial_data/DS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/DS_ELMo_rep"
        if model_id == 5:  #NonDS ELMo
            generated_elmo_dir = "../adversarial_data/NonDS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/NonDS_ELMo_rep"

        load_model_tweet_dicts(model_id,
                               generated_tweets,
                               elmo_dir=generated_elmo_dir)
        generated_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        load_model_tweet_dicts(model_id,
                               original_tweets,
                               elmo_dir=original_elmo_dir)
        original_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        model = load_model(model_id, run_idx, fold_idx, class_idx)
        original_predictions = model.predict(original_tweet_X)
        generated_predictions = model.predict(generated_tweet_X)

        assert mode in ['score', 'binary']
        if mode == 'score':  # analyze prediction numerical score change
            return original_predictions, generated_predictions

        else:  # analyze label flipping
            threshold = get_model_info(num_runs=5,
                                       num_folds=5,
                                       num_models=model_id)['thresholds'][(
                                           model_id,
                                           run_idx)][class_idx][fold_idx]
            original_pred_labels = [
                1 if x >= threshold else 0 for x in original_predictions
            ]
            generated_pred_labels = [
                1 if x >= threshold else 0 for x in generated_predictions
            ]
            new_positive_tweet_ids = []
            new_negative_tweet_ids = []

            for idx in range(len(original_predictions)):
                if original_pred_labels[idx] == 0 and generated_pred_labels[
                        idx] == 1:
                    new_positive_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
                if original_pred_labels[idx] == 1 and generated_pred_labels[
                        idx] == 0:
                    new_negative_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
            return len(new_positive_tweet_ids)

    def evaluate_all_models(self, token, class_idx):
        results = {}
        for model_id in [1, 2, 18, 19]:
            flipped_counts = []
            for fold_idx in range(5):
                counts = []
                for run_idx in range(5):
                    counts.append(
                        self.evaluate_model_prediction(token, model_id,
                                                       run_idx, fold_idx,
                                                       class_idx))
                flipped_counts.append(sum(counts) / len(counts))
            results[model_id] = sum(flipped_counts) / len(flipped_counts)
        pkl.dump(
            results,
            open(
                "../adversarial_data/insert_%s_model_stats_labeled_121819.pkl"
                % token, 'wb'))
        analysis_dict = {}
        analysis_dict['model_id'] = sorted([x for x in results.keys()])
        analysis_dict['num_flipped_adversarials'] = [
            results[x] for x in analysis_dict['model_id']
        ]
        pd.DataFrame.from_dict(analysis_dict).to_csv(
            "../showable/adversarial_%s_stats_labeled.csv" % token,
            index=False)
Example #23
                        type=int,
                        default=5,
                        help='min_count for word2vec; ignored if svd')
    parser.add_argument('-ep',
                        '--epochs',
                        type=int,
                        default=20,
                        help='iterations for word2vec; ignored if svd')

    args = vars(parser.parse_args())
    print(args)

    # main(args)
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    all_tids = set([str(tweet['tweet_id']) for tweet in all_data])
    print(list(all_tids)[:10])
    print('Len of all data:', len(all_data))
    test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))
    print(list(ensemble_ids)[:10])
    assert (len(ensemble_ids.intersection(all_tids)) == 0)

    # w2v_file = '../data/w2v_word_s300_w5_mc5_ep20.bin'
    # svd_file = '../data/svd_word_s300.pkl'
    # sample_usage(w2v_file, svd_file)
Example #24
import torch

torch.manual_seed(args.seed)
use_cuda = torch.cuda.is_available() and not args.unuse_cuda

args.channel_dims = list(map(int, args.channel_dims.split(',')))

if use_cuda:
    torch.cuda.manual_seed(args.seed)

# ##############################################################################
# Load data
################################################################################
from data_loader import Data_loader

real_datas = Data_loader('data/', args.img_size, args.batch_size, use_cuda)

# ##############################################################################
# Build model
# ##############################################################################
import model

G = model.Generator(args.img_size, args.img_size, args.channel_dims, args.z_dim)
D = model.Discriminator(args.img_size, args.img_size, args.channel_dims, args.relu_leak)
if use_cuda:
    G, D = G.cuda(), D.cuda()

optimizer_D = torch.optim.Adam(D.parameters(), lr=args.lr, betas=(args.beta1, 0.999))
optimizer_G = torch.optim.Adam(G.parameters(), lr=args.lr, betas=(args.beta1, 0.999))
criterion = torch.nn.BCELoss()
Example #25
    def __init__(self, data_size, textures_path, max_region=10):
        self.data_size = data_size
        self.data = Data_loader(textures_path, 1, max_region)
Example #26
    vertical_all = []  #vertical
    f = plt.figure('vertical')
    for i in range(0, len(model.weights) - 1):
        vertical = []
        for j in range(0, len(model.weights) - 1):
            dst = np.sum(model.weights[i, j, :] - model.weights[i, j + 1, :])
            vertical.append(dst)
        vertical_all.append(vertical)
    plt.imshow(vertical_all, cmap=plt.cm.gray)
    plt.colorbar()
    f.savefig(graph_name + '_vertical_' + '.png')


## load data

d = Data_loader('dataset.txt')
inputs = d.dataset.T
(dim, count) = inputs.shape

## train model

rows = 30
cols = 30

metric = L_max

top_left = np.array((0, 0))
bottom_right = np.array((rows - 1, cols - 1))
lambda_s = metric(top_left, bottom_right) * 0.5  # there was *0.5
model = SOM(dim, rows, cols, inputs)
model.train(inputs,
Example #27
                        batch_size=batch_size,
                        sample=True),
            create_data(input_name2id2np,
                        val,
                        return_generators=return_generators,
                        batch_size=batch_size,
                        sample=False),
            create_data(input_name2id2np,
                        test,
                        return_generators=return_generators,
                        batch_size=batch_size,
                        sample=False))


if __name__ == '__main__':
    from data_loader import Data_loader
    option = 'word'
    max_len = 50
    vocab_size = 40000
    dl = Data_loader(vocab_size=vocab_size, max_len=max_len, option=option)
    fold_idx = 0
    data_fold = dl.cv_data(fold_idx)
    tr, val, test = data_fold
    print(tr[0])
    '''
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = create_clf_data(simplest_tweet2data,
                                                                           data_fold)
    for key in X_train:
        print(X_train[key])
    '''
Example #28
if use_cuda:
    torch.cuda.manual_seed(args.seed)

# ##############################################################################
# Load data
################################################################################
from data_loader import Data_loader

data = torch.load(args.data)
args.max_len = data["max_word_len"]
args.dict = data["dict"]
args.vocab_size = data["vocab_size"]

training_data = Data_loader("data/train2017/",
                            data['train']['imgs'],
                            data['train']['captions'],
                            args.max_len,
                            batch_size=args.batch_size,
                            is_cuda=use_cuda)

validation_data = Data_loader("data/val2017/",
                              data['valid']['imgs'],
                              data['valid']['captions'],
                              args.max_len,
                              batch_size=args.batch_size,
                              is_cuda=use_cuda,
                              evaluation=True)

# ##############################################################################
# Build model
# ##############################################################################
import model
Example #29
File: test.py Project: h-peng17/PIM
if options.mode == 'people':
    test = Test(config, options.ckpt_dir, id2word)
    test.init_test(Model(config), options.ckpt_index)

    print('请输入:')  # prompt: "Please enter:"
    line = ''
    is_continue = False
    while line != 'stop':
        line = input()
        pins = line.strip().split()
        query = np.ones([1, config.seq_len], dtype=np.int32)
        target_seq_len = [0]
        target_seq_len[0] = len(pins)
        for i, pin in enumerate(pins):
            if pin not in p2id:
                is_continue = True
                print('Invalid input!')
                break
            query[0][i] = p2id[pin]
        if is_continue:
            is_continue = False
            continue
        test.people_test_one_step(query, target_seq_len)

elif options.mode == 'computer':
    test_data_loader = Data_loader('test', config)
    test = Test(config, options.ckpt_dir, id2word, test_data_loader)
    test.init_test(Model(config))
    test._test()
Example #30

def generate_samples(sess, model, inv_charmap):
	samples = sess.run(model.fake_inputs)
	samples = np.argmax(samples, axis=2)
	decoded_samples = []
	for i in range(len(samples)):       # batch_size
		decoded = []
		for j in range(len(samples[i])):        # seq_length
			decoded.append(inv_charmap[samples[i][j]])
		decoded_samples.append(tuple(decoded))
	return decoded_samples


if __name__ == "__main__":
	data_loader = Data_loader(pm.batch_size)

	lines, charmap, inv_charmap = data_loader.load_datasets(
		max_length=pm.seq_length,
		example_num=pm.example_num,
		vocab_size=pm.vocab_size,
		data_path=pm.data_path
	)

	model = WGAN(pm.data_path, pm.batch_size, pm.epochs, pm.vocab_size, pm.seq_length, pm.embed_dims, pm.dis_epochs, pm.example_num, pm.learning_rate, charmap, pm.lamb)

	if len(pm.data_path) == 0:
		raise Exception("Please specify path to data directory in adver_train.py!")

	# model.print_model_settings(locals().copy())
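A hedged usage sketch for generate_samples, assuming a trained model and an open tf.Session named sess from the surrounding training code:

samples = generate_samples(sess, model, inv_charmap)
print(''.join(samples[0]))  # first decoded sequence in the batch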