Example No. 1
def get_pair_to_tids():
    print('Initializing Data Loader...')
    dl = Data_loader()
    test_ids = {tweet['tweet_id'] for tweet in dl.test_data()}
    pair2tids = {}
    for record in dl.all_data():
        if record['tweet_id'] not in test_ids:
            involved = set()
            involved.add(record['user_post'])
            if 'user_retweet' in record:
                involved.add(record['user_retweet'])
            if 'user_mentions' in record:
                for user in record['user_mentions']:
                    involved.add(user)
            involved = sorted(involved)

            for i, u1 in enumerate(involved):
                for u2 in involved[i + 1:]:
                    pair_id = str(u1) + '_' + str(u2)
                    if pair_id in pair2tids:
                        pair2tids[pair_id].append(record['tweet_id'])
                    else:
                        pair2tids[pair_id] = [record['tweet_id']]

    return pair2tids
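
A minimal usage sketch for the helper above (assuming Data_loader is importable from the surrounding project; the printed values are illustrative):

if __name__ == '__main__':
    # hypothetical driver: build the pair -> tweet-id index and inspect one entry
    pair2tids = get_pair_to_tids()
    print('Number of user pairs:', len(pair2tids))
    sample_pair = next(iter(pair2tids))  # keys look like '<u1>_<u2>' with the two user ids in sorted order
    print(sample_pair, '->', len(pair2tids[sample_pair]), 'tweet ids')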
Example No. 2
def main(args):
    # params for data loader
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    print('Len of all data:', len(all_data))
    test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))

    mode = args['mode']
    assert mode in ('w2v', 'svd', 'd2v')
    if mode == 'w2v':
        sentences = []
        for tweet in all_data:
            # need indices split
            if tweet['tweet_id'] not in test_ids and tweet[
                    'tweet_id'] not in ensemble_ids:
                sentences.append([str(x) for x in tweet['int_arr']])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_w2v_embs(sentences, option)
    elif mode == 'svd':
        sentences = []
        for tweet in all_data:
            # need indices joined
            if tweet['tweet_id'] not in test_ids and tweet[
                    'tweet_id'] not in ensemble_ids:
                sentences.append(' '.join([str(x) for x in tweet['int_arr']]))
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_svd_embs(sentences, option)
    else:  # mode == d2v
        sentences = []
        tags = []
        for tweet in all_data:
            if tweet['tweet_id'] not in test_ids and tweet[
                    'tweet_id'] not in ensemble_ids:
                # need indices split and use id's as tags
                sentences.append([str(x) for x in tweet['int_arr']])
                tags.append([str(tweet['tweet_id'])])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        print('Check tag0:', tags[0])
        generate_d2v_embs(sentences, tags, option)
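
A hedged sketch of calling main() directly; 'option' and 'mode' are the only keys read above, and the values shown ('word', 'w2v') are assumptions rather than documented defaults:

if __name__ == '__main__':
    # hypothetical call: train w2v embeddings on all non-test, non-ensemble tweets
    main({'option': 'word', 'mode': 'w2v'})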
Example No. 3
def add_inputs():
    save_file = 'all_inputs.pkl'
    all_inputs = pickle.load(open(save_file, 'rb'))

    print('Initializing labeled Data Loader...')
    labeled_dl = Data_loader(labeled_only=True)
    labeled_tweets = labeled_dl.all_data()

    # TIME INPUT
    # add_time_input(all_inputs)
    # print('Added time input, shape =', np.array(list(all_inputs['time'].values())).shape)

    # TWEET-LEVEL INPUTS
    # add_tweet_level_input(all_inputs, labeled_tweets, emb_type='splex')
    # print('Added splex_tl input, shape =', np.array(list(all_inputs['splex_tl'].values())).shape)

    # CONTEXT-LEVEL INPUTS
    # emb_to_sizes = {'w2v': [30, 60], 'splex': [2, 30]}
    # add_context_level_inputs(all_inputs, labeled_tweets, emb_to_sizes=emb_to_sizes)
    # print('Added context inputs')
    # print('w2v shape =', np.array(list(all_inputs['30_w2v_cl'].values())).shape)
    # print('splex shape =', np.array(list(all_inputs['2_splex_cl'].values())).shape)

    # USER INPUTS
    # add_user_inputs(all_inputs, labeled_tweets, num_users=300)
    # add_user_inputs(all_inputs, labeled_tweets, num_users=50)
    # print('Added user inputs: 50 users shape =', np.array(list(all_inputs['50_post_user_index'].values())).shape)

    # PAIRWISE INPUT
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=1)
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=2)
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=3)
    # print('Added pairwise inputs')
    # print('splex shape =', np.array(list(all_inputs['pairwise_c1_splex'].values())).shape)
    # print('w2v shape =', np.array(list(all_inputs['pairwise_c1_w2v'].values())).shape)

    pickle.dump(all_inputs, open(save_file, 'wb'))
    print('Saved', save_file)
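
A short sketch for inspecting the pickle written above; which keys exist depends on which of the commented-out blocks were re-enabled before saving:

import pickle

all_inputs = pickle.load(open('all_inputs.pkl', 'rb'))
print('Available input types:', sorted(all_inputs.keys()))
# e.g., if the user-input block was enabled:
# print(len(all_inputs['50_post_user_index']))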
Example No. 4
class Adversarial_generator():
    def __init__(self, dataset='labeled'):
        bilm_args = pkl.load(
            open('../experiments/ELMo_weights/4-23-9pm.param', 'rb'))
        bilm_args['experiment_path'] = 'ELMo_weights/4-23-9pm'
        self.bilm = create_bilm_from_args(bilm_args)
        self.dataset = dataset
        if dataset == 'labeled':
            self.dl = Data_loader(labeled_only=True, option='both')
        else:
            self.dl = Data_loader(labeled_only=False, option='both')

    def compute_log_prob(self, sentences_int_arr):
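        # score a batch of (trimmed) int-array sentences with the bidirectional LM;
        # the loss is negated so that higher return values mean more natural sentences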
        tokens = self.bilm.dg.transform_sentences(sentences_int_arr)
        loss = self.bilm.compute_loss_on_data(tokens)
        return -loss

    def sanity_check(self):
        # For each pair of adjacent tweets, swap the words at every position and check whether
        # both tweets' log probabilities decrease most of the time
        tweet_ids = list(self.dl.data['data'].keys())
        count_prob_decrease = 0  # number of times the revised sentence has lower probability than original sentence
        count_prob_increase = 0  # number of times the revised sentence has higher probability than original sentence
        prob_increase_samples = {
            'original': [],
            'revised': [],
            'original score': [],
            'revised score': []
        }

        for idx in range(len(tweet_ids) - 1):
            tweet_id1 = tweet_ids[idx]
            tweet_id2 = tweet_ids[idx + 1]

            sentence1 = trim(
                self.dl.data['data'][tweet_id1]['word_padded_int_arr'])
            sentence2 = trim(
                self.dl.data['data'][tweet_id2]['word_padded_int_arr'])

            log_prob_sentence1 = self.compute_log_prob([sentence1])
            log_prob_sentence2 = self.compute_log_prob([sentence2])
            for word_idx in range(min(len(sentence1), len(sentence2))):
                # swap the two sentences' words at this position
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                log_prob_revised_sentence1 = self.compute_log_prob([sentence1])
                log_prob_revised_sentence2 = self.compute_log_prob([sentence2])
                if log_prob_revised_sentence1 <= log_prob_sentence1:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence1))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence1)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence1)

                if log_prob_revised_sentence2 <= log_prob_sentence2:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence2))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence2)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence2)

                # recover the original sentence
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                if log_prob_revised_sentence1 > log_prob_sentence1:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence1))
                if log_prob_revised_sentence2 > log_prob_sentence2:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence2))

            if idx % 10 == 0:
                print("decrease: ", count_prob_decrease)
                print("increase: ", count_prob_increase)
            if idx > 100:
                break
        print("Probability decrease: ", count_prob_decrease)
        print("Probability increase: ", count_prob_increase)
        pd.DataFrame.from_dict(prob_increase_samples).to_csv(
            "../showable/ELMo_sanity_check.csv", index=False)

    def create_natural_sentences(self, mode, token, tweet_dicts):
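        # For every tweet, try `token` at each position (inserted or substituted, depending on `mode`),
        # keep the single variant the LM scores as most natural, and save the results
        # sorted from largest to smallest probability gain.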
        assert mode in ['insert', 'replace']
        token_id = self.dl.token2property[token.encode("utf-8")]['id']
        sentence_outputs = {}
        keys = [
            'original_sentence', 'generated_sentence', 'original_prob',
            'generated_prob', 'original_int_arr', 'generated_int_arr',
            'tweet_id'
        ]
        for key in keys:
            sentence_outputs[key] = []

        for tweet_id in tweet_dicts.keys():
            sentence = tweet_dicts[tweet_id]['word_padded_int_arr']
            num_words = sum([x != 0 for x in sentence])

            if mode == 'insert':
                if num_words == 50:  #already max length, cannot add more words
                    continue
                idx_range = range(num_words + 1)
            else:
                idx_range = range(num_words)

            sentence_outputs['original_int_arr'].append(np.array(sentence))
            original_sentence_unicode = self.dl.convert2unicode(trim(sentence))
            sentence_outputs['original_sentence'].append(
                original_sentence_unicode)
            original_sentence_prob = self.compute_log_prob([trim(sentence)])
            sentence_outputs['original_prob'].append(original_sentence_prob)
            sentence_outputs['tweet_id'].append(tweet_id)

            max_generated_prob = -np.inf
            most_natural_generated_sentence = None

            for pos in idx_range:
                if mode == 'insert':
                    generated_sentence = insert_element(
                        sentence, pos, token_id)
                else:
                    generated_sentence = np.array(sentence)
                    generated_sentence[pos] = token_id

                new_sentence_prob = self.compute_log_prob(
                    [trim(generated_sentence)])
                if new_sentence_prob > max_generated_prob:
                    max_generated_prob = new_sentence_prob
                    most_natural_generated_sentence = generated_sentence

            most_natural_revised_sentence_unicode = self.dl.convert2unicode(
                trim(most_natural_generated_sentence))
            sentence_outputs['generated_sentence'].append(
                most_natural_revised_sentence_unicode)
            sentence_outputs['generated_prob'].append(max_generated_prob)
            sentence_outputs['generated_int_arr'].append(
                np.array(most_natural_generated_sentence))

            if len(sentence_outputs['generated_int_arr']) % 100 == 0:
                print(len(sentence_outputs['generated_int_arr']))
                pkl.dump(
                    sentence_outputs,
                    open(
                        "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                        (mode, token, self.dataset), 'wb'))

        # order the records from largest to smallest probability increase
        prob_diff = np.array(sentence_outputs['generated_prob']) - np.array(
            sentence_outputs['original_prob'])
        sorted_idx = np.argsort(prob_diff)[::-1]
        for key in sentence_outputs.keys():
            sentence_outputs[key] = [
                sentence_outputs[key][idx] for idx in sorted_idx
            ]
        sentence_outputs['prob_change'] = np.array(
            sentence_outputs['generated_prob']) - np.array(
                sentence_outputs['original_prob'])
        pd.DataFrame.from_dict(sentence_outputs).to_csv(
            "../showable/%s_%s_natural_sentence_%s.csv" %
            (mode, token, self.dataset),
            index=False)
        pkl.dump(
            sentence_outputs,
            open(
                "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                (mode, token, self.dataset), 'wb'))

    def generate_natural_tweets(self, mode, token):
        tweet_dicts = self.dl.data['data']
        self.create_natural_sentences(mode, token, tweet_dicts)

    def evaluate_logistic_regression_prediction(self, mode):
        assert mode in ['score', 'binary']

        lr = Logistic_regr(mode='eval')
        generated_sentences = pkl.load(
            open("../data/insert_a_natural_sentence.pkl", 'rb'))
        original_int_arrs = generated_sentences['original_int_arr']
        generated_int_arrs = generated_sentences['generated_int_arr']

        if mode == 'score':
            original_agg_scores, original_loss_scores = lr.predict(
                original_int_arrs, mode="score")
            generated_agg_scores, generated_loss_scores = lr.predict(
                generated_int_arrs, mode="score")
            return original_agg_scores, original_loss_scores, generated_agg_scores, generated_loss_scores
        else:
            original_agg_labels, original_loss_labels = lr.predict(
                original_int_arrs, mode="binary")
            generated_agg_labels, generated_loss_labels = lr.predict(
                generated_int_arrs, mode="binary")
            new_agg_positive_tweet_ids = []
            for idx in range(len(original_agg_labels)):
                if original_agg_labels[idx] == 0 and generated_agg_labels[
                        idx] == 1:
                    new_agg_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            new_loss_positive_tweet_ids = []
            for idx in range(len(original_loss_labels)):
                if original_loss_labels[idx] == 0 and generated_loss_labels[
                        idx] == 1:
                    new_loss_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            return new_agg_positive_tweet_ids, new_loss_positive_tweet_ids

    def evaluate_model_prediction(self,
                                  token,
                                  model_id,
                                  run_idx,
                                  fold_idx,
                                  class_idx,
                                  mode='binary',
                                  top_num=800):
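        # Compare model predictions on the original vs. token-inserted tweets (the top_num most
        # natural ones); in 'binary' mode, return how many predictions flip from 0 to 1 under the
        # per-run, per-fold threshold.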
        generated_sentences = pkl.load(
            open(
                "../adversarial_data/insert_%s_natural_sentence_labeled.pkl" %
                token, 'rb'))
        original_int_arrs = generated_sentences['original_int_arr'][:top_num]
        revised_int_arrs = generated_sentences['generated_int_arr'][:top_num]
        tweet_ids = generated_sentences['tweet_id'][:top_num]

        all_tweets = self.dl.all_data()
        original_tweets = []
        generated_tweets = []

        tweetid2tweetidx = {}
        for idx in range(len(all_tweets)):
            tweetid2tweetidx[all_tweets[idx]['tweet_id']] = idx

        for idx in range(len(original_int_arrs)):
            tweet = all_tweets[tweetid2tweetidx[tweet_ids[idx]]]
            original_tweets.append(tweet)
            generated_tweet = deepcopy(tweet)
            assert np.all(generated_tweet['word_padded_int_arr'] ==
                          original_int_arrs[idx])
            generated_tweet['word_padded_int_arr'] = revised_int_arrs[idx]
            generated_tweet['word_int_arr'] = trim(
                generated_tweet['word_padded_int_arr'])
            generated_tweets.append(generated_tweet)

        generated_elmo_dir = None
        original_elmo_dir = None
        if model_id in (3, 4, 6, 7):  #DS ELMo
            generated_elmo_dir = "../adversarial_data/DS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/DS_ELMo_rep"
        if model_id == 5:  #NonDS ELMo
            generated_elmo_dir = "../adversarial_data/NonDS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/NonDS_ELMo_rep"

        load_model_tweet_dicts(model_id,
                               generated_tweets,
                               elmo_dir=generated_elmo_dir)
        generated_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        load_model_tweet_dicts(model_id,
                               original_tweets,
                               elmo_dir=original_elmo_dir)
        original_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        model = load_model(model_id, run_idx, fold_idx, class_idx)
        original_predictions = model.predict(original_tweet_X)
        generated_predictions = model.predict(generated_tweet_X)

        assert mode in ['score', 'binary']
        if mode == 'score':  # analyze prediction numerical score change
            return original_predictions, generated_predictions

        else:  # analyze label flipping
            threshold = get_model_info(num_runs=5,
                                       num_folds=5,
                                       num_models=model_id)['thresholds'][(
                                           model_id,
                                           run_idx)][class_idx][fold_idx]
            original_pred_labels = [
                1 if x >= threshold else 0 for x in original_predictions
            ]
            generated_pred_labels = [
                1 if x >= threshold else 0 for x in generated_predictions
            ]
            new_positive_tweet_ids = []
            new_negative_tweet_ids = []

            for idx in range(len(original_predictions)):
                if original_pred_labels[idx] == 0 and generated_pred_labels[
                        idx] == 1:
                    new_positive_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
                if original_pred_labels[idx] == 1 and generated_pred_labels[
                        idx] == 0:
                    new_negative_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
            return len(new_positive_tweet_ids)

    def evaluate_all_models(self, token, class_idx):
        results = {}
        for model_id in [1, 2, 18, 19]:
            flipped_counts = []
            for fold_idx in range(5):
                counts = []
                for run_idx in range(5):
                    counts.append(
                        self.evaluate_model_prediction(token, model_id,
                                                       run_idx, fold_idx,
                                                       class_idx))
                flipped_counts.append(sum(counts) / len(counts))
            results[model_id] = sum(flipped_counts) / len(flipped_counts)
        pkl.dump(
            results,
            open(
                "../adversarial_data/insert_%s_model_stats_labeled_121819.pkl"
                % token, 'wb'))
        analysis_dict = {}
        analysis_dict['model_id'] = sorted([x for x in results.keys()])
        analysis_dict['num_flipped_adversarials'] = [
            results[x] for x in analysis_dict['model_id']
        ]
        pd.DataFrame.from_dict(analysis_dict).to_csv(
            "../showable/adversarial_%s_stats_labeled.csv" % token,
            index=False)
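
A hedged end-to-end sketch of the class above; the token 'a' mirrors the insert_a_natural_sentence.pkl path used in evaluate_logistic_regression_prediction, and class_idx=0 is an illustrative value:

generator = Adversarial_generator(dataset='labeled')
generator.sanity_check()                                       # spot-check that word swaps usually lower the LM score
generator.generate_natural_tweets(mode='insert', token='a')    # writes ../adversarial_data/insert_a_natural_sentence_labeled.pkl
generator.evaluate_all_models(token='a', class_idx=0)          # averages label-flip counts across runs and folds per model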
Example No. 5
                        default=5,
                        help='min_count for word2vec; ignored if svd')
    parser.add_argument('-ep',
                        '--epochs',
                        type=int,
                        default=20,
                        help='iterations for word2vec; ignored if svd')

    args = vars(parser.parse_args())
    print(args)

    # main(args)
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    all_tids = set([str(tweet['tweet_id']) for tweet in all_data])
    print(list(all_tids)[:10])
    print('Len of all data:', len(all_data))
    test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))
    print(list(ensemble_ids)[:10])
    assert (len(ensemble_ids.intersection(all_tids)) == 0)

    # w2v_file = '../data/w2v_word_s300_w5_mc5_ep20.bin'
    # svd_file = '../data/svd_word_s300.pkl'
    # sample_usage(w2v_file, svd_file)

    # test_sents = [['2', '254', '440', '192', '94', '57', '72', '77'],
Example No. 6
        Contextifier.SELF, Contextifier.RETWEET, Contextifier.MENTION,
        Contextifier.RETWEET_MENTION
    ]
    context_size = 2
    context_hl_ratio = 0.5
    context_combine = 'avg'
    tl_combine = 'sum'

    print('Initializing Contextifier...')
    contextifier = Contextifier(tweet_level, post_types, context_size,
                                context_hl_ratio, context_combine, tl_combine)

    print('Loading Data...')
    option = 'word'
    max_len = 53
    vocab_size = 30000
    dl = Data_loader(vocab_size=vocab_size, max_len=max_len, option=option)
    dl.all_data()

    print('Creating contexts...')
    context = contextifier.assemble_context(dl.all_data())
    contextifier.set_context(*context)

    # Only necessary if you want to write them all to a file.
    # Can be done "on-demand" with .get_context_embedding()

    print('Writing context embeddings...')
    contextifier.write_context_embeddings(dl)

    # Alternatively, to load from a file, do:
    # contextifier.from_file('../data/context_emb_5_avg_rtFalse_menTrue_rtmenFalse_hl1.0_.csv')
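
The closing comments mention an on-demand path via .get_context_embedding(); a hedged sketch, assuming the method accepts a tweet id (the exact signature is not shown in this snippet):

# for tweet in dl.all_data()[:5]:
#     emb = contextifier.get_context_embedding(tweet['tweet_id'])
#     print(tweet['tweet_id'], emb)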