# Third-party imports used throughout this module. Project-local helpers
# (Data_loader, trim, insert_element, pad_elmo_representation, Logistic_regr,
# create_bilm_from_args, load_model, load_model_tweet_dicts, get_model_info,
# id2word) are assumed to be imported from the project's own modules.
import pickle
import pickle as pkl
from collections import defaultdict
from copy import deepcopy

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors


def find_gun(idx):
    """Print the unicode form of vocabulary index `idx`, or of indices 100-199 if idx is None."""
    dl = Data_loader(labeled_only=True)
    if idx is None:
        for idx in range(100, 200):
            print(idx, dl.convert2unicode([idx]))
    else:
        print(idx, dl.convert2unicode([idx]))
def make_word_emb_for_nn(extension):
    """Build a (vocab_size x dim) embedding matrix from pretrained word2vec vectors,
    backing off to the average of the known embeddings for unknown words, and save it."""
    size = 300
    window = 5
    min_count = 5
    epochs = 20
    w2v_file = '../data/{0}_w2v_s{1}_w{2}_mc{3}_ep{4}.bin'.format(
        extension, size, window, min_count, epochs)
    wv = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
    print('Number of embeddings in {}: {}'.format(w2v_file, len(wv.vocab)))  # .vocab is the gensim 3.x API

    unicode2idx_pkl = 'unicode2idx_' + extension + '.pkl'
    unicode2idx = pickle.load(open(unicode2idx_pkl, 'rb'))  # complete vocab
    print('Size of complete vocab:', len(unicode2idx))

    dl = Data_loader(labeled_only=True)
    vocab_size = 40000
    dim = 300
    embeds = np.zeros((vocab_size, dim), dtype=np.float64)  # row 0 stays all-zero (padding)
    embeds[1] = np.random.uniform(-0.25, 0.25, dim)

    not_in_vocab = 0
    not_in_w2v = 0
    unknown_idx = set()
    avg_vocab = np.zeros(dim)
    known_vocab = 0
    for dl_idx in range(2, vocab_size):
        unicode = dl.convert2unicode([dl_idx]).encode('utf-8')
        if unicode in unicode2idx:
            ext_idx = unicode2idx[unicode]
            if str(ext_idx) in wv.vocab:
                known_vocab += 1
                embeds[dl_idx] = wv[str(ext_idx)]
                avg_vocab += wv[str(ext_idx)]
            else:
                # This word is in the training corpus of the pretrained embedding but was
                # thrown away because its frequency does not reach min_count = 5.
                not_in_w2v += 1
                unknown_idx.add(dl_idx)
                # embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
        else:
            # This word is not even in the training corpus of the pretrained embedding.
            not_in_vocab += 1
            unknown_idx.add(dl_idx)
            # embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)

    # Assign unknown vocab entries to the average of the known embeddings.
    avg_vocab /= known_vocab
    for unk_idx in unknown_idx:
        embeds[unk_idx] = avg_vocab

    print(not_in_vocab, 'not in vocab')
    print(not_in_w2v, 'not in word2vec (min_count=5)')
    missed = not_in_vocab + not_in_w2v
    print('Total: got {} embeddings, missed {}, out of {}'.format(
        vocab_size - missed, missed, vocab_size))

    save_file = 'word_emb_' + extension + '.np'
    np.savetxt(save_file, embeds)  # embeds is the final embedding matrix indexed by vocab idx
    print('Saved embeddings in', save_file)
# Earlier variant of make_word_emb_for_nn that backs off to random vectors for unknown
# words instead of the average of known embeddings. Being defined later in the module,
# this definition shadows the one above at import time.
def make_word_emb_for_nn(extension):
    size = 300
    window = 5
    min_count = 5
    epochs = 20
    w2v_file = '../data/{0}_w2v_s{1}_w{2}_mc{3}_ep{4}.bin'.format(
        extension, size, window, min_count, epochs)
    wv = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
    print('Number of embeddings in {}: {}'.format(w2v_file, len(wv.vocab)))

    unicode2idx_pkl = 'unicode2idx_' + extension + '.pkl'
    unicode2idx = pickle.load(open(unicode2idx_pkl, 'rb'))  # complete vocab
    print('Size of complete vocab:', len(unicode2idx))

    dl = Data_loader(labeled_only=True)
    vocab_size = 40000
    dim = 300
    embeds = np.zeros((vocab_size, dim), dtype=np.float64)
    embeds[1] = np.random.uniform(-0.25, 0.25, dim)

    not_in_vocab = 0
    not_in_w2v = 0
    for dl_idx in range(2, vocab_size):
        unicode = dl.convert2unicode([dl_idx]).encode('utf-8')
        if unicode in unicode2idx:
            ext_idx = unicode2idx[unicode]
            if str(ext_idx) in wv.vocab:
                embeds[dl_idx] = wv[str(ext_idx)]
            else:
                not_in_w2v += 1
                embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
        else:
            not_in_vocab += 1
            embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)

    print(not_in_vocab, 'not in vocab')
    print(not_in_w2v, 'not in word2vec (min_count=5)')
    missed = not_in_vocab + not_in_w2v
    print('Total: got {} embeddings, missed {}, out of {}'.format(
        vocab_size - missed, missed, vocab_size))

    save_file = 'word_emb_' + extension + '.np'
    np.savetxt(save_file, embeds)
    print('Saved embeddings in', save_file)
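# Minimal sketch (not part of the original pipeline) of how the matrix written by
# make_word_emb_for_nn can be read back: np.loadtxt is the counterpart of the
# np.savetxt call above. The extension value is a placeholder and must match
# whatever was passed to make_word_emb_for_nn; the shape check mirrors the
# vocab_size and dim constants used above.
def _load_word_emb_example(extension):
    embeds = np.loadtxt('word_emb_' + extension + '.np')
    # Row 0 is the all-zero padding vector; row 1 is the randomly initialised vector;
    # remaining rows follow Data_loader's vocabulary indexing.
    assert embeds.shape == (40000, 300)
    return embeds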
def check_splex_top_k(mode, k=100, print_top=True):
    assert mode in ('loss', 'agg', 'sub')
    splex = pickle.load(
        open('../data/splex_minmax_svd_word_s300_seeds_hc.pkl', 'rb'))
    if mode == 'loss':
        mode_idx = 0
    elif mode == 'agg':
        mode_idx = 1
    else:
        mode_idx = 2
    tuples = [(word_idx, splex[word_idx][mode_idx]) for word_idx in splex]
    tuples = sorted(tuples, key=lambda x: x[1], reverse=True)
    if print_top:
        dl = Data_loader(labeled_only=True)
        row_format = '{:<7}' * 2 + '{:<15}' * 2
        print(row_format.format(
            'Rank', 'Index', 'Unicode',
            'SPLex {} Score (minmax scaling)'.format(mode.capitalize())))
        for rank, (idx, score) in enumerate(tuples[:k]):
            print(row_format.format(rank, idx, dl.convert2unicode([int(idx)]),
                                    round(score, 5)))
    return tuples[:k]
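# Usage sketch: print (and return) the 20 highest-scoring tokens on the SPLex 'loss'
# dimension. The mode and k values are illustrative.
def _splex_top_loss_example():
    return check_splex_top_k('loss', k=20, print_top=True)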
class Adversarial_generator():
    """Generates adversarial tweets by inserting or replacing a token at the position that
    maximizes ELMo log-probability, and measures how often model predictions flip."""

    def __init__(self, dataset='labeled'):
        bilm_args = pkl.load(
            open('../experiments/ELMo_weights/4-23-9pm.param', 'rb'))
        bilm_args['experiment_path'] = 'ELMo_weights/4-23-9pm'
        self.bilm = create_bilm_from_args(bilm_args)
        self.dataset = dataset
        if dataset == 'labeled':
            self.dl = Data_loader(labeled_only=True, option='both')
        else:
            self.dl = Data_loader(labeled_only=False, option='both')

    def compute_log_prob(self, sentences_int_arr):
        tokens = self.bilm.dg.transform_sentences(sentences_int_arr)
        loss = self.bilm.compute_loss_on_data(tokens)
        return -loss

    def sanity_check(self):
        # For each pair of adjacent tweets, swap the words at every position and check that
        # both tweets' log probabilities decrease most of the time.
        tweet_ids = list(self.dl.data['data'].keys())
        count_prob_decrease = 0  # times the revised sentence has lower probability than the original
        count_prob_increase = 0  # times the revised sentence has higher probability than the original
        prob_increase_samples = {}
        prob_increase_samples['original'] = []
        prob_increase_samples['revised'] = []
        prob_increase_samples['original score'] = []
        prob_increase_samples['revised score'] = []
        for idx in range(len(tweet_ids) - 1):
            tweet_id1 = tweet_ids[idx]
            tweet_id2 = tweet_ids[idx + 1]
            sentence1 = trim(
                self.dl.data['data'][tweet_id1]['word_padded_int_arr'])
            sentence2 = trim(
                self.dl.data['data'][tweet_id2]['word_padded_int_arr'])
            log_prob_sentence1 = self.compute_log_prob([sentence1])
            log_prob_sentence2 = self.compute_log_prob([sentence2])

            for word_idx in range(min(len(sentence1), len(sentence2))):
                # Swap the two sentences' words at this position.
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                log_prob_revised_sentence1 = self.compute_log_prob([sentence1])
                log_prob_revised_sentence2 = self.compute_log_prob([sentence2])

                if log_prob_revised_sentence1 <= log_prob_sentence1:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence1))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence1)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence1)

                if log_prob_revised_sentence2 <= log_prob_sentence2:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence2))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence2)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence2)

                # Recover the original sentences.
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]

                if log_prob_revised_sentence1 > log_prob_sentence1:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence1))
                if log_prob_revised_sentence2 > log_prob_sentence2:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence2))

            if idx % 10 == 0:
                print("decrease: ", count_prob_decrease)
                print("increase: ", count_prob_increase)
            if idx > 100:
                break

        print("Probability decrease: ", count_prob_decrease)
        print("Probability increase: ", count_prob_increase)
        pd.DataFrame.from_dict(prob_increase_samples).to_csv(
            "../showable/ELMo_sanity_check.csv", index=False)

    def create_natural_sentences(self, mode, token, tweet_dicts):
        assert mode in ['insert', 'replace']
        token_id = self.dl.token2property[token.encode("utf-8")]['id']
        sentence_outputs = {}
        keys = [
            'original_sentence', 'generated_sentence', 'original_prob',
            'generated_prob', 'original_int_arr', 'generated_int_arr',
            'tweet_id'
        ]
        for key in keys:
            sentence_outputs[key] = []

        for tweet_id in tweet_dicts.keys():
            sentence = tweet_dicts[tweet_id]['word_padded_int_arr']
            num_words = sum([x != 0 for x in sentence])
            if mode == 'insert':
                if num_words == 50:  # already max length, cannot add more words
                    continue
                idx_range = range(num_words + 1)
            else:
                idx_range = range(num_words)

            sentence_outputs['original_int_arr'].append(np.array(sentence))
            original_sentence_unicode = self.dl.convert2unicode(trim(sentence))
            sentence_outputs['original_sentence'].append(
                original_sentence_unicode)
            original_sentence_prob = self.compute_log_prob([trim(sentence)])
            sentence_outputs['original_prob'].append(original_sentence_prob)
            sentence_outputs['tweet_id'].append(tweet_id)

            # Keep the candidate (insertion or replacement position) with the highest
            # log-probability under the bidirectional language model.
            max_generated_prob = -np.inf
            most_natural_generated_sentence = None
            for pos in idx_range:
                if mode == 'insert':
                    generated_sentence = insert_element(
                        sentence, pos, token_id)
                else:
                    generated_sentence = np.array(sentence)
                    generated_sentence[pos] = token_id
                new_sentence_prob = self.compute_log_prob(
                    [trim(generated_sentence)])
                if new_sentence_prob > max_generated_prob:
                    max_generated_prob = new_sentence_prob
                    most_natural_generated_sentence = generated_sentence

            most_natural_revised_sentence_unicode = self.dl.convert2unicode(
                trim(most_natural_generated_sentence))
            sentence_outputs['generated_sentence'].append(
                most_natural_revised_sentence_unicode)
            sentence_outputs['generated_prob'].append(max_generated_prob)
            sentence_outputs['generated_int_arr'].append(
                np.array(most_natural_generated_sentence))

            if len(sentence_outputs['generated_int_arr']) % 100 == 0:
                print(len(sentence_outputs['generated_int_arr']))
                pkl.dump(
                    sentence_outputs,
                    open(
                        "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                        (mode, token, self.dataset), 'wb'))

        # Order the records from maximum probability increase to minimum probability increase.
        prob_diff = np.array(sentence_outputs['generated_prob']) - np.array(
            sentence_outputs['original_prob'])
        sorted_idx = np.argsort(prob_diff)[::-1]
        for key in sentence_outputs.keys():
            sentence_outputs[key] = [
                sentence_outputs[key][idx] for idx in sorted_idx
            ]
        sentence_outputs['prob_change'] = np.array(
            sentence_outputs['generated_prob']) - np.array(
                sentence_outputs['original_prob'])
        pd.DataFrame.from_dict(sentence_outputs).to_csv(
            "../showable/%s_%s_natural_sentence_%s.csv" %
            (mode, token, self.dataset), index=False)
        pkl.dump(
            sentence_outputs,
            open(
                "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                (mode, token, self.dataset), 'wb'))

    def generate_natural_tweets(self, mode, token):
        tweet_dicts = self.dl.data['data']
        self.create_natural_sentences(mode, token, tweet_dicts)

    def evaluate_logistic_regression_prediction(self, mode):
        assert mode in ['score', 'binary']
        lr = Logistic_regr(mode='eval')
        generated_sentences = pkl.load(
            open("../data/insert_a_natural_sentence.pkl", 'rb'))
        original_int_arrs = generated_sentences['original_int_arr']
        generated_int_arrs = generated_sentences['generated_int_arr']
        if mode == 'score':
            original_agg_scores, original_loss_scores = lr.predict(
                original_int_arrs, mode="score")
            generated_agg_scores, generated_loss_scores = lr.predict(
                generated_int_arrs, mode="score")
            return (original_agg_scores, original_loss_scores,
                    generated_agg_scores, generated_loss_scores)
        else:
            original_agg_labels, original_loss_labels = lr.predict(
                original_int_arrs, mode="binary")
            generated_agg_labels, generated_loss_labels = lr.predict(
                generated_int_arrs, mode="binary")
            new_agg_positive_tweet_ids = []
            for idx in range(len(original_agg_labels)):
                if original_agg_labels[idx] == 0 and generated_agg_labels[idx] == 1:
                    new_agg_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            new_loss_positive_tweet_ids = []
            for idx in range(len(original_loss_labels)):
                if original_loss_labels[idx] == 0 and generated_loss_labels[idx] == 1:
                    new_loss_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            return new_agg_positive_tweet_ids, new_loss_positive_tweet_ids

    def evaluate_model_prediction(self, token, model_id, run_idx, fold_idx,
                                  class_idx, mode='binary', top_num=800):
        generated_sentences = pkl.load(
            open(
                "../adversarial_data/insert_%s_natural_sentence_labeled.pkl" %
                token, 'rb'))
        original_int_arrs = generated_sentences['original_int_arr'][:top_num]
        revised_int_arrs = generated_sentences['generated_int_arr'][:top_num]
        tweet_ids = generated_sentences['tweet_id'][:top_num]

        all_tweets = self.dl.all_data()
        original_tweets = []
        generated_tweets = []
        tweetid2tweetidx = {}
        for idx in range(len(all_tweets)):
            tweetid2tweetidx[all_tweets[idx]['tweet_id']] = idx
        for idx in range(len(original_int_arrs)):
            tweet = all_tweets[tweetid2tweetidx[tweet_ids[idx]]]
            original_tweets.append(tweet)
            generated_tweet = deepcopy(tweet)
            assert np.all(generated_tweet['word_padded_int_arr'] ==
                          original_int_arrs[idx])
            generated_tweet['word_padded_int_arr'] = revised_int_arrs[idx]
            generated_tweet['word_int_arr'] = trim(
                generated_tweet['word_padded_int_arr'])
            generated_tweets.append(generated_tweet)

        generated_elmo_dir = None
        original_elmo_dir = None
        if model_id in (3, 4, 6, 7):  # DS ELMo
            generated_elmo_dir = "../adversarial_data/DS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/DS_ELMo_rep"
        if model_id == 5:  # NonDS ELMo
            generated_elmo_dir = "../adversarial_data/NonDS_ELMo_adversarial_insert_%s" % token
            original_elmo_dir = "../data/NonDS_ELMo_rep"

        load_model_tweet_dicts(model_id, generated_tweets,
                               elmo_dir=generated_elmo_dir)
        generated_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))
        load_model_tweet_dicts(model_id, original_tweets,
                               elmo_dir=original_elmo_dir)
        original_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        model = load_model(model_id, run_idx, fold_idx, class_idx)
        original_predictions = model.predict(original_tweet_X)
        generated_predictions = model.predict(generated_tweet_X)

        assert mode in ['score', 'binary']
        if mode == 'score':
            # Analyze change in numerical prediction scores.
            return original_predictions, generated_predictions
        else:
            # Analyze label flipping.
            threshold = get_model_info(
                num_runs=5, num_folds=5,
                num_models=model_id)['thresholds'][(
                    model_id, run_idx)][class_idx][fold_idx]
            original_pred_labels = [
                1 if x >= threshold else 0 for x in original_predictions
            ]
            generated_pred_labels = [
                1 if x >= threshold else 0 for x in generated_predictions
            ]
            new_positive_tweet_ids = []
            new_negative_tweet_ids = []
            for idx in range(len(original_predictions)):
                if original_pred_labels[idx] == 0 and generated_pred_labels[idx] == 1:
                    new_positive_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
                if original_pred_labels[idx] == 1 and generated_pred_labels[idx] == 0:
                    new_negative_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
            return len(new_positive_tweet_ids)

    def evaluate_all_models(self, token, class_idx):
        results = {}
        for model_id in [1, 2, 18, 19]:
            flipped_counts = []
            for fold_idx in range(5):
                counts = []
                for run_idx in range(5):
                    counts.append(
                        self.evaluate_model_prediction(token, model_id,
                                                       run_idx, fold_idx,
                                                       class_idx))
                flipped_counts.append(sum(counts) / len(counts))
            results[model_id] = sum(flipped_counts) / len(flipped_counts)

        pkl.dump(
            results,
            open(
                "../adversarial_data/insert_%s_model_stats_labeled_121819.pkl"
                % token, 'wb'))

        analysis_dict = {}
        analysis_dict['model_id'] = sorted([x for x in results.keys()])
        analysis_dict['num_flipped_adversarials'] = [
            results[x] for x in analysis_dict['model_id']
        ]
        pd.DataFrame.from_dict(analysis_dict).to_csv(
            "../showable/adversarial_%s_stats_labeled.csv" % token,
            index=False)
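# Usage sketch for the adversarial pipeline above. The token 'a' matches the file names
# referenced elsewhere in this module, but both the token and the class index are
# illustrative choices, not fixed by the code.
def _adversarial_pipeline_example():
    gen = Adversarial_generator(dataset='labeled')
    # Rank candidate insertions of the token by how much they raise ELMo log-probability.
    gen.generate_natural_tweets('insert', 'a')
    # Average label-flip counts across runs and folds for the models listed in evaluate_all_models.
    gen.evaluate_all_models('a', class_idx=0)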
class LIME:
    """Perturbation-based (LIME-style) analysis of unigram influence on model predictions,
    applied to the tweets the model predicts as positive."""

    def __init__(self, model_predict, model_threshold, output_dir,
                 input_format, tweet_records, truth_label, pad_elmo=False,
                 unigram_observe_ids=None):
        # model_predict is a function that takes X and evaluates the score, abstracted to keep
        # LIME decoupled from model architecture, input format and use of context features.
        self.dl = Data_loader(labeled_only=True, option='both')
        self.model_predict = model_predict
        self.model_threshold = model_threshold
        self.output_dir = output_dir
        self.input_format = input_format
        self.pad_elmo = pad_elmo
        self.unigram_observe_ids = unigram_observe_ids

        self.tweet_records, self.truth_label = tweet_records, truth_label
        self.scores = self.model_predict(self.tweet_records).flatten()
        self.label_prediction = [
            1 if self.scores[idx] >= self.model_threshold else 0
            for idx in range(len(self.scores))
        ]
        # Only tweets predicted positive are analyzed.
        idx_considered = [
            idx for idx in range(len(self.label_prediction))
            if self.label_prediction[idx] == 1
        ]
        self.tweet_id_considered = [
            self.tweet_records['tweet_id'][idx] for idx in idx_considered
        ]
        included_tweet_records = {}
        for key in self.tweet_records.keys():
            if key == 'word_content_input_elmo' and pad_elmo is False:
                included_tweet_records[key] = [
                    self.tweet_records[key][idx] for idx in idx_considered
                ]
            else:
                included_tweet_records[key] = np.array(
                    [self.tweet_records[key][idx] for idx in idx_considered])
        self.tweet_records = included_tweet_records
        self.scores = np.array([self.scores[idx] for idx in idx_considered])

    # tweet_dict is a map from keys to numpy arrays;
    # one of the keys is "tweet_id" s.t. it can be mapped back to the original tweet id.
    def create_perturbation_samples(self, tweet_dict, elmo_masked_idx=None):
        # perturbed_tests maps each input key to the list of perturbed samples.
        perturbed_tests = dict([(key, []) for key in tweet_dict])
        p_test_idx = 0
        # (tweet_idx, word_idx, 'uni'/'bi') mapped to index in the test batch
        tweet_idx_word_idx2idx, idx2sent_length = {}, {}
        for idx in range(len(tweet_dict['tweet_id'])):
            content_input = tweet_dict['word_content_input'][idx]
            sentence_length = sum([1 if w != 0 else 0 for w in content_input])
            idx2sent_length[idx] = sentence_length

            # Mask each unigram.
            for word_idx in range(sentence_length):
                tweet_idx_word_idx2idx[(idx, word_idx, 'uni')] = p_test_idx
                p_test_idx += 1

                # Prepare the corresponding input for each key.
                for key in perturbed_tests:
                    if key != 'word_content_input' and key != 'word_content_input_elmo':
                        perturbed_tests[key].append(tweet_dict[key][idx])
                    elif key == 'word_content_input':
                        perturbed_content_input = np.array(
                            tweet_dict[key][idx])
                        perturbed_content_input[word_idx] = 1
                        perturbed_tests[key].append(perturbed_content_input)
                    else:  # key == 'word_content_input_elmo'
                        if elmo_masked_idx is None:
                            masked_idx = word_idx
                        else:
                            if word_idx == elmo_masked_idx[idx]:
                                masked_idx = word_idx
                            else:
                                masked_idx = tuple(
                                    sorted((word_idx, elmo_masked_idx[idx])))
                        tweet_id = tweet_dict['tweet_id'][idx]
                        data = pkl.load(
                            open("../data/DS_ELMo_rep_all/%d.pkl" % tweet_id,
                                 'rb'))
                        elmo_masked = data[masked_idx]
                        if self.pad_elmo:
                            # The CNN needs padding to max_len to keep the shape of all inputs the same.
                            elmo_masked = pad_elmo_representation(elmo_masked)
                        perturbed_tests[key].append(elmo_masked)

        for key in perturbed_tests:
            if key != 'word_content_input_elmo' or self.pad_elmo is True:
                perturbed_tests[key] = np.array(perturbed_tests[key])
        return tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length

    def analyze_perturbations_influence(self, tweet_idx_word_idx2idx,
                                        perturbed_tests, idx2sent_length,
                                        round, observe_word_position_idx=None,
                                        observe_word_ids=None):
        """
        For the first round, if observe_word_ids is not None, keep track of the rank of each
        unigram specified by observe_word_ids (for example 9 corresponds with "a") in the
        sorted order of LIME influence from most influential to least influential.
        For the second round, if observe_word_position_idx is not None, keep track of the rank
        of the word at the tweet position specified by observe_word_position_idx in the sorted
        order of LIME influence, as a consistency check.
        """
        if round == 1:
            self.scores = self.model_predict(self.tweet_records).flatten()
            first_round_unigram_ranking = {}
            if observe_word_ids is not None:
                for observe_word_id in observe_word_ids:
                    first_round_unigram_ranking[observe_word_id] = []
        elif round == 2:
            self.scores = self.model_predict(
                self.masked_tweet_records).flatten()
            second_round_ranking = []

        perturbed_preds = self.model_predict(perturbed_tests).flatten()

        idx2max_min_wordidx = {}
        max_influences = []
        all_influences = []
        for idx in range(len(idx2sent_length)):
            # Unigram influence analysis: influence = score drop when the unigram is masked.
            influences = []
            for word_idx in range(idx2sent_length[idx]):
                p_test_idx = tweet_idx_word_idx2idx[(idx, word_idx, 'uni')]
                influence = self.scores[idx] - perturbed_preds[p_test_idx]
                influences.append(influence)
            influences = np.array(influences)
            all_influences.append(influences)

            if round == 1 and observe_word_ids is not None:
                tweet_int_arr = self.tweet_records['word_content_input'][idx]
                arg_sort = np.argsort(influences)[::-1]
                for observe_word_id in observe_word_ids:
                    unigram_in_tweet = False
                    for i in range(len(arg_sort)):
                        if tweet_int_arr[arg_sort[i]] == observe_word_id:
                            first_round_unigram_ranking[
                                observe_word_id].append(i)
                            unigram_in_tweet = True
                            break
                    if unigram_in_tweet is False:
                        first_round_unigram_ranking[observe_word_id].append(-1)

            if round == 2:
                arg_sort = np.argsort(influences)[::-1]
                assert observe_word_position_idx[idx] in arg_sort
                for rank_idx in range(idx2sent_length[idx]):
                    if arg_sort[rank_idx] == observe_word_position_idx[idx]:
                        second_round_ranking.append(rank_idx)

            max_influence_word_idx = np.argmax(influences)
            min_influence_word_idx = np.argmin(np.abs(influences))
            max_influences.append(max(influences))
            idx2max_min_wordidx[idx] = (idx2sent_length[idx],
                                        max_influence_word_idx,
                                        min_influence_word_idx)

        if round == 1:
            return (idx2max_min_wordidx, first_round_unigram_ranking,
                    max_influences, all_influences)
        elif round == 2:
            return (idx2max_min_wordidx, second_round_ranking,
                    max_influences, all_influences)

    def lime(self):
        # Round 1: perturb the original (positively predicted) tweets.
        tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = \
            self.create_perturbation_samples(self.tweet_records)
        (idx2max_min_wordidx, first_round_unigram_ranking,
         first_round_max_influences, first_round_all_influences) = \
            self.analyze_perturbations_influence(
                tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length,
                round=1, observe_word_ids=self.unigram_observe_ids)

        # Round 2: mask the least influential unigram of each tweet and repeat the analysis,
        # checking that the most influential unigram stays near the top (consistency check).
        self.masked_tweet_records = {}
        for key in self.tweet_records.keys():
            if key != 'word_content_input_elmo' or self.pad_elmo is True:
                self.masked_tweet_records[key] = np.array(
                    self.tweet_records[key])
            else:
                self.masked_tweet_records[key] = self.tweet_records[key]
        for idx in range(len(idx2max_min_wordidx)):
            # Mask the most insignificant unigram.
            self.masked_tweet_records['word_content_input'][idx][
                idx2max_min_wordidx[idx][2]] = 1

        if self.input_format == 'discrete':
            tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = \
                self.create_perturbation_samples(self.masked_tweet_records)
        else:
            elmo_masked_wordidx = [
                idx2max_min_wordidx[idx][2]
                for idx in range(len(idx2max_min_wordidx))
            ]
            tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length = \
                self.create_perturbation_samples(
                    self.masked_tweet_records,
                    elmo_masked_idx=elmo_masked_wordidx)

        observe_word_idx = {}
        for idx in range(len(idx2max_min_wordidx)):
            observe_word_idx[idx] = idx2max_min_wordidx[idx][1]
        (second_round_idx2max_min_wordidx, second_round_ranking,
         second_round_max_influences, second_round_all_influences) = \
            self.analyze_perturbations_influence(
                tweet_idx_word_idx2idx, perturbed_tests, idx2sent_length,
                round=2, observe_word_position_idx=observe_word_idx)

        data = {}
        data['original tweet'] = [
            self.dl.convert2unicode(trim(arr))
            for arr in self.tweet_records['word_content_input']
        ]
        data['masked tweet'] = [
            self.dl.convert2unicode(trim(arr))
            for arr in self.masked_tweet_records['word_content_input']
        ]
        data['first round influences'] = first_round_all_influences
        data['first round max influential unigram'] = [
            self.dl.convert2unicode([
                self.tweet_records['word_content_input'][idx][
                    idx2max_min_wordidx[idx][1]]
            ]) for idx in range(len(idx2sent_length))
        ]
        data['first round most insignificant unigram'] = [
            self.dl.convert2unicode([
                self.tweet_records['word_content_input'][idx][
                    idx2max_min_wordidx[idx][2]]
            ]) for idx in range(len(idx2sent_length))
        ]
        data['first round max influence'] = first_round_max_influences
        data['second round influences'] = second_round_all_influences
        data['second round most influential unigram'] = [
            self.dl.convert2unicode([
                self.tweet_records['word_content_input'][idx][
                    second_round_idx2max_min_wordidx[idx][1]]
            ]) for idx in range(len(idx2sent_length))
        ]
        data['second round max influence'] = second_round_max_influences
        data['first round max influential unigram ranking in second round'] = \
            second_round_ranking
        if self.unigram_observe_ids is not None:
            for unigram_id in self.unigram_observe_ids:
                data['first round unigram %s ranking' %
                     id2word[unigram_id]] = first_round_unigram_ranking[
                         unigram_id]
        pd.DataFrame.from_dict(data).to_csv(self.output_dir, index=False)

        second_round_rank_stats = defaultdict(int)
        for num in second_round_ranking:
            second_round_rank_stats[num] += 1

        # first_round_unigram_ranking uses -1 to indicate that the specified unigram is not in
        # the tweet; filter out these -1 rankings so the stats cover only tweets containing it.
        first_round_rank_stats = {}
        if self.unigram_observe_ids is not None:
            first_round_unigram_ranking_included = {}
            for unigram_id in self.unigram_observe_ids:
                first_round_unigram_ranking_included[unigram_id] = []
                for i in first_round_unigram_ranking[unigram_id]:
                    if i != -1:
                        first_round_unigram_ranking_included[
                            unigram_id].append(i)
            for unigram_id in self.unigram_observe_ids:
                stats = defaultdict(int)
                for num in first_round_unigram_ranking_included[unigram_id]:
                    stats[num] += 1
                first_round_rank_stats[unigram_id] = stats

        return {
            'unigram_rank_stats': first_round_rank_stats,
            'lime_consistency_stats': second_round_rank_stats,
            'first_round_all_influences': first_round_all_influences,
            'correspondence': self.tweet_id_considered
        }
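# Wiring sketch for the LIME class above: model_predict only needs to be a callable that
# maps the tweet-record dict to scores, so a thin wrapper around a trained model's
# predict works. The model, threshold, tweet_records and truth_label arguments are
# placeholders supplied by the caller; unigram_observe_ids=[9] uses the example from the
# docstring above (9 corresponds with "a"), and the output path is illustrative.
def _lime_example(model, threshold, tweet_records, truth_label):
    lime_runner = LIME(model_predict=lambda X: model.predict(X),
                       model_threshold=threshold,
                       output_dir='../showable/lime_example.csv',
                       input_format='discrete',
                       tweet_records=tweet_records,
                       truth_label=truth_label,
                       pad_elmo=False,
                       unigram_observe_ids=[9])
    # Returns unigram rank stats, consistency stats, per-tweet influences and tweet ids.
    return lime_runner.lime()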