def save(self, *args, **kwargs):
    """Normalize every free-text address field, then delegate to the parent save."""
    for field_name in ("state", "city", "neighborhood", "location", "zipcode"):
        cleaned = normalize_text(getattr(self, field_name))
        setattr(self, field_name, cleaned)
    super(Address, self).save(*args, **kwargs)
def parse(klass, row, carrier, areacode, phone_type):
    """Parse one raw data row into a `klass` record.

    Builds a Person, Address and Phone from fixed row positions, trying the
    document first as a CPF (natural person) and falling back to a CNPJ
    (legal entity) when CPF validation fails.

    Raises whatever the validate_* helpers raise on malformed zipcode,
    phone number, or a document that is neither a valid CPF nor CNPJ.
    """
    mutable_row = list(row)  # detach from the caller's (possibly immutable) row

    who = Person()
    who.name = normalize_text(mutable_row[2])

    where = Address()
    where.location = normalize_text(mutable_row[3])
    where.neighborhood = normalize_text(mutable_row[4])
    where.city = normalize_text(mutable_row[5])
    where.state = normalize_text(mutable_row[6])

    contact = Phone()
    contact.carrier = carrier
    contact.areacode = areacode
    contact.type = phone_type

    raw_document = mutable_row[8]
    try:
        # document - try CPF (last 11 digits)
        who.document = validate_cpf(raw_document[-11:])
        who.nature = Person.NATURE_CHOICES_PHYSICAL[0]
    except CPFValidationError:
        # document - try CNPJ (last 14 digits)
        who.document = validate_cnpj(raw_document[-14:])
        who.nature = Person.NATURE_CHOICES_LEGAL[0]

    where.zipcode = validate_zipcode(mutable_row[7])
    contact.number = validate_phone_number(mutable_row[1])
    return klass(mutable_row, who, where, contact)
def create_plot_record(embeddings, plot, movie_id):
    """Encode the sentences of one movie plot as word-vector lists.

    Skips empty/blank-first-token sentences; within a kept sentence only the
    first P_MAX_WORD_PER_SENT_COUNT words are encoded, and encoding stops
    contributing words once P_MAX_SENT_COUNT sentences have been kept (later
    sentences are still appended, but empty — matching the original behavior).
    `movie_id` is accepted for interface compatibility but not used here.

    Returns a list of sentences, each a list of word vectors.
    """
    encoded_sentences = []
    sentence_lengths = []  # kept for parity with the original; not returned
    kept_count = 0
    for raw_sentence in plot:
        tokens = util.normalize_text(raw_sentence)
        if not tokens or tokens[0] == '':
            continue
        kept_count += 1
        current_sentence = []
        used_words = 0
        for position, token in enumerate(tokens):
            if (position < data_conf.P_MAX_WORD_PER_SENT_COUNT
                    and kept_count < data_conf.P_MAX_SENT_COUNT):
                vector = util.get_word_vector(embeddings, token,
                                              data_conf.EMBEDDING_SIZE)
                current_sentence.append(vector)
                used_words += 1
        sentence_lengths.append(used_words)
        encoded_sentences.append(current_sentence)
    return encoded_sentences
def run_creation(model_type, attack, model_folder, examples_folder, instances_to_attack):
    """Greedy "addAny"-style adversarial distractor-sentence attack on a MovieQA model.

    Builds a d-word distractor sentence per question and greedily replaces one
    word position at a time with candidates drawn from a pool (random common
    words, plus question and/or wrong-answer words depending on `attack`),
    keeping any candidate that lowers the model's probability for the correct
    answer. Progress is checkpointed to numbered text files in
    `examples_folder` so an interrupted run can resume.

    Args:
        model_type: "lstm" selects movieqa.run_lstm as the runner; anything
            else selects movieqa.run_cnn.
        attack: pool policy — 'addQ'/'addQA' add question words,
            'addA'/'addQA' add wrong-answer words; otherwise common words only.
        model_folder: checkpoint directory of the model under attack.
        examples_folder: output/resume directory for generated sentences.
        instances_to_attack: folder holding val.pickle and *.tfrecords inputs.
    """
    print("store created examples in %s" % examples_folder)
    # Choose the concrete runner module lazily, then point it at the
    # checkpoint directory we want to attack.
    if model_type == "lstm":
        import movieqa.run_lstm as runner
    else:
        import movieqa.run_cnn as runner
    runner.data_conf.TRAIN_DIR = model_folder
    load = False          # True when resuming from an earlier partial run
    check_sents = []      # distractor sentences read back from the checkpoint
    check_found = []      # per-question "already fooled" flags from checkpoint
    check_num = 0         # number of the next checkpoint file to write
    corr_probs = []       # last recorded correct-answer probabilities
    if not tf.gfile.Exists(examples_folder):
        tf.gfile.MakeDirs(examples_folder)
    else:
        # Resume: pick the newest "<n>.txt" checkpoint file.
        checkpoints = glob.glob(examples_folder + "/[!accuracies]*")
        checkpoints = sorted(checkpoints, reverse=True)
        latest = checkpoints[0]
        splitted = latest.split(".txt")[0]
        # NOTE(review): only the final character is parsed as the checkpoint
        # number, so resuming misbehaves once numbers reach two digits — confirm.
        check_num = int(splitted[len(splitted) - 1]) + 1
        # NOTE(review): this handle is never closed.
        check = open(latest, encoding="utf8")
        for line in check:
            # Checkpoint line format: "<sentence>\t<probability>\t<found>"
            parts = line.replace('\n', '').split("\t")
            check_words = parts[0].split(" ")
            check_sents.append(check_words)
            last_prob = float(parts[1])
            found = parts[2]
            if found == 'True':
                b_found = True
            else:
                b_found = False
            corr_probs.append(last_prob)
            check_found.append(b_found)
        load = True
    emb_dir = runner.data_conf.EMBEDDING_DIR
    vectors, vocab = util.load_embeddings(emb_dir)
    # vocab maps index -> word; invert for word -> index lookups.
    rev_vocab = dict(zip(vocab.values(), vocab.keys()))
    # print(rev_vocab)
    filename = "adversarial_addAny/common_english.txt"
    # length of the distractor sentence
    d = 10
    # pool size of common words to sample from for each word in the distractor sentence
    poolsize = 10
    common_words = {}
    # NOTE(review): this handle is never closed.
    fin = open(filename, encoding="utf8")
    for line in fin:
        word = line.replace('\n', '')
        # print(word)
        if word in rev_vocab:
            common_words[word] = rev_vocab[word]
        else:
            print(
                'ERROR: word "%s" not in vocab. Run add_common_words_to_vocab.py first.'
                % word)
            exit(1)
    with open(instances_to_attack + '/val.pickle', 'rb') as handle:
        qa = pickle.load(handle)
    w_s = []        # per-question current distractor word indices
    w_choices = []  # per-question shuffled order of positions to optimize
    w_found = []    # success flags (see NOTE below about its indexing)
    q_inds = []     # NOTE(review): never used after initialization
    pools = []      # per-question, per-position candidate pools
    with open(examples_folder + "/" + str(0 + check_num) + ".txt", "a") as file:
        for k, question in enumerate(qa):
            # load question indices
            q_words = util.normalize_text(question.question)
            q_ind = []
            for word in q_words:
                q_ind.append(rev_vocab[word])
            # Collect words from all *wrong* answers (used by addA/addQA pools).
            a_words = []
            for i, answer in enumerate(question.answers):
                if not i == int(question.correct_index):
                    words = util.normalize_text(answer)
                    a_words.extend(words)
            # Initialize the d-word distractor: from the checkpoint when
            # resuming, otherwise from random common words.
            w = []
            w_choice = []
            rand_sent = ""
            for i in range(0, d):
                if load:
                    c_word = check_sents[k][i]
                    w_index = rev_vocab[c_word]
                    rand_sent += (c_word + " ")
                else:
                    w_index = random.choice(list(common_words.values()))
                    rand_sent += (vocab[w_index] + " ")
                    # NOTE(review): appending here gives w_found d+1 entries per
                    # question in fresh runs, yet it is indexed as w_found[k]
                    # later — looks inconsistent; confirm against original intent.
                    w_found.append(False)
                w.append(w_index)
                w_choice.append(i)
            if load:
                found = check_found[k]
                w_found.append(found)
                # file.write(rand_sent+"\t"+str(corr_probs[k])+"\t"+str(found)+"\n")
            else:
                found = False
                w_found.append(found)
                file.write(rand_sent + "\t" + "1.0" + "\t" + str(found) + "\n")
            shuffle(w_choice)
            w_choices.append(w_choice)
            w_s.append(w)
            # Build one candidate pool per distractor position.
            d_pools = []
            for j, dj in enumerate(w):
                pool = []
                random_common_words = np.random.choice(
                    list(common_words.values()), poolsize, replace=False)
                print("Adding common words")
                pool.extend(random_common_words)
                if attack == 'addQ' or attack == "addQA":
                    print("Adding question words")
                    for word in q_words:
                        pool.append(rev_vocab[word])
                if attack == "addA" or attack == "addQA":
                    print("Adding answer words")
                    for word in a_words:
                        pool.append(rev_vocab[word])
                shuffle(pool)
                d_pools.append(pool)
            pools.append(d_pools)
    # Build the evaluation graph; every record repeats poolsize * d times so
    # each greedy (word, pool-candidate) step sees the same question once.
    filepath = instances_to_attack + "/*.tfrecords"
    filenames = glob.glob(filepath)
    global_step = tf.contrib.framework.get_or_create_global_step()
    dataset = tf.contrib.data.TFRecordDataset(filenames)
    dataset = dataset.map(runner.get_single_sample)
    dataset = dataset.repeat(poolsize * d)
    batch_size = 1
    dataset = dataset.padded_batch(
        batch_size, padded_shapes=([None], [5, None], [None], (), [None, None], ()))
    iterator = dataset.make_one_shot_iterator()
    next_q, next_a, next_l, next_plot_ids, next_plots, next_q_types = iterator.get_next()
    # Placeholder carrying the candidate distractor sentence (word indices);
    # a py_func splices it into the plot tensor before prediction.
    add_sent = tf.placeholder(tf.int64, shape=[None])
    # sent_exp = tf.expand_dims(add_sent,0)
    m_p = tf.py_func(add_plot_sentence, [next_plots, add_sent], [tf.int64])[0]
    # m_p = next_plots
    # m_p = tf.concat([next_plots,sent_exp],axis=0)
    logits, atts, sent_atts, _ = runner.predict_batch([next_q, next_a, m_p],
                                                      training=False)
    probabs = model.compute_probabilities(logits=logits)
    accuracy_example = tf.reduce_mean(
        model.compute_accuracies(logits=logits, labels=next_l, dim=1))
    # Restore everything except the embedding table (fed separately below).
    to_restore = tf.contrib.slim.get_variables_to_restore(exclude=["embeddings"])
    saver = tf.train.Saver(to_restore)
    p_counts = 0   # attention plots rendered so far (capped at 20)
    last_p = ''
    p_id = 0
    f_counter = 0  # running count of successfully fooled questions
    with tf.Session() as sess:
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        ckpt = tf.train.get_checkpoint_state(runner.data_conf.TRAIN_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found')
        _ = sess.run(runner.set_embeddings_op, feed_dict={runner.place: vectors})
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # accs[k] tracks the lowest correct-answer probability found so far.
        if not load:
            accs = np.ones(shape=(len(qa)))
        else:
            accs = corr_probs
        for w_counter in range(0, d):
            words = np.zeros(shape=(len(qa)), dtype=np.int64)
            # select next word to optimize greedily
            next_inds = []
            for k, question in enumerate(qa):
                next_word = w_choices[k].pop()
                next_inds.append(next_word)
                words[k] = w_s[k][next_word]
            # go through whole pool for every question
            next_ind = 0
            for pool_counter in range(0, poolsize):
                total_acc = 0.0
                info = ""
                for k, question in enumerate(qa):
                    # Try the next pool candidate at this question's position.
                    w_copy = [x for x in w_s[k]]
                    print("==============")
                    next_ind = next_inds[k]
                    pool = pools[k][next_ind]
                    pool_ind = pool.pop()
                    print("setting " + str(w_s[k][next_ind]) + " to " +
                          str(pool_ind))
                    w_copy[next_ind] = pool_ind
                    info = "wordcounter: " + str(
                        w_counter) + " - poolcounter: " + str(
                            pool_counter) + " - question: " + str(k)
                    print(info)
                    acc_val, probs_val, gs_val, q_type_val, q_val, atts_val, sent_atts_val, labels_val, p_val, a_val, p_id_val = sess.run(
                        [
                            accuracy_example, probabs, global_step,
                            next_q_types, next_q, atts, sent_atts, next_l,
                            m_p, next_a, next_plot_ids
                        ],
                        feed_dict={add_sent: w_copy})
                    sent = ""
                    for word in w_copy:
                        sent += (" " + vocab[word])
                    print(sent + " - acc: " + str(acc_val))
                    # Probability the model assigns to the correct answer.
                    corr = np.argmax(labels_val[0])
                    pred_val = probs_val[0][corr]
                    if (pred_val < accs[k]):
                        # Candidate lowers the correct-answer probability: keep it.
                        word_s = vocab[words[k]]
                        pool_s = vocab[pool_ind]
                        print(pool_s + " (" + str(pred_val) + ") < " + word_s +
                              " (" + str(accs[k]) + ")")
                        words[k] = pool_ind
                        accs[k] = pred_val
                        if acc_val == 0:
                            # Model now answers incorrectly: attack succeeded.
                            print("setting" + str(k) + " to true with acc" +
                                  str(acc_val) + " and pred " + str(pred_val))
                            w_found[k] = True
                            f_counter += 1
                    # Derive a plot-file name from the question words.
                    filename = ''
                    q_s = ''
                    for index in q_val[0]:
                        word = (vocab[index])
                        q_s += (word + ' ')
                        filename += (word + '_')
                    predicted_probabilities = probs_val[0]
                    labels = labels_val[0]
                    p_id = 'test'
                    path = runner.data_conf.EVAL_DIR + "/plots/" + p_id + "/" + filename
                    # Render attention plots only for the first 20 evaluations.
                    if (p_counts < 20):
                        for i, a_att in enumerate(atts_val[0]):
                            # a_att = np.max(a_att, 1)
                            qa_s = q_s + "? (acc: " + str(acc_val) + ")\n "
                            for index in a_val[0][i]:
                                qa_s += (vocab[index] + ' ')
                            lv = " (label: " + str(int(
                                labels[i])) + " - prediction: " + (str(
                                    "%.2f" % (predicted_probabilities[i] * 100))) + "%)"
                            qa_s += lv
                            a_sents = []
                            y_labels = []
                            for j, att in enumerate(a_att):
                                a_s = []
                                y_labels.append(
                                    str("%.2f" %
                                        (sent_atts_val[0][i][j] * 100)) + "%")
                                for index in p_val[0][j]:
                                    a_s.append(vocab[index])
                                a_sents.append(a_s)
                            util.plot_attention(np.array(a_att),
                                                np.array(a_sents), qa_s,
                                                y_labels, path, filename)
                        last_p = p_id
                        p_counts += 1
                    total_acc += acc_val
                    print(total_acc / (k + 1))
                with open(examples_folder + "/accuracies.txt", "a") as file:
                    file.write(info + " - " + str(total_acc / (len(qa))) + "\n")
            # Persist the best sentences after finishing this word position.
            # NOTE(review): `next_ind` here holds the position of the *last*
            # question's optimized word, not this question's (next_inds[k]) —
            # looks like a bug; confirm against the original.
            with open(
                    examples_folder + "/" + str(w_counter + check_num + 1) +
                    ".txt", "a") as file:
                for k, question in enumerate(qa):
                    w_s[k][next_ind] = words[k]
                    sent = ""
                    for word in w_s[k]:
                        sent += (vocab[word] + " ")
                    file.write(sent + "\t" + str(accs[k]) + "\t" +
                               str(w_found[k]) + "\n")
def create_movieqa_data(qa_json_file, name, outfolder, embeddings, qa_ids=None):
    """Convert one MovieQA QA split into a TFRecord file of SequenceExamples.

    For every question this writes question/answer/plot word-encoding
    sequences plus label, question-type, question-size and movie-id context
    features to `<outfolder>/<name>.tfrecords`, and pickles the (optionally
    id-filtered) question list to `<outfolder>/val.pickle`.

    Args:
        qa_json_file: path to the MovieQA qa.json file to preprocess.
        name: split name; 'test' gets an all-zero label vector.
        outfolder: destination folder for the .tfrecords and val.pickle.
        embeddings: embedding lookup handed to util.get_word_vector.
        qa_ids: optional iterable of question ids; when given, only those
            questions are converted.
    """
    valid_count = 0
    # Point the MovieQA loader at the requested qa.json before constructing it.
    movie.cfg.QA_JSON = qa_json_file
    print("Preprocessing qa file and creating records for %s" %
          movie.cfg.QA_JSON)
    mqa = movie.DataLoader()
    story, qa = mqa.get_story_qa_data(name, 'split_plot')
    set_path = outfolder + "/" + name + ".tfrecords"
    writer = tf.python_io.TFRecordWriter(set_path)
    # filter questions by ids
    if qa_ids:
        qa = filter_qa(qa, qa_ids)
        print("Selected %d questions based on %d provided ids" %
              (len(qa), len(qa_ids)))
    with open(os.path.join(outfolder, 'val.pickle'), 'wb') as handle:
        pickle.dump(qa, handle)
    for k, question in enumerate(qa):
        q = []    # encoded question words
        ans = []  # one encoded word list per candidate answer
        # One-hot label vector over the 5 candidate answers.
        l = np.zeros(shape=[5], dtype=float)
        # NOTE(review): `ex` only ever receives the question_type context
        # feature and is never serialized — the record written below is built
        # from a fresh SequenceExample. Looks like dead state; confirm.
        ex = tf.train.SequenceExample()
        words = util.normalize_text(question.question)
        # lowercase now
        words = [word.lower() for word in words]
        movie_id = question.imdb_key
        question_size = len(words)
        # The test split has no ground-truth answer index.
        if name != "test":
            l[question.correct_index] = 1.0
        # Map the leading question word ("who", "what", ...) to a type id,
        # -1 when unknown.
        if words[0] in util.question_types:
            question_type = util.question_types[words[0]]
        else:
            question_type = -1
        ex.context.feature["question_type"].int64_list.value.append(
            question_type)
        # Encode up to Q_MAX_WORD_PER_SENT_COUNT question words.
        for i, word in enumerate(words):
            if i < data_conf.Q_MAX_WORD_PER_SENT_COUNT:
                w_vec = (util.get_word_vector(embeddings, word,
                                              data_conf.EMBEDDING_SIZE))
                # NOTE(review): a falsy result triggers an *identical* second
                # lookup — a no-op retry; presumably some other fallback (e.g.
                # OOV handling) was intended. Confirm.
                if not w_vec:
                    w_vec = (util.get_word_vector(embeddings, word,
                                                  data_conf.EMBEDDING_SIZE))
                q.append(w_vec)
        # Encode each movie's plot only once; plot_dict is presumably a
        # module-level cache — verify.
        if not movie_id in plot_dict:
            plot = story.get(movie_id)
            p_word_ids = create_plot_record(embeddings, plot, movie_id)
            plot_dict[movie_id] = p_word_ids
        else:
            p_word_ids = plot_dict[movie_id]
        # Encode every candidate answer the same way as the question.
        for i, answer in enumerate(question.answers):
            a = []
            words = util.normalize_text(answer)
            for j, word in enumerate(words):
                if j < data_conf.Q_MAX_WORD_PER_SENT_COUNT:
                    w_vec = (util.get_word_vector(embeddings, word,
                                                  data_conf.EMBEDDING_SIZE))
                    # NOTE(review): same no-op retry as for question words.
                    if not w_vec:
                        w_vec = (util.get_word_vector(
                            embeddings, word, data_conf.EMBEDDING_SIZE))
                    a.append(w_vec)
            ans.append(a)
        # Context features: scalar metadata about the question.
        q_type_feature = tf.train.Feature(int64_list=tf.train.Int64List(
            value=[question_type]))
        q_size_feature = tf.train.Feature(int64_list=tf.train.Int64List(
            value=[question_size]))
        movie_id_feature = tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[str.encode(movie_id)]))
        # Sequence features: labels, answers, question and plot encodings.
        label_list_feature = [
            tf.train.Feature(float_list=tf.train.FloatList(value=[label]))
            for label in l
        ]
        answer_list_feature = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=aw))
            for aw in ans
        ]
        plot_list_feature = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=pl))
            for pl in p_word_ids
        ]
        question_list_feature = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=q))
        ]
        feature_list = {
            "labels": tf.train.FeatureList(feature=label_list_feature),
            "answers": tf.train.FeatureList(feature=answer_list_feature),
            "question": tf.train.FeatureList(feature=question_list_feature),
            "plot": tf.train.FeatureList(feature=plot_list_feature),
        }
        context = tf.train.Features(
            feature={
                "question_type": q_type_feature,
                "question_size": q_size_feature,
                "movie_id": movie_id_feature
            })
        feature_lists = tf.train.FeatureLists(feature_list=feature_list)
        example_sequence = tf.train.SequenceExample(
            feature_lists=feature_lists, context=context)
        serialized = example_sequence.SerializeToString()
        writer.write(serialized)
        valid_count += 1
    print(name + ' set completed - files written to ' + set_path)