def extract(self, source, paraphrase, position):
    s = set(tokenize(source))
    p = set(tokenize(paraphrase))
    # Compare only the words unique to each side; compute both differences
    # from the original sets (the original code reassigned `p` first, which
    # left the shared words inside `s`).
    p_only = p.difference(s)
    s_only = s.difference(p)
    return n_similarity(list(p_only), list(s_only))
def WordsFF(source, paraphrase, position):
    stokens = tokenize(source)
    ptokens = tokenize(paraphrase)
    word_num_diff = abs(len(stokens) - len(ptokens))
    letter_num_diff = len(source) - len(paraphrase)
    return [word_num_diff, letter_num_diff]
def SemanticSimilarityFF(source, paraphrase, position):
    sts_sim = sts.similarity(source, paraphrase)
    sent2vec = sentence2vec.similarity(source, paraphrase)
    word2vec = n_similarity(tokenize(source), tokenize(paraphrase))
    wm = wm_distance(tokenize(source), tokenize(paraphrase))
    if math.isinf(wm):
        wm = 10
    return [sts_sim, sent2vec, word2vec, wm]
def top_words():
    dictionary = Counter()
    for i, expression in enumerate(dataset):
        expr = set(tokenize(expression))
        for instance in dataset[expression]:
            paraphrase = instance[0]
            dictionary.update([
                t for t in tokenize(paraphrase)
                if t not in expr and t not in stopwords
            ])
    for token in dictionary.most_common(50):
        print(token)
def extract(self, source, paraphrase, position):
    p = tokenize(paraphrase)
    tense = 0
    for t, tag in pos_tag(p):
        if 'VB' in tag:
            tense = max(tense, TenseFF._pos_to_digit(tag))
    return tense
def TenseFF(source, paraphrase, position):
    """
    VB = 1, VBG = 2, VBN = 3, VBZ = 4, VBD = 5
    """
    def _pos_to_digit(pos):
        if pos == 'VB' or pos == 'VBP':
            return 1
        if pos == 'VBG':
            return 2
        if pos == 'VBN':
            return 3
        if pos == 'VBZ':
            return 4
        if pos == 'VBD':
            return 5
        return -1

    p = tokenize(paraphrase)
    tense = 0
    for t, tag in pos_tag(p):
        if 'VB' in tag:
            tense = max(tense, _pos_to_digit(tag))
    return tense
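# For reference, a minimal sketch (not from the original code) of how the VB*
# tag-to-code mapping above behaves on a tagged sentence. It assumes NLTK's
# word_tokenize/pos_tag in place of the project's own tokenize helper.
from nltk import pos_tag, word_tokenize  # needs the punkt + averaged_perceptron_tagger data

POS_TO_DIGIT = {'VB': 1, 'VBP': 1, 'VBG': 2, 'VBN': 3, 'VBZ': 4, 'VBD': 5}

def coarse_tense(sentence):
    # Highest-ranked verb form found in the sentence; 0 when there is no verb.
    tags = pos_tag(word_tokenize(sentence))
    return max((POS_TO_DIGIT.get(tag, -1) for _, tag in tags if tag.startswith('VB')),
               default=0)

print(coarse_tense("She had eaten"))  # had/VBD, eaten/VBN -> 5
print(coarse_tense("He is eating"))   # is/VBZ, eating/VBG -> 4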
def PronounFF(source, paraphrase, position):
    s = set(tokenize(source))
    p = set(tokenize(paraphrase))
    i = {'i', 'me', 'my', 'mine', 'myself'}
    you = {'you', 'yours', 'yourself'}
    he = {'he', 'his', 'him', 'himself'}
    she = {'she', 'her', 'herself'}
    we = {'we', 'us', 'our', 'ours', 'ourselves'}
    they = {'they', 'their', 'them', 'themselves'}
    # Presence checks: the original `> 1` required two pronouns from the same
    # group, so a sentence with a single pronoun was never detected.
    s_i, p_i = len(s & i) > 0, len(p & i) > 0
    s_you, p_you = len(s & you) > 0, len(p & you) > 0
    s_he, p_he = len(s & he) > 0, len(p & he) > 0
    s_she, p_she = len(s & she) > 0, len(p & she) > 0
    s_we, p_we = len(s & we) > 0, len(p & we) > 0
    s_they, p_they = len(s & they) > 0, len(p & they) > 0
    # True when a pronoun group's presence agrees between source and paraphrase.
    dangling_i = s_i == p_i
    dangling_you = s_you == p_you
    dangling_he = s_he == p_he
    dangling_she = s_she == p_she
    dangling_we = s_we == p_we
    dangling_they = s_they == p_they
    return [
        int(dangling_i),
        int(dangling_you),
        int(dangling_he or dangling_she or dangling_we or dangling_they),
    ]
def sentence_to_vec(sentence, max_length):
    from ParaVec import word2vec

    ret = np.zeros((max_length, 300))
    for i, t in enumerate(tokenize(sentence)):
        vec = word2vec.vector(t)
        if vec is None:
            vec = word2vec.vector("unknown")
        ret[i] = vec
        # `ret` has a fixed size, so stop on the row index; the original check
        # on len(ret) never fired and long sentences overran the array.
        if i == max_length - 1:
            break
    return ret
def plural_to_singular_edit(canonical, resources=None):
    """Corrects plural-noun grammatical errors."""
    for resource in resources or []:  # guard against the default None
        rname = resource.name
        if resource.resource_type != SINGLETON:
            continue
        canonical = canonical.replace(rname, "a {}".format(singular(rname)))
    canonical = canonical.replace(" the a ", " a ")
    canonical = canonical.replace(" a a ", " a ")
    canonical = canonical.replace(" an a ", " a ")
    canonical = canonical.replace(" a an ", " a ")
    canonical = LanguageChecker().grammar_corector(canonical, categories=['MISC']).lower()
    tokens = tokenize(canonical, normilize_text=False)
    ret = []
    seen_article = -20
    for token in tokens:
        if token in {"a", "an"}:
            seen_article = 0
        if 3 > seen_article > 0:
            if not is_singular(token):
                ret.append(singular(token))
                seen_article = 0
                continue
        ret.append(token)
        seen_article += 1
    # Second pass over the tokens, right to left.
    tokens = []
    prev = False
    for token in reversed(ret):
        if not is_singular(token):
            if prev:
                token = singular(token)
            prev = True
        tokens.append(token)
    ret = reversed(tokens)
    return " ".join(ret).replace("< <", "<<").replace("> >", ">>")
def process_product_name(
    product_names: Iterable[str],
    nlp,
    token_to_int: Dict,
    max_length: int,
    preprocessing_config: TextPreprocessingConfig,
) -> np.ndarray:
    tokens_all = [
        tokenize(
            preprocess_product_name(text, **dataclasses.asdict(preprocessing_config)),
            nlp,
        )
        for text in product_names
    ]
    tokens_int = [
        [token_to_int[t if t in token_to_int else UNK_TOKEN] for t in tokens]
        for tokens in tokens_all
    ]
    return pad_sequences(tokens_int, max_length)
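# A self-contained sketch (hypothetical vocabulary, whitespace tokenizer) of the
# integer-encoding and padding step above. Keras' pad_sequences left-pads with
# zeros by default; the toy padding below mimics that so the example carries no
# TensorFlow dependency.
import numpy as np

UNK_TOKEN = "<UNK>"
token_to_int = {"<UNK>": 1, "organic": 2, "apple": 3, "juice": 4}  # 0 reserved for padding

def encode_and_pad(texts, max_length):
    encoded = [
        [token_to_int.get(t, token_to_int[UNK_TOKEN]) for t in text.split()]
        for text in texts
    ]
    out = np.zeros((len(encoded), max_length), dtype=np.int32)
    for row, seq in enumerate(encoded):
        seq = seq[-max_length:]                  # truncate from the front
        out[row, max_length - len(seq):] = seq   # left-pad with zeros
    return out

print(encode_and_pad(["organic apple juice", "frozen apple"], max_length=4))
# [[0 2 3 4]
#  [0 0 1 3]]   "frozen" is out of vocabulary and maps to <UNK> (1)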
def top_ngrams(ngram):
    dictionary = Counter()
    for i, expression in enumerate(dataset):
        # expr = set(ngrams(tokenize(expression), ngram))
        for instance in dataset[expression]:
            paraphrase = instance[0]
            for t in ngrams(tokenize(paraphrase.lower()), ngram):
                phrase = " ".join(t)
                if phrase in expression.lower():
                    continue
                if isinstance(t, str):
                    tagged = pos_tag([t])
                else:
                    tagged = pos_tag(t)
                for tok, tag in tagged:
                    if "VB" in tag:
                        dictionary.update([phrase])
                        break
    for token in dictionary.most_common(100):
        print(token)
def extract(self, source, paraphrase, position):
    return len(tokenize(source)) - len(tokenize(paraphrase))
def extract(self, source, paraphrase, position):
    # Normalized Damerau-Levenshtein distance over the token sequences.
    return normalized_damerau_levenshtein_distance(tokenize(source), tokenize(paraphrase))
def _measure(text):
    # SMOG grade estimate, with the sentence count fixed at 1.
    words = tokenize(text)
    sylabs = len([w for w in words if len(syllables(w)) > 2])  # polysyllabic words
    return 1.0430 * math.sqrt(sylabs * 30 / 1) + 3.1291
def _measure(text):
    # Flesch Reading Ease variant for a single sentence. The standard formula is
    # 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words); here the last
    # term counts polysyllabic words and uses a slightly different coefficient.
    words = tokenize(text)
    sylabs = len([w for w in words if len(syllables(w)) > 2])
    words = len(words) + 1
    return 206.835 - 1.015 * words / 1 - 84.4 * sylabs / words
def _measure(text):
    # Gunning fog index for a single sentence:
    # 0.4 * (words/sentences + 100 * complex_words/words).
    words = tokenize(text)
    complex_words = [w for w in words if len(syllables(w)) >= 3]
    return 0.4 * (len(words) / 1 + 100 * len(complex_words) / (len(words) + 1))
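# For reference, a self-contained sketch of the standard readability formulas
# that the three single-sentence _measure variants above approximate (SMOG,
# Flesch Reading Ease, Gunning fog). Counts are supplied by hand here instead
# of coming from the project's syllables() helper.
import math

def smog(polysyllable_count, sentences):
    return 1.0430 * math.sqrt(polysyllable_count * 30 / sentences) + 3.1291

def flesch_reading_ease(words, sentences, total_syllables):
    return 206.835 - 1.015 * (words / sentences) - 84.6 * (total_syllables / words)

def gunning_fog(words, sentences, complex_words):
    return 0.4 * (words / sentences + 100 * complex_words / words)

# A 12-word sentence with 18 syllables, 2 of them polysyllabic (3+ syllables):
print(smog(2, 1))                      # ~11.2
print(flesch_reading_ease(12, 1, 18))  # ~67.8
print(gunning_fog(12, 1, 2))           # ~11.5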
def Entropy(source, paraphrase, position):
    s = set(tokenize(source))
    p = set(tokenize(paraphrase))
    word_diff_entropy = ent.shannon_entropy(" ".join(p.difference(s)))
    entropy = ent.shannon_entropy(paraphrase)
    return [entropy, word_diff_entropy]
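# A minimal stand-in (an assumption about the ent module, not its actual code)
# for what the shannon_entropy calls above measure: character-level Shannon
# entropy in bits.
import math
from collections import Counter

def shannon_entropy(text):
    if not text:
        return 0.0
    counts = Counter(text)
    total = len(text)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

print(shannon_entropy("aaaa"))  # 0.0 -- a single repeated symbol carries no uncertainty
print(shannon_entropy("abab"))  # 1.0 -- two equally likely symbols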
def extract(self, source, paraphrase, position):
    return n_similarity(tokenize(source), tokenize(paraphrase))
def main(args):
    """
    Save nx.Graph objects (Gss, Gts, ...) and the corresponding
    torch_geometric.data.PairData (via the clevr_parse embedder API).
    """
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        logger.info('Must give one of --input_vocab_json or --output_vocab_json')
        return
    graph_parser = clevr_parser.Parser(
        backend='spacy', model=args.parser_lm,
        has_spatial=True, has_matching=True).get_backend(identifier='spacy')
    embedder = clevr_parser.Embedder(
        backend='torch', parser=graph_parser).get_backend(identifier='torch')
    is_directed_graph = args.is_directed_graph  # Parse graphs as nx.MultiDiGraph

    out_dir, out_f_prefix = _get_out_dir_and_file_prefix(args)
    checkpoint_dir = f"{out_dir}/checkpoints"
    utils.mkdirs(checkpoint_dir)
    questions, img_scenes = get_questions_and_parsed_scenes(
        args.input_questions_json, args.input_parsed_img_scenes_json)

    if args.is_debug:
        set_default_level(10)
        questions = questions[:128]  # default BSZ is 64, ensuring enough for batch iter
        logger.debug(f"In DEBUG mode, sampling {len(questions)} questions only..")

    # Process vocab #
    vocab = _process_vocab(args, questions)

    # Encode all questions and programs
    logger.info('Encoding data')
    questions_encoded, programs_encoded, answers, image_idxs = [], [], [], []
    question_families = []
    orig_idxs = []

    # Graphs and embeddings #
    data_s_list = []  # List[torch_geometric.data.Data]
    data_t_list = []  # List[torch_geometric.data.Data]

    num_samples = 0  # Counter for keeping track of processed samples
    num_skipped = 0  # Counter for tracking number of samples skipped
    for orig_idx, q in enumerate(questions):
        # First see whether Gs and Gt can be extracted at all.
        # If not (e.g., edge cases like plurality), skip the data sample.
        img_idx = q['image_index']
        img_fn = q['image_filename']
        logger.debug(f"\tProcessing Image - {img_idx}: {img_fn} ...")
        # q_idx = q['question_index']
        # q_fam_idx = q['question_family_index']

        ## 1: Ensure both Gs and Gt are parseable for this question sample, o.w. skip
        img_scene = list(filter(lambda x: x['image_index'] == img_idx, img_scenes))[0]
        try:
            Gt, t_doc = graph_parser.get_doc_from_img_scene(
                img_scene, is_directed_graph=is_directed_graph)
            X_t, ei_t, e_attr_t = embedder.embed_t(
                img_idx, args.input_parsed_img_scenes_json)
        except AssertionError as ae:
            logger.warning(f"AssertionError Encountered: {ae}")
            logger.warning(f"[{img_fn}] Excluding images with > 10 objects")
            num_skipped += 1
            continue
        if Gt is None and ("SKIP" in t_doc):
            # If the derendering pipeline failed, just skip the scene;
            # don't process the labels (and text_scenes) for the image.
            print(f"Got None img_doc at image_index: {img_idx}")
            print(f"Skipping all text_scenes for image idx: {img_idx}")
            num_skipped += 1
            continue

        s = q['question']
        orig_idx = q['question_index']
        try:
            Gs, s_doc = graph_parser.parse(
                s, return_doc=True, is_directed_graph=is_directed_graph)
            X_s, ei_s, e_attr_s = embedder.embed_s(s)
        except ValueError as ve:
            logger.warning(f"ValueError Encountered: {ve}")
            logger.warning(f"Skipping question: {s} for {img_fn}")
            num_skipped += 1
            continue
        if Gs is None and ("SKIP" in s_doc):
            logger.warning("Got None as Gs and 'SKIP' in Gs_embd. "
                           "(likely plural with CLEVR_OBJS label)")
            logger.warning(f"SKIPPING processing {s} for {img_fn} and at {img_idx}")
            num_skipped += 1
            continue

        # Using ClevrData allows us a debug extension to Data
        data_s = ClevrData(x=X_s, edge_index=ei_s, edge_attr=e_attr_s)
        data_t = ClevrData(x=X_t, edge_index=ei_t, edge_attr=e_attr_t)
        data_s_list.append(data_s)
        data_t_list.append(data_t)

        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(img_idx)
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(
            question, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens, vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        has_prog_seq = 'program' in q
        if has_prog_seq:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            ans = q['answer']
            answers.append(vocab['answer_token_to_idx'][ans])

        num_samples += 1
        logger.info("-" * 50)
        logger.info(f"Samples processed count = {num_samples}")
        if has_prog_seq:
            logger.info(f"\n[{orig_idx}]: question: {question} \n"
                        f"\tprog_str: {program_str} \n"
                        f"\tanswer: {ans}")
        logger.info("-" * 50)

        # ---- CHECKPOINT ---- #
        if num_samples % args.checkpoint_every == 0:
            logger.info(f"Checkpointing at {num_samples}")
            checkpoint_fn_prefix = f"{out_f_prefix}_{num_samples}"
            _out_dir = f"{checkpoint_dir}/{out_f_prefix}_{num_samples}"
            utils.mkdirs(_out_dir)
            out_fpp = f"{_out_dir}/{checkpoint_fn_prefix}"
            # ------------ Checkpoint .H5 ------------ #
            logger.info(f"CHECKPOINT: Saving checkpoint files at directory: {out_fpp}")
            save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs,
                    orig_idxs, programs_encoded, question_families, answers)
            # ------------ Checkpoint GRAPH DATA ------------ #
            save_graph_pairdata(out_fpp, data_s_list, data_t_list,
                                is_directed_graph=is_directed_graph)
            logger.info("-------------- CHECKPOINT: COMPLETED --------")

        if (args.max_sample > 0) and (num_samples >= args.max_sample):
            logger.info(f"len(questions_encoded) = {len(questions_encoded)}")
            logger.info("args.max_sample reached: Completing ... ")
            break

    logger.debug(f"Total samples skipped = {num_skipped}")
    logger.debug(f"Total samples processed = {num_samples}")

    out_fpp = f"{out_dir}/{out_f_prefix}"
    ## SAVE .H5: Baseline {dataset}_h5.h5 file (q, p, ans, img_idx) as usual
    logger.info(f"Saving baseline (processed) data in: {out_fpp}.h5")
    save_h5(f"{out_fpp}.h5", vocab, questions_encoded, image_idxs, orig_idxs,
            programs_encoded, question_families, answers)
    ## ------------ SAVE GRAPH DATA ------------ ##
    ## N.b. ensure the lengths of these lists are all equal.
    save_graph_pairdata(out_fpp, data_s_list, data_t_list,
                        is_directed_graph=is_directed_graph)
    logger.info(f"Saved Graph Data in: {out_fpp}_*.[h5|.gpickle|.npz|.pt] ")
def main(args):
    print('Loading captions')
    with open(args.input_captions_json, 'r') as f:
        captions = json.load(f)
    with open(args.input_neg_captions_json, 'r') as f:
        neg_captions = json.load(f)
    with open(args.split_json, 'r') as f:
        splits = json.load(f)
    all_imgs = sorted(os.listdir(args.input_image_dir))
    captioned_imgs = list(captions.keys())
    all_captions = []
    for img, caps in captions.items():
        all_captions.extend(caps)
    all_neg_captions = []
    for img, caps in neg_captions.items():
        all_neg_captions.extend(caps)

    # Extract train data points
    train_split = splits['train']
    train_imgs = [all_imgs[idx] for idx in train_split]
    train_captions = []
    train_neg_captions = []
    for img in train_imgs:
        cap = captions[img]
        neg_cap = neg_captions[img]
        train_captions.extend(cap)
        train_neg_captions.extend(neg_cap)

    N = len(all_imgs)
    N_captioned = len(captions)
    M = len(all_captions)
    M_neg = len(all_neg_captions)
    print('Total images: %d' % N)
    print('Total captioned images: %d' % N_captioned)
    print('Total captions: %d' % M)
    print('Total negative captions: %d' % M_neg)
    print('Total train images: %d' % len(train_imgs))
    print('Total train captions: %d' % len(train_captions))
    print('Total train neg captions: %d' % len(train_neg_captions))

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '':
        print('Building vocab')
        word_to_idx = build_vocab(train_captions + train_neg_captions,
                                  min_token_count=args.word_count_threshold,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
    else:
        print('Loading vocab')
        with open(args.input_vocab_json, 'r') as f:
            word_to_idx = json.load(f)
    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(word_to_idx, f)

    # Encode all captions
    # First, figure out the max length of the captions
    all_cap_tokens = []
    max_length = -1
    cap_keys = sorted(list(captions.keys()))
    for img in cap_keys:
        caps = captions[img]
        n = len(caps)
        assert n > 0, 'error: some image has no caption'
        tokens_list = []
        for cap in caps:
            cap_tokens = tokenize(cap,
                                  add_start_token=True,
                                  add_end_token=False,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
            tokens_list.append(cap_tokens)
            max_length = max(max_length, len(cap_tokens))
        all_cap_tokens.append((img, tokens_list))

    all_neg_cap_tokens = []
    cap_keys = sorted(list(captions.keys()))
    for img in cap_keys:
        neg_caps = neg_captions[img]
        neg_n = len(neg_caps)
        assert neg_n > 0, 'error: some image has no caption'
        neg_tokens_list = []
        for neg_cap in neg_caps:
            neg_cap_tokens = tokenize(neg_cap,
                                      add_start_token=True,
                                      add_end_token=False,
                                      punct_to_keep=[';', ','],
                                      punct_to_remove=['?', '.'])
            neg_tokens_list.append(neg_cap_tokens)
        all_neg_cap_tokens.append((img, neg_tokens_list))

    print('Encoding captions')
    # Note: np.int was removed in NumPy 1.24; np.int64 keeps the same behaviour.
    label_arrays = []
    label_start_idx = -np.ones(N, dtype=np.int64)
    label_end_idx = -np.ones(N, dtype=np.int64)
    label_length = np.zeros(M, dtype=np.int64)
    caption_counter = 0
    counter = 0
    # Then encode
    for img, tokens_list in all_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            label_length[caption_counter] = len(tokens)
            caption_counter += 1
            tokens_encoded = encode(tokens, word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w  # captions are padded with zeros
        label_arrays.append(Li)
        label_start_idx[i] = counter
        label_end_idx[i] = counter + n - 1
        counter += n
    L = np.concatenate(label_arrays, axis=0)  # put all labels together
    assert L.shape[0] == M, "lengths don't match?"
    assert np.all(label_length > 0), 'error: some captions have no word?'

    print('Encoding negative captions')
    neg_label_arrays = []
    neg_label_start_idx = -np.ones(N, dtype=np.int64)
    neg_label_end_idx = -np.ones(N, dtype=np.int64)
    neg_label_length = np.zeros(M_neg, dtype=np.int64)
    neg_caption_counter = 0
    neg_counter = 0
    # Then encode
    for img, tokens_list in all_neg_cap_tokens:
        i = int(img.split('.')[0].split('_')[-1])
        n = len(tokens_list)
        Li = np.zeros((n, max_length), dtype=np.int64)
        for j, tokens in enumerate(tokens_list):
            neg_label_length[neg_caption_counter] = len(tokens)
            neg_caption_counter += 1
            tokens_encoded = encode(tokens, word_to_idx,
                                    allow_unk=args.allow_unk == 1)
            for k, w in enumerate(tokens_encoded):
                Li[j, k] = w  # captions are padded with zeros
        neg_label_arrays.append(Li)
        neg_label_start_idx[i] = neg_counter
        neg_label_end_idx[i] = neg_counter + n - 1
        neg_counter += n
    neg_L = np.concatenate(neg_label_arrays, axis=0)  # put all labels together
    assert neg_L.shape[0] == M_neg, "lengths don't match?"
    assert np.all(neg_label_length > 0), 'error: some captions have no word?'

    # Create h5 file
    print('Writing output')
    print('Encoded captions array size: ', L.shape)
    print('Encoded negative captions array size: ', neg_L.shape)
    with h5py.File(args.output_h5, 'w') as f:
        f.create_dataset('labels', data=L)
        f.create_dataset('label_start_idx', data=label_start_idx)
        f.create_dataset('label_end_idx', data=label_end_idx)
        f.create_dataset('label_length', data=label_length)
        f.create_dataset('neg_labels', data=neg_L)
        f.create_dataset('neg_label_start_idx', data=neg_label_start_idx)
        f.create_dataset('neg_label_end_idx', data=neg_label_end_idx)
        f.create_dataset('neg_label_length', data=neg_label_length)
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = preprocess_utils.build_vocab(
                (q['answer'] for q in questions))
        question_token_to_idx = preprocess_utils.build_vocab(
            (q['question'] for q in questions),
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = preprocess_utils.build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }
    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)
    if args.output_vocab_json != '':
        utils.mkdirs(os.path.dirname(args.output_vocab_json))
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = preprocess_utils.tokenize(
            question, punct_to_keep=[';', ','], punct_to_remove=['?', '.'])
        question_encoded = preprocess_utils.encode(
            question_tokens, vocab['question_token_to_idx'],
            allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = preprocess_utils.tokenize(program_str)
            program_encoded = preprocess_utils.encode(
                program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    utils.mkdirs(os.path.dirname(args.output_h5_file))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families', data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
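# As a quick sanity check, a short sketch (not part of the original script) of
# reading the HDF5 file written above back with h5py. 'train_questions.h5' is a
# hypothetical path; use whatever was passed as --output_h5_file.
import h5py

with h5py.File('train_questions.h5', 'r') as f:
    questions = f['questions'][:]   # (num_questions, max_question_length), int32
    image_idxs = f['image_idxs'][:]
    print('questions:', questions.shape, questions.dtype)
    print('image_idxs:', image_idxs.shape)
    # Optional datasets are present only when the source questions had them.
    if 'programs' in f:
        print('programs:', f['programs'].shape)
    if 'answers' in f:
        print('answers:', f['answers'].shape)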
def extract(self, source, paraphrase, position):
    s = set(tokenize(source))
    p = set(tokenize(paraphrase))
    return 1 - len(p.difference(s)) / (len(p) + 1)
def extract(self, source, paraphrase, position):
    wm = wm_distance(tokenize(source), tokenize(paraphrase))
    if math.isinf(wm):
        return 100
    return wm
def extract(self, source, paraphrase, position):
    s = set(tokenize(source))
    p = set(tokenize(paraphrase))
    return ent.shannon_entropy(" ".join(p.difference(s)))
def EditDistanceFF(source, paraphrase, position):
    levenshtein = editdistance.eval(source, paraphrase)
    normalized_l = normalized_damerau_levenshtein_distance(source, paraphrase)
    normalized_d = normalized_damerau_levenshtein_distance(
        tokenize(source), tokenize(paraphrase))
    return [levenshtein, normalized_l, normalized_d]
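# A short usage sketch of the two distance libraries assumed above
# (editdistance and pyxdameraulevenshtein), shown on plain strings.
import editdistance
from pyxdameraulevenshtein import normalized_damerau_levenshtein_distance

print(editdistance.eval("kitten", "sitting"))                     # 3 character edits
# Damerau-Levenshtein counts the "ti" -> "it" transposition as a single edit,
# normalized by the length of the longer string.
print(normalized_damerau_levenshtein_distance("smtih", "smith"))  # 0.2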