import nltk
from nltk.corpus import stopwords
from nltk.parse.corenlp import CoreNLPParser
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer


def filtering_via_syntactic_and_semantic_information_replace(pert_sent, synonyms):
    """Filter sentences by synonyms and constituency structure for PaInv-Replace.

    Returns a dictionary from each original sentence to its list of filtered
    sentences. Relies on two helpers defined elsewhere in the module:
    get_wordnet_pos (maps a word to its WordNet POS tag) and treeDistance
    (a distance between constituency parse trees).
    """
    stopWords = list(set(stopwords.words('english')))
    filtered_sent = {}
    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()
    tokenizer = TreebankWordTokenizer()
    # CoreNLPParser expects a CoreNLP server running on localhost
    eng_parser = CoreNLPParser('http://localhost:9000')

    for original_sentence in list(pert_sent.keys()):
        # Map each original sentence to its list of filtered sentences
        filtered_sent[original_sentence] = []
        tokens_or = tokenizer.tokenize(original_sentence)
        # Constituency tree of the source sentence
        source_tree = [i for (i,) in eng_parser.raw_parse_sents([original_sentence])]
        # Lemma of each word of the source sentence
        source_lem = [lemmatizer.lemmatize(w, get_wordnet_pos(w))
                      for w in nltk.word_tokenize(original_sentence)]
        new_sents = pert_sent[original_sentence]
        target_trees_GT = []
        num = 50
        # Parse the generated sentences in batches of `num`
        for x in range(len(new_sents) // num):
            target_trees_GT[(x * num):(x * num) + num] = [
                i for (i,) in eng_parser.raw_parse_sents(
                    new_sents[(x * num):(x * num) + num])]
        x = len(new_sents) // num
        target_trees_GT[(x * num):] = [
            i for (i,) in eng_parser.raw_parse_sents(new_sents[(x * num):])]

        for x in range(len(new_sents)):
            s = new_sents[x]
            target_lem = [lemmatizer.lemmatize(w, get_wordnet_pos(w))
                          for w in nltk.word_tokenize(s)]
            # If the sentence is identical to the original, filter it out
            if s.lower() == original_sentence.lower():
                continue
            # If its constituency structure differs too much from the
            # source (tree distance > 1), filter it out
            if treeDistance(target_trees_GT[x], source_tree[0]) > 1:
                continue
            # If the original and generated sentences have the same lemmas,
            # filter it out
            if target_lem == source_lem:
                continue
            # Tokens of the generated sentence
            tokens_tar = tokenizer.tokenize(s)
            # min() guards against length mismatches between tokenizations
            for i in range(min(len(tokens_or), len(tokens_tar))):
                if tokens_or[i] != tokens_tar[i]:
                    word1 = tokens_or[i]
                    word2 = tokens_tar[i]
                    word1_stem = stemmer.stem(word1)
                    word2_stem = stemmer.stem(word2)
                    word1_base = lemmatizer.lemmatize(word1, 'v')
                    word2_base = lemmatizer.lemmatize(word2, 'v')
                    # If the original and replacement words share a stem, skip
                    if word1_stem == word2_stem:
                        continue
                    # If they are synonyms of each other, skip
                    syn1 = synonyms(word1_base)
                    syn2 = synonyms(word2_base)
                    if ((word1 in syn2) or (word1_base in syn2)
                            or (word2 in syn1) or (word2_base in syn1)):
                        continue
                    # If either word is a stop word, skip
                    if ((word1 in stopWords) or (word2 in stopWords)
                            or (word1_stem in stopWords) or (word2_stem in stopWords)
                            or (word1_base in stopWords) or (word2_base in stopWords)):
                        continue
                    filtered_sent[original_sentence].append(s)
    return filtered_sent
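# The function above assumes a `synonyms` callable and a `get_wordnet_pos`
# helper that are not shown here (treeDistance, a distance over parse trees,
# is likewise assumed and not sketched). Below is a minimal, hypothetical
# WordNet-based sketch of both, plus an example call; the repository's actual
# definitions may differ, and the call requires a CoreNLP server listening
# on localhost:9000.
from nltk import pos_tag
from nltk.corpus import wordnet


def get_wordnet_pos(word):
    # Map the first letter of the Penn Treebank POS tag to a WordNet POS
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)


def wordnet_synonyms(word):
    # Collect the lemma names of every WordNet synset containing the word
    syns = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            syns.add(lemma.name().replace('_', ' '))
    return syns


# Hypothetical input: an original sentence mapped to one-word perturbations
pert_sent = {
    "He bought a big house.": ["He bought a large house.",
                               "He bought a red house."],
}
filtered = filtering_via_syntactic_and_semantic_information_replace(
    pert_sent, wordnet_synonyms)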
# (assumes input_file, eng_parser, FindInvariant, and num_word_th are
# defined earlier in the script)

# initialize stop words in the source language
stopwordsS = set(stopwords.words('english'))

# read the original sentences from file
ori_source_sents = []
with open(input_file) as file:
    for line in file:
        ori_source_sents.append(line.strip())

# a dictionary of RTIs: each key is a containing RTI, each value is the
# list of RTIs contained by that key
np_invariantsD = dict()

# parse the original source sentences
ori_source_trees = [
    i for (i, ) in eng_parser.raw_parse_sents(
        ori_source_sents, properties={'ssplit.eolonly': 'true'})
]

# find RTIs
for t, super_str in zip(ori_source_trees, ori_source_sents):
    FindInvariant(t, super_str, np_invariantsD, stopwordsS, num_word_th)
print('\ninvariants constructed\nThere are', len(np_invariantsD),
      'invariants. Filtering')

# Since an NP can be nearly identical to the original sentence (i.e., differ
# only in punctuation), we filter those duplicate pairs here by keying each
# sentence on its punctuation- and whitespace-free form.
chartosent = dict()
for sent in ori_source_sents:
    sent_no_pun = ''.join(
        sent.translate(str.maketrans('', '', string.punctuation)).strip().split())
    chartosent[sent_no_pun] = sent  # map normalized form back to the sentence
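# A minimal, hypothetical demonstration of the normalization above: the
# translate/strip/split/join chain removes all punctuation and whitespace,
# so two sentences that differ only in punctuation collapse to the same key.
import string

sent = "The boy, who ran fast."
sent_no_pun = ''.join(
    sent.translate(str.maketrans('', '', string.punctuation)).strip().split())
print(sent_no_pun)  # -> Theboywhoranfast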