def adapt_complexity_and_polarity(model, tokenizer, device, adaptation_dto,
                                  mean_measures, n_iterations, epsilon,
                                  text_characteristics, debug):
    sentences, _ = com.calc_sentence_similarity(adaptation_dto.adapted_text())
    text = adaptation_dto.adapted_text()
    # Relative distance of the current polarity and readability from the
    # target publication type's means; their sum is the objective to minimize.
    rel_polar = abs(
        (adaptation_dto.text_measures()['SENT_ANAL']['POLAR'] -
         mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR']) /
        mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
    rel_read = abs(
        (adaptation_dto.text_measures()['READ'] -
         mean_measures['READ'][adaptation_dto.target_pub_type()]) /
        mean_measures['READ'][adaptation_dto.target_pub_type()])
    curr_diff = rel_polar + rel_read
    for s in sentences:
        if n_iterations == 0 or abs(curr_diff) <= epsilon:
            break
        sentences_result = com.split_into_sentences(text)
        paraphrases = generate_sequences(model, tokenizer, device, s[1])
        best_paraphrase = None
        best_paraphrase_text = None
        best_paraphrase_diff = None
        for p in paraphrases:
            # Try each paraphrase in place of the original sentence and score
            # the resulting text.
            replaced_list = [p if x == s[1] else x for x in sentences_result]
            replaced_text = " ".join(replaced_list)
            curr_polar_with_para = text_characteristics.calc_polarity_scores(
                replaced_text)
            curr_read_with_para = com.flesch_reading_ease(replaced_text)
            rel_polar_with_para = abs(
                (curr_polar_with_para -
                 mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR']) /
                mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
            rel_read_with_para = abs(
                (curr_read_with_para -
                 mean_measures['READ'][adaptation_dto.target_pub_type()]) /
                mean_measures['READ'][adaptation_dto.target_pub_type()])
            curr_diff_with_para = rel_polar_with_para + rel_read_with_para
            if best_paraphrase is None or curr_diff_with_para < best_paraphrase_diff:
                best_paraphrase = p
                best_paraphrase_text = replaced_text
                best_paraphrase_diff = curr_diff_with_para
        # Only accept the best paraphrase if it actually improves the objective.
        if (best_paraphrase is not None and best_paraphrase != s[1]
                and curr_diff > best_paraphrase_diff):
            text = best_paraphrase_text
            if debug:
                print("Replacing '", s[1], "' with '", best_paraphrase, "'")
                print("Relative difference after replacement: ",
                      best_paraphrase_diff)
            curr_diff = best_paraphrase_diff
        n_iterations = n_iterations - 1
    adaptation_dto.adapted_text(text)
    return adaptation_dto

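# Both this paraphrase search and the synonym search further below minimize
# the same objective: the sum of the absolute relative differences of polarity
# and readability from the target publication type's means. A minimal
# self-contained sketch of that metric (the helper name `relative_difference`
# is illustrative, not part of this module):

def relative_difference(value, target_mean):
    """Absolute difference of `value` from `target_mean`, relative to it."""
    return abs((value - target_mean) / target_mean)

# Example: polarity 0.30 against a target mean of 0.25 and readability 60
# against a target mean of 50 give 0.2 + 0.2 == 0.4 as the difference sum.
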
def generate(adaptation_dto, mean_measures, debug):
    text = adaptation_dto.adapted_text()
    text_measures = adaptation_dto.text_measures()
    curr_diff = (text_measures['LEN'] -
                 mean_measures['LEN'][adaptation_dto.target_pub_type()]) / \
                mean_measures['LEN'][adaptation_dto.target_pub_type()]
    mean_sent_length = com.mean_sentence_length(text)
    rel_sent_length = com.mean_sentence_length(adaptation_dto.orig_text()) / \
        mean_measures['LEN'][adaptation_dto.target_pub_type()]
    # TO-DO
    if abs(rel_sent_length) < abs(curr_diff):
        # If we have to generate more than an average sentence, we generate
        # extra content from DBpedia and Wikipedia.
        keywords = keywords_text_rank(text)
        for keyword in keywords:
            extra_content = get_extra_content(keyword, debug)
            # https://owl.purdue.edu/owl/general_writing/academic_writing/paragraphs_and_paragraphing/paragraphing.html#:~:text=Aim%20for%20three%20to%20five,longer%20paragraphs%20for%20longer%20papers.
            if abs(rel_sent_length) * 5 < abs(curr_diff):
                # 1) Generate whole paragraphs that are similar to the given topic.
                text = generate_additional_paragraphs(extra_content, text, debug)
            else:
                # 2) Split into sentences and insert similar sentences after
                #    the sentences they resemble.
                text = add_similar_sentences(adaptation_dto, mean_measures,
                                             extra_content, rel_sent_length,
                                             debug)
    else:
        pr = Preprocessing()
        pr.load_data()
        pr.encode_data()
        pr.generate_sequence()
        pr.get_data()
        # Generate text using RNN networks.
        pred = Prediction(pr.tokenizer, pr.max_length)
        pred.load_model()
        sentences_desc_importance, _ = com.calc_sentence_similarity(
            text, True, debug)
        # We extend the least important sentence.
        sentence = sentences_desc_importance[-1]
        extended_sentence = pred.predict_sequence(
            sentence[1], math.floor(mean_sent_length / 2))
        text = text.replace(sentence[1], extended_sentence)
        if debug:
            print("Extended sentence: '" + extended_sentence + "'")
    return text

def extractive_summarization(adaptation_dto, mean_measures, epsilon, debug):
    text = adaptation_dto.adapted_text()
    text_measures = adaptation_dto.text_measures()
    mean_sent_length = com.mean_sentence_length(text)
    abs_diff_length = abs(
        mean_measures['LEN'][adaptation_dto.target_pub_type()] -
        text_measures['LEN'])
    # Size the summary to roughly close the length gap, one mean-length
    # sentence at a time.
    summary = com.extractive_summarization(adaptation_dto.adapted_text(),
                                           top_n=math.floor(abs_diff_length /
                                                            mean_sent_length),
                                           debug=debug)
    adaptation_dto.adapted_text(summary)
    return adaptation_dto

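# Worked example of the `top_n` arithmetic above: with a target mean length of
# 100 tokens, a current length of 250 tokens, and a mean sentence length of
# 25 tokens, abs_diff_length == 150 and top_n == math.floor(150 / 25) == 6,
# i.e. the extractive summary is sized to close the length gap sentence by
# sentence.
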
def add_similar_sentences(adaptation_dto, mean_measures, extra_content,
                          rel_sent_length, debug):
    text = adaptation_dto.adapted_text()
    text_measures = adaptation_dto.text_measures()
    list_of_lists_of_sents = list(
        map(lambda x: tokenize.sent_tokenize(x), extra_content))
    curr_diff = (mean_measures['LEN'][adaptation_dto.target_pub_type()] -
                 text_measures['LEN']) / \
                mean_measures['LEN'][adaptation_dto.target_pub_type()]
    extra_sents = []
    orig_sents = com.split_into_sentences(text)
    for x in list_of_lists_of_sents:
        for y in x:
            extra_sents.append(y)
    sim_sents = similar_sentences(orig_sents, extra_sents, debug)
    while len(sim_sents) > 0 and -rel_sent_length > curr_diff:
        # Add the most similar sentence next to the original sentence.
        sent = sim_sents[0]
        if debug:
            print("Appending '" + sent[1] + "' to '" + sent[0] + "'")
        text = text.replace(sent[0], sent[0] + " " + sent[1])
        adaptation_dto.adapted_text(text)
        text_measures = adaptation_dto.text_measures()
        curr_diff = (mean_measures['LEN'][adaptation_dto.target_pub_type()] -
                     text_measures['LEN']) / \
                    mean_measures['LEN'][adaptation_dto.target_pub_type()]
        # Delete the similarity pair that has already been used.
        sim_sents.pop(0)
    return text

def similar_sentences(orig_sents, extra_sents, debug):
    # Pair every original sentence with every candidate sentence and sort the
    # pairs by similarity, most similar first.
    sims = []
    for x in orig_sents:
        for y in extra_sents:
            sims.append((x, y, com.sentence_similarity(x, y)))
    sims.sort(key=lambda t: t[2], reverse=True)
    return sims

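# Usage sketch for `similar_sentences` (a toy run; assuming
# `com.sentence_similarity` returns a score in [0, 1]): the result is every
# (original, candidate, score) triple sorted by score, so index 0 is the best
# match and callers like `add_similar_sentences` can pop pairs greedily.
#
#   pairs = similar_sentences(
#       ["Cats sleep a lot.", "Dogs bark."],
#       ["Felines rest often.", "Paris is in France."],
#       debug=False)
#   pairs[0]  # e.g. ("Cats sleep a lot.", "Felines rest often.", 0.83)
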
def generate_additional_paragraphs(extra_content, text, debug):
    # Rank the extra content by similarity to the current text and prepend a
    # summary of each piece.
    weights = list(
        map(lambda x: com.sentence_similarity(x, text), extra_content))
    tuples = list(zip(extra_content, weights))
    tuples.sort(key=lambda x: x[1], reverse=True)
    for x in tuples:
        text = summarize_wiki(x[0]) + "\n\n" + text
    return text

def __init__(self, text, debug):
    """Constructor for this class."""
    self.edges = {}
    sentences, similarity_graph = com.calc_sentence_similarity(text,
                                                               debug=debug)
    self.nodes = []
    for s in sentences:
        self.nodes.append(SentenceTree(s, debug))
    self.build_graph(sentences, similarity_graph, debug)

def __init__(self, text, target_pub_type, debug=False):
    """Constructor for this class."""
    self._orig_text = text
    self._adapted_text = text
    self._target_pub_type = target_pub_type
    self.debug = debug
    self._entities, self._lower_entities = com.entities_from_text(text)
    self.text_characteristics = tc(self.target_pub_type())
    self.generate_adapted_text(debug)

def generate_some_text(model, tokenizer, device, input_str, adaptation_dto,
                       text_len=250):
    while text_len > 0:
        # Generate at most 512 tokens per pass.
        text_len_curr = 512 if text_len > 512 else text_len
        sentences = com.split_into_sentences(input_str)
        # Use the last three sentences as the generation context.
        text = input_str if len(sentences) < 3 else " ".join(sentences[-3:])
        input_str = " ".join(sentences[:-3])
        n_tokens = len(tokenize.word_tokenize(text))
        tokenized_text = tokenizer.encode(text, return_tensors="pt").to(device)
        if n_tokens <= 0 or tokenized_text.shape[-1] <= 0:
            return input_str
        max_length = text_len_curr + n_tokens + 30
        if tokenized_text.shape[-1] >= max_length:
            max_length = tokenized_text.shape[-1] + 30
        sample_outputs = model.generate(tokenized_text,
                                        pad_token_id=50256,
                                        do_sample=True,
                                        max_length=max_length,
                                        min_length=max_length - 30,
                                        repetition_penalty=1.2,
                                        temperature=0.7,
                                        top_k=50,
                                        top_p=0.95)
        input_str = input_str + " " + tokenizer.decode(
            sample_outputs[0].tolist())
        input_str = com.beautify_text(input_str, adaptation_dto)
        text_len = text_len - text_len_curr
    return input_str

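# `generate_some_text` generates in chunks: it peels off the last three
# sentences as the model context, generates up to 512 new tokens, re-appends
# the decoded output, and repeats until the requested budget is spent. A
# minimal sketch of the budget bookkeeping alone (pure Python, no model):
#
#   text_len = 1200
#   chunks = []
#   while text_len > 0:
#       chunk = 512 if text_len > 512 else text_len
#       chunks.append(chunk)
#       text_len -= chunk
#   # chunks == [512, 512, 176]
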
def adapt_length(model_gen, model_sum, tokenizer_gen, tokenizer_sum,
                 device_gen, device_sum, adaptation_dto, mean_measures,
                 n_iterations, epsilon, debug=False):
    text_measures = adaptation_dto.text_measures()
    curr_diff = (text_measures['LEN'] -
                 mean_measures['LEN'][adaptation_dto.target_pub_type()]) / \
                mean_measures['LEN'][adaptation_dto.target_pub_type()]
    if debug:
        print("Starting difference sum: ", curr_diff)
    # Widen the tolerance for short target lengths so the acceptable window
    # bottoms out at 8 tokens and the loop can terminate.
    if mean_measures['LEN'][adaptation_dto.target_pub_type()] < 80:
        epsilon = abs(
            (8 - mean_measures['LEN'][adaptation_dto.target_pub_type()]) /
            mean_measures['LEN'][adaptation_dto.target_pub_type()])
    while n_iterations > 0 and abs(curr_diff) > epsilon:
        if curr_diff > 0:
            # Text is too long: summarize.
            text = summarize(model_sum, tokenizer_sum, device_sum,
                             adaptation_dto, mean_measures, epsilon, debug)
        else:
            # Text is too short: generate.
            text = generate(model_gen, tokenizer_gen, device_gen,
                            adaptation_dto, mean_measures, debug)
        text = com.beautify_text(text, adaptation_dto, debug)
        length = len(tokenize.word_tokenize(text))
        curr_diff = (length -
                     mean_measures['LEN'][adaptation_dto.target_pub_type()]) / \
                    mean_measures['LEN'][adaptation_dto.target_pub_type()]
        if debug:
            print("Relative difference after length manipulation: ", curr_diff)
        adaptation_dto.adapted_text(text)
        n_iterations = n_iterations - 1
    return adaptation_dto

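# Worked example of the widened tolerance above (a sketch of the arithmetic,
# nothing beyond what the code states): for a target mean length of 50 tokens
# (< 80), epsilon becomes abs((8 - 50) / 50) == 0.84, so the loop accepts any
# length L with abs(L - 50) / 50 <= 0.84, i.e. roughly between 8 and 92
# tokens; the constant 8 sets the lower bound of the window.
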
def adapt_text(mode, model_para, model_gen, model_sum, tokenizer_para,
               tokenizer_gen, tokenizer_sum, device_para, device_gen,
               device_sum, text, mean_measures, target_publication_type, nlp,
               dictionary, tc, debug=False):
    epsilon = 0.1
    n_iterations_len = 3
    adaptation_dto = AdaptationDTO(text,
                                   target_pub_type=target_publication_type,
                                   debug=debug)
    text_after_length_manipulation = None
    for i in range(n_iterations_len):
        # adaptation_dto = p.adapt_complexity_and_polarity(model_para, tokenizer_para, device_para, adaptation_dto, mean_measures, n_iterations, epsilon, debug)
        # adaptation_dto = lm.adapt_length(adaptation_dto, mean_measures, n_iterations, epsilon, debug)
        adaptation_dto = sum_nn.adapt_length(model_gen, model_sum,
                                             tokenizer_gen, tokenizer_sum,
                                             device_gen, device_sum,
                                             adaptation_dto, mean_measures, 1,
                                             epsilon, debug)
        # Iteration budget for synonym replacement, per target type.
        n_iterations = round(100 / n_iterations_len)
        if target_publication_type == 'RES_ARTCL':
            n_iterations = round(250 / n_iterations_len)
        if target_publication_type == 'SOC_MED':
            n_iterations = round(15 / n_iterations_len)
        if i == 0:
            text_after_length_manipulation = adaptation_dto.adapted_text()
        adaptation_dto = sr.adapt_complexity_and_polarity(
            adaptation_dto, mean_measures, n_iterations, epsilon, nlp,
            dictionary, tc, debug)
        # Iteration budget for paraphrasing, per target type.
        n_iterations = round(20 / n_iterations_len)
        if target_publication_type == 'RES_ARTCL':
            n_iterations = round(50 / n_iterations_len)
        if target_publication_type == 'SOC_MED':
            n_iterations = round(3 / n_iterations_len)
        if mode == 'PARA':
            adaptation_dto = p.adapt_complexity_and_polarity(
                model_para, tokenizer_para, device_para, adaptation_dto,
                mean_measures, n_iterations, epsilon, tc, debug)
    beautified = com.beautify_text(adaptation_dto.adapted_text(),
                                   adaptation_dto, debug)
    adaptation_dto.adapted_text(beautified)
    return beautified, text_after_length_manipulation

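# A hedged usage sketch for `adapt_text`; the models, tokenizers, devices and
# `mean_measures` are assumed to be loaded elsewhere in the repo, and the
# argument values below are illustrative only:
#
#   adapted, after_len = adapt_text(
#       'PARA', model_para, model_gen, model_sum,
#       tokenizer_para, tokenizer_gen, tokenizer_sum,
#       device_para, device_gen, device_sum,
#       input_text, mean_measures, 'SOC_MED',
#       spacy.load('en_core_web_sm'), PyDictionary(), tc, debug=True)
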
def summarize(model, tokenizer, device, adaptation_dto, mean_measures, epsilon,
              debug=False):
    init_text = adaptation_dto.adapted_text()
    tokens = tokenize.word_tokenize(init_text)
    # Split the text into 512-token blocks (plus a remainder block) so each
    # block fits the model input.
    n_blocks = math.floor(len(tokens) / 512)
    blocks_tokens = []
    for i in range(n_blocks):
        blocks_tokens.append(tokens[i * 512:(i + 1) * 512])
    blocks_tokens.append(tokens[n_blocks * 512:])
    blocks = [" ".join(b) for b in blocks_tokens]
    summary = ""
    if debug:
        print(len(blocks))
    # Per-block minimum and maximum summary lengths, derived from the target
    # mean length minus/plus epsilon.
    a_tokens = round(
        math.floor(-epsilon * mean_measures["LEN"][adaptation_dto.target_pub_type()] +
                   mean_measures["LEN"][adaptation_dto.target_pub_type()]) /
        (n_blocks + 1))
    b_tokens = round(
        math.ceil(epsilon * mean_measures["LEN"][adaptation_dto.target_pub_type()] +
                  mean_measures["LEN"][adaptation_dto.target_pub_type()]) /
        (n_blocks + 1))
    count = 0
    for text in blocks:
        if debug:
            print(count)
        count = count + 1
        preprocess_text = text.strip().replace("\n", "")
        t5_prepared_text = "summarize: " + preprocess_text
        tokenized_text = tokenizer.encode(t5_prepared_text,
                                          return_tensors="pt").to(device)
        summary_ids = model.generate(
            tokenized_text,
            num_beams=4,
            no_repeat_ngram_size=2,
            min_length=b_tokens if a_tokens > b_tokens else a_tokens,
            max_length=a_tokens if a_tokens > b_tokens else b_tokens,
            early_stopping=False,
            repetition_penalty=2.5,
            length_penalty=1.50)
        summary = summary + " " + tokenizer.decode(summary_ids[0],
                                                   skip_special_tokens=False)
    summary = com.beautify_text(summary, adaptation_dto)
    if debug:
        print("Summarization done")
    adaptation_dto.adapted_text(summary)
    if debug:
        print("Text measures after summarization: ",
              adaptation_dto.text_measures())
    return summary

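# `summarize` slices the token list into 512-token blocks plus a remainder so
# each block fits the model input. A self-contained sketch of the slicing,
# shown with a block size of 4 for readability:
#
#   import math
#   tokens = list(range(10))
#   size = 4
#   n_blocks = math.floor(len(tokens) / size)
#   blocks = [tokens[i * size:(i + 1) * size] for i in range(n_blocks)]
#   blocks.append(tokens[n_blocks * size:])
#   # blocks == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
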
def adapt_complexity_and_polarity(adaptation_dto, mean_measures, n_iterations,
                                  epsilon, nlp, dictionary,
                                  text_characteristics, debug):
    text = adaptation_dto.adapted_text()
    doc = nlp(text)
    _, entities = adaptation_dto.entities()
    stop_words = set(stopwords.words('english'))
    regex = re.compile('[^a-zA-Z]')
    # Keep alphabetic, non-stop-word tokens that are not named entities.
    filtered_tokens = set([
        w for w in doc if w.text not in stop_words
        and w.text == regex.sub('', w.text) and w.text not in entities
    ])
    polarity_readability = list(
        map(word_dictionary_item, list(filtered_tokens)))
    rel_polarity_readability = list(
        map(
            lambda x: relative_difference_dict_item(x, mean_measures,
                                                    adaptation_dto),
            polarity_readability))
    sorted_rel_polarity_readability = sorted(rel_polarity_readability,
                                             key=lambda k: k['RP'],
                                             reverse=True)
    n_words = len(filtered_tokens)
    rel_polar = abs(
        (adaptation_dto.text_measures()['SENT_ANAL']['POLAR'] -
         mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR']) /
        mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
    rel_read = abs(
        (adaptation_dto.text_measures()['READ'] -
         mean_measures['READ'][adaptation_dto.target_pub_type()]) /
        mean_measures['READ'][adaptation_dto.target_pub_type()])
    curr_diff = rel_polar + rel_read
    if debug:
        print("Starting difference sum: ", curr_diff)
    ix = 0
    while ix < n_iterations and ix < n_words and abs(curr_diff) > epsilon:
        word = sorted_rel_polarity_readability[ix]['WORD']
        tag = sorted_rel_polarity_readability[ix]['TAG']
        best_synonym = None
        best_synonym_diff = None
        synonyms = dictionary.synonym(word)
        if synonyms is None:
            synonyms = dictionary.synonym(word.lower())
        for syn in (synonyms if synonyms is not None else []):
            # Match the synonym's form (tense, number, ...) to the original word.
            syn = try_fix_form((word, tag), nltk.pos_tag([syn])[0])
            if syn is None:
                continue
            text_with_syn = text
            if re.search('[^a-zA-Z]', syn) is None:
                text_with_syn = re.sub(r'([^\w]+)' + word + r'([^\w]+)',
                                       r'\1' + syn + r'\2', text_with_syn)
                text_with_syn = re.sub(
                    r'([^\w]+)' + word.lower() + r'([^\w]+)',
                    r'\1' + syn + r'\2', text_with_syn)
            else:
                text_with_syn = text_with_syn.replace(word, syn)
                text_with_syn = text_with_syn.replace(word.lower(), syn)
            curr_polar_with_syn = text_characteristics.calc_polarity_scores(
                text_with_syn)
            curr_read_with_syn = com.flesch_reading_ease(text_with_syn)
            rel_polar_with_syn = abs(
                (curr_polar_with_syn -
                 mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR']) /
                mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
            rel_read_with_syn = abs(
                (curr_read_with_syn -
                 mean_measures['READ'][adaptation_dto.target_pub_type()]) /
                mean_measures['READ'][adaptation_dto.target_pub_type()])
            curr_diff_with_syn = rel_polar_with_syn + rel_read_with_syn
            if best_synonym is None or curr_diff_with_syn < best_synonym_diff:
                best_synonym = syn
                best_synonym_diff = curr_diff_with_syn
        if (best_synonym is not None and best_synonym != word
                and curr_diff > best_synonym_diff):
            # We change all occurrences and set the new current difference.
            # Note: the substitutions chain on `text` (the original version
            # used `text_with_syn`, which held the last tried synonym, not
            # necessarily the best one).
            if re.search('[^a-zA-Z]', best_synonym) is None:
                text = re.sub(r'([^\w]+)' + word + r'([^\w]+)',
                              r'\1' + best_synonym + r'\2', text)
                text = re.sub(r'([^\w]+)' + word.lower() + r'([^\w]+)',
                              r'\1' + best_synonym + r'\2', text)
            else:
                text = text.replace(word, best_synonym)
                text = text.replace(word.lower(), best_synonym)
            if debug:
                print("Replacing '", word, "' with '", best_synonym, "'")
                print("Relative difference after replacement: ",
                      best_synonym_diff)
            curr_diff = best_synonym_diff
        ix = ix + 1
    adaptation_dto.adapted_text(text)
    return adaptation_dto

def adapt_readability(adaptation_dto, mean_measures, n_iterations, epsilon,
                      debug):
    dictionary = PyDictionary()
    nlp = spacy.load("en_core_web_sm")
    text = adaptation_dto.adapted_text()
    doc = nlp(text)
    _, entities = adaptation_dto.entities()
    stop_words = set(stopwords.words('english'))
    regex = re.compile('[^a-zA-Z]')
    filtered_tokens = set([
        w for w in doc if w.text not in stop_words
        and w.text == regex.sub('', w.text) and w.text not in entities
    ])
    readability = list(map(word_dictionary_item, list(filtered_tokens)))
    rel_readability = list(
        map(
            lambda x: relative_difference_dict_item(x, mean_measures,
                                                    adaptation_dto),
            readability))
    sorted_rel_readability = sorted(rel_readability,
                                    key=lambda k: k['READ'],
                                    reverse=True)
    n_words = len(filtered_tokens)
    rel_read = abs((adaptation_dto.text_measures()['READ'] -
                    mean_measures['READ'][adaptation_dto.target_pub_type()]) /
                   mean_measures['READ'][adaptation_dto.target_pub_type()])
    curr_diff = rel_read
    if debug:
        print("Starting difference sum: ", curr_diff)
    ix = 0
    text_characteristics = tc(adaptation_dto.target_pub_type())
    while ix < n_iterations and ix < n_words and abs(curr_diff) > epsilon:
        word = sorted_rel_readability[ix]['WORD']
        tag = sorted_rel_readability[ix]['TAG']
        best_synonym = None
        best_synonym_diff = None
        best_synonym_read = None
        synonyms = dictionary.synonym(word)
        for syn in (synonyms if synonyms is not None else []):
            syn = try_fix_form((word, tag), nltk.pos_tag([syn])[0])
            if syn is None:
                continue
            text_with_syn = text
            # The boundary pattern must be a raw string; '\b' in a plain
            # string literal is a backspace character, not a word boundary.
            text_with_syn = re.sub(r'\b' + word + r'\b', syn, text_with_syn)
            text_with_syn = re.sub(r'([^a-zA-Z]+)' + word + r'([^a-zA-Z]+)',
                                   r'\1' + syn + r'\2', text_with_syn)
            curr_read_with_syn = com.flesch_reading_ease(text_with_syn)
            rel_read_with_syn = abs(
                (curr_read_with_syn -
                 mean_measures['READ'][adaptation_dto.target_pub_type()]) /
                mean_measures['READ'][adaptation_dto.target_pub_type()])
            curr_diff_with_syn = rel_read_with_syn
            if best_synonym is None or curr_diff_with_syn < best_synonym_diff:
                best_synonym = syn
                best_synonym_diff = curr_diff_with_syn
                best_synonym_read = curr_read_with_syn
        if (best_synonym is not None and best_synonym != word
                and curr_diff > best_synonym_diff):
            # We change all occurrences and set the new current readability.
            text = re.sub(r'\b' + word + r'\b', best_synonym, text)
            text = re.sub(r'([^a-zA-Z]+)' + word + r'([^a-zA-Z]+)',
                          r'\1' + best_synonym + r'\2', text)
            if debug:
                print("Replacing '", word, "' with '", best_synonym, "'")
                print("Relative difference after replacement: ",
                      best_synonym_diff)
                print("Readability after replacement: ", best_synonym_read)
            curr_diff = best_synonym_diff
        ix = ix + 1
    adaptation_dto.adapted_text(text)
    return adaptation_dto
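
# Why the raw-string fix in `adapt_readability` matters: in a plain string
# literal, '\b' is the backspace character (0x08), so the pattern can never
# match normal text, while r'\b' is the regex word-boundary assertion:
#
#   import re
#   re.sub('\b' + 'cat' + '\b', 'dog', "the cat sat")    # "the cat sat" (no match)
#   re.sub(r'\b' + 'cat' + r'\b', 'dog', "the cat sat")  # "the dog sat"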