def adapt_complexity_and_polarity(model, tokenizer, device, adaptation_dto,
                                  mean_measures, n_iterations, epsilon,
                                  text_characteristics, debug):
    sentences, _ = com.calc_sentence_similarity(adaptation_dto.adapted_text())
    text = adaptation_dto.adapted_text()
    rel_polar = abs(
        (adaptation_dto.text_measures()['SENT_ANAL']['POLAR'] -
         mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
        /
        mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
    rel_read = abs((
        adaptation_dto.text_measures()['READ'] - mean_measures['READ'][adaptation_dto.target_pub_type()]) / \
               mean_measures['READ'][adaptation_dto.target_pub_type()])

    curr_diff = rel_polar + rel_read
    for s in sentences:
        if n_iterations == 0 or abs(curr_diff) <= epsilon:
            break
        sentences_result = com.split_into_sentences(text)

        paraphrases = generate_sequences(model, tokenizer, device, s[1])
        best_paraphrase = None
        best_paraphrase_text = None
        best_paraphrase_diff = None

        for p in paraphrases:
            replaced_list = [p if x == s[1] else x for x in sentences_result]
            replaced_text = " ".join(replaced_list)

            curr_polar_with_para = text_characteristics.calc_polarity_scores(
                replaced_text)
            curr_read_with_para = com.flesch_reading_ease(replaced_text)

            rel_polar_with_para = abs((
                curr_polar_with_para - mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR']) / \
                                 mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
            rel_read_with_para = abs((curr_read_with_para - mean_measures['READ'][adaptation_dto.target_pub_type()]) / \
                                mean_measures['READ'][adaptation_dto.target_pub_type()])
            curr_diff_with_para = rel_polar_with_para + rel_read_with_para

            if best_paraphrase is None or (curr_diff_with_para <
                                           best_paraphrase_diff):
                best_paraphrase = p
                best_paraphrase_text = replaced_text
                best_paraphrase_diff = curr_diff_with_para

        if (best_paraphrase is not None and best_paraphrase != s[1]
                and curr_diff > best_paraphrase_diff):
            text = best_paraphrase_text

            if debug:
                print("Replacing '", s[1], "' for '", best_paraphrase, "'")
                print("Relative difference after replacement: ",
                      best_paraphrase_diff)
            curr_diff = best_paraphrase_diff
        n_iterations = n_iterations - 1
    adaptation_dto.adapted_text(text)

    return adaptation_dto
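`generate_sequences` is not shown in this listing; a minimal sketch of such a paraphrase helper, assuming a Hugging Face seq2seq checkpoint behind `model`/`tokenizer` (the sampling parameters below are illustrative, not the original configuration):

def generate_sequences(model, tokenizer, device, sentence, n_sequences=5):
    # Encode the single input sentence and move it to the target device
    input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
    # Sample several candidate paraphrases in one call
    outputs = model.generate(input_ids,
                             do_sample=True,
                             top_k=50,
                             top_p=0.95,
                             max_length=256,
                             num_return_sequences=n_sequences)
    # Hypothetical helper: decode each sampled sequence back to text
    return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]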
def generate(adaptation_dto, mean_measures, debug):
    text = adaptation_dto.adapted_text()
    text_measures = adaptation_dto.text_measures()

    curr_diff = (text_measures['LEN'] - mean_measures['LEN'][adaptation_dto.target_pub_type()]) / \
                mean_measures['LEN'][adaptation_dto.target_pub_type()]

    mean_sent_length = com.mean_sentence_length(text)

    rel_sent_length = (com.mean_sentence_length(adaptation_dto.orig_text()) /
                       mean_measures['LEN'][adaptation_dto.target_pub_type()])  # TO-DO

    if abs(rel_sent_length) < abs(curr_diff):
        # If we have to generate more than an average sentence, we generate extra content from DBpedia and Wikipedia
        keywords = keywords_text_rank(text)
        for keyword in keywords:
            extra_content = get_extra_content(keyword, debug)
            # https://owl.purdue.edu/owl/general_writing/academic_writing/paragraphs_and_paragraphing/paragraphing.html#:~:text=Aim%20for%20three%20to%20five,longer%20paragraphs%20for%20longer%20papers.
            if abs(rel_sent_length) * 5 < abs(curr_diff):
                # 1.) Generate whole paragraphs that are similar to the given topic
                text = generate_additional_paragraphs(extra_content, text,
                                                      debug)
            else:
                # 2.) Split to sentences and find similar sentences to input after the similar sentence
                text = add_similar_sentences(adaptation_dto, mean_measures,
                                             extra_content, rel_sent_length,
                                             debug)

    else:
        pr = Preprocessing()
        pr.load_data()
        pr.encode_data()
        pr.generate_sequence()
        pr.get_data()

        # Generate text using RNN networks
        pred = Prediction(pr.tokenizer, pr.max_length)
        pred.load_model()

        sentences_desc_importance, _ = com.calc_sentence_similarity(
            text, True, debug)
        # We extend the least important sentence
        sentence = sentences_desc_importance[-1]
        extended_sentence = pred.predict_sequence(
            sentence[1], math.floor(mean_sent_length / 2))
        text = text.replace(sentence[1], extended_sentence)
        if debug:
            print("Extended sentence: '" + extended_sentence + "'")
    return text
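`keywords_text_rank` is not part of the snippet above; a minimal sketch, assuming gensim's TextRank keyword extractor (`gensim.summarization`, available in gensim < 4.0) with an illustrative ratio:

from gensim.summarization import keywords as textrank_keywords


def keywords_text_rank(text, ratio=0.1):
    # split=True returns the extracted keywords as a plain list of strings
    return textrank_keywords(text, ratio=ratio, split=True)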
def extractive_summarization(adaptation_dto, mean_measures, epsilon, debug):
    text = adaptation_dto.adapted_text()
    text_measures = adaptation_dto.text_measures()
    mean_sent_length = com.mean_sentence_length(text)

    abs_diff_length = abs(
        mean_measures['LEN'][adaptation_dto.target_pub_type()] -
        text_measures['LEN'])

    summary = com.extractive_summarization(adaptation_dto.adapted_text(),
                                           top_n=math.floor(abs_diff_length /
                                                            mean_sent_length),
                                           debug=debug)
    adaptation_dto.adapted_text(summary)
    return adaptation_dto
def add_similar_sentences(adaptation_dto, mean_measures, extra_content,
                          rel_sent_length, debug):
    text = adaptation_dto.adapted_text()
    text_measures = adaptation_dto.text_measures()
    list_of_lists_of_sents = list(
        map(lambda x: tokenize.sent_tokenize(x), extra_content))
    curr_diff = (mean_measures['LEN'][adaptation_dto.target_pub_type()] - text_measures['LEN']) / \
                mean_measures['LEN'][adaptation_dto.target_pub_type()]

    extra_sents = []
    orig_sents = com.split_into_sentences(text)

    for x in list_of_lists_of_sents:
        for y in x:
            extra_sents.append(y)
    sim_sents = similar_sentences(orig_sents, extra_sents, debug)
    while len(sim_sents) > 0 and -rel_sent_length > curr_diff:

        # Add the most similar sentence next to the original sentence
        sent = sim_sents[0]
        if debug:
            print("Appending \'" + sent[1] + "\' to \'" + sent[0] + "\'")
        text = text.replace(sent[0], sent[0] + sent[1])

        adaptation_dto.adapted_text(text)
        text_measures = adaptation_dto.text_measures()
        curr_diff = (mean_measures['LEN'][adaptation_dto.target_pub_type()] - text_measures['LEN']) / \
                mean_measures['LEN'][adaptation_dto.target_pub_type()]

        # Delete the similarity that you've already used
        sim_sents.pop(0)

    return text
def similar_sentences(orig_sents, extra_sents, debug):
    # Pair every original sentence with every extra sentence and sort by similarity
    sentence_pairs = []
    for x in orig_sents:
        for y in extra_sents:
            sentence_pairs.append((x, y, com.sentence_similarity(x, y)))

    sentence_pairs.sort(key=lambda x: x[2], reverse=True)
    return sentence_pairs
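`com.sentence_similarity` is defined elsewhere; a minimal sketch of one common implementation (cosine similarity over word-count vectors with NLTK), which may differ from the original helper:

import numpy as np
from nltk import word_tokenize
from nltk.cluster.util import cosine_distance


def sentence_similarity(sent1, sent2):
    words1 = [w.lower() for w in word_tokenize(sent1)]
    words2 = [w.lower() for w in word_tokenize(sent2)]
    vocab = list(set(words1 + words2))
    # Simple term-frequency vectors over the shared vocabulary
    v1 = np.array([words1.count(w) for w in vocab], dtype=float)
    v2 = np.array([words2.count(w) for w in vocab], dtype=float)
    # cosine_distance returns 1 - cosine similarity
    return 1.0 - cosine_distance(v1, v2)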
def generate_additional_paragraphs(extra_content, text, debug):
    weights = list(
        map(lambda x: com.sentence_similarity(x, text), extra_content))
    tuples = list(zip(extra_content, weights))
    tuples.sort(key=lambda x: x[1], reverse=True)
    for x in tuples:
        # Prepend a summarized paragraph for each extra-content item, most similar first
        text = summarize_wiki(x[0]) + "\n\n" + text
    return text
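`get_extra_content` and `summarize_wiki` are also defined elsewhere; a minimal sketch assuming the third-party `wikipedia` package (the original code additionally mentions DBpedia, omitted here):

import wikipedia
from nltk import tokenize


def get_extra_content(keyword, debug=False):
    try:
        # Fetch the plain-text content of the best-matching Wikipedia page
        page = wikipedia.page(keyword, auto_suggest=True)
        if debug:
            print("Fetched extra content for:", keyword)
        return [page.content]
    except wikipedia.exceptions.WikipediaException:
        return []


def summarize_wiki(content, sentences=5):
    # Keep only the first few sentences as a rough, paragraph-length summary
    return " ".join(tokenize.sent_tokenize(content)[:sentences])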
    def __init__(self, text, debug):
        """ Constructor for this class. """
        self.edges = {}

        sentences, similarity_graph = com.calc_sentence_similarity(text,
                                                                   debug=debug)
        self.nodes = []
        for s in sentences:
            self.nodes.append(SentenceTree(s, debug))

        self.build_graph(sentences, similarity_graph, debug)
    def __init__(self, text, target_pub_type, debug=False):
        """ Constructor for this class. """
        self._orig_text = text
        self._adapted_text = text
        self._target_pub_type = target_pub_type

        self._entities, self._lower_entities = com.entities_from_text(text)
        self.text_characteristics = tc(self.target_pub_type())

        self.generate_adapted_text(debug)
        self.debug = debug
def generate_some_text(model,
                       tokenizer,
                       device,
                       input_str,
                       adaptation_dto,
                       text_len=250):
    while text_len > 0:
        text_len_curr = 512 if text_len > 512 else text_len

        sentences = com.split_into_sentences(input_str)

        text = input_str if len(sentences) < 3 else " ".join(
            sentences[(len(sentences) - 3):])
        # Keep only the preamble that is not fed to the model as context
        input_str = "" if len(sentences) < 3 else " ".join(
            sentences[:(len(sentences) - 3)])
        n_tokens = len(tokenize.word_tokenize(text))
        tokenized_text = tokenizer.encode(text, return_tensors="pt").to(device)
        if n_tokens <= 0 or tokenized_text.shape[-1] <= 0:
            return input_str

        max_length = text_len_curr + n_tokens + 30
        if tokenized_text.shape[-1] >= max_length:
            max_length = tokenized_text.shape[-1] + 30

        print(text_len)

        sample_outputs = model.generate(tokenized_text,
                                        pad_token_id=50256,
                                        do_sample=True,
                                        max_length=max_length,
                                        min_length=max_length - 30,
                                        repetition_penalty=1.2,
                                        temperature=0.7,
                                        top_k=50,
                                        top_p=0.95)
        input_str = input_str + " " + tokenizer.decode(
            sample_outputs[0].tolist())
        input_str = com.beautify_text(input_str, adaptation_dto)

        text_len = text_len - text_len_curr
    return input_str
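A minimal usage sketch for `generate_some_text`, assuming a stock GPT-2 checkpoint (`pad_token_id=50256` above is GPT-2's end-of-text id) and an already-constructed `adaptation_dto`; the checkpoint name is an assumption:

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer_gen = GPT2Tokenizer.from_pretrained("gpt2")  # assumed checkpoint
model_gen = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

extended = generate_some_text(model_gen, tokenizer_gen, device,
                              adaptation_dto.adapted_text(), adaptation_dto,
                              text_len=250)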
def adapt_length(model_gen,
                 model_sum,
                 tokenizer_gen,
                 tokenizer_sum,
                 device_gen,
                 device_sum,
                 adaptation_dto,
                 mean_measures,
                 n_iterations,
                 epsilon,
                 debug=False):
    text_measures = adaptation_dto.text_measures()

    curr_diff = (text_measures['LEN'] - mean_measures['LEN'][adaptation_dto.target_pub_type()]) / \
                mean_measures['LEN'][adaptation_dto.target_pub_type()]

    if debug:
        print("Starting difference sum: ", curr_diff)

    if mean_measures['LEN'][adaptation_dto.target_pub_type()] < 80:
        epsilon = abs(
            (8 - mean_measures['LEN'][adaptation_dto.target_pub_type()]) /
            mean_measures['LEN'][adaptation_dto.target_pub_type()])
    while n_iterations > 0 and abs(curr_diff) > epsilon:
        if curr_diff > 0:
            text = summarize(model_sum, tokenizer_sum, device_sum,
                             adaptation_dto, mean_measures, epsilon, debug)
        else:
            text = generate(model_gen, tokenizer_gen, device_gen,
                            adaptation_dto, mean_measures, debug)

        text = com.beautify_text(text, adaptation_dto, debug)

        length = len(tokenize.word_tokenize(text))
        curr_diff = (length -
                     mean_measures['LEN'][adaptation_dto.target_pub_type()]
                     ) / mean_measures['LEN'][adaptation_dto.target_pub_type()]
        if debug:
            print("Relative difference after length manipulation: ", curr_diff)
        adaptation_dto.adapted_text(text)
        n_iterations = n_iterations - 1

    return adaptation_dto
Example #11
def adapt_text(mode,
               model_para,
               model_gen,
               model_sum,
               tokenizer_para,
               tokenizer_gen,
               tokenizer_sum,
               device_para,
               device_gen,
               device_sum,
               text,
               mean_measures,
               target_publication_type,
               nlp,
               dictionary,
               tc,
               debug=False):
    debug = False
    epsilon = 0.1
    n_iterations_len = 3
    adaptation_dto = AdaptationDTO(text,
                                   target_pub_type=target_publication_type,
                                   debug=debug)

    text_after_length_manipulation = None
    for i in range(n_iterations_len):
        # adaptation_dto = p.adapt_complexity_and_polarity(model_para, tokenizer_para, device_para, adaptation_dto, mean_measures, n_iterations, epsilon, debug)
        # adaptation_dto = lm.adapt_length(adaptation_dto, mean_measures, n_iterations, epsilon, debug)

        adaptation_dto = sum_nn.adapt_length(model_gen, model_sum,
                                             tokenizer_gen, tokenizer_sum,
                                             device_gen, device_sum,
                                             adaptation_dto, mean_measures, 1,
                                             epsilon, debug)
        n_iterations = round(100 / n_iterations_len)
        if target_publication_type == 'RES_ARTCL':
            n_iterations = round(250 / n_iterations_len)
        if target_publication_type == 'SOC_MED':
            n_iterations = round(15 / n_iterations_len)
        if i == 0:
            text_after_length_manipulation = adaptation_dto.adapted_text()
        adaptation_dto = sr.adapt_complexity_and_polarity(
            adaptation_dto, mean_measures, n_iterations, epsilon, nlp,
            dictionary, tc, debug)

        n_iterations = round(20 / n_iterations_len)
        if target_publication_type == 'RES_ARTCL':
            n_iterations = round(50 / n_iterations_len)
        if target_publication_type == 'SOC_MED':
            n_iterations = round(3 / n_iterations_len)
        if mode == 'PARA':
            adaptation_dto = p.adapt_complexity_and_polarity(
                model_para, tokenizer_para, device_para, adaptation_dto,
                mean_measures, n_iterations, epsilon, tc, debug)
        beautified = com.beautify_text(adaptation_dto.adapted_text(),
                                       adaptation_dto, debug)
        adaptation_dto.adapted_text(beautified)

    # if debug:
    #     print("\n\n    ------------------------------- ORIGINAL TEXT -------------------------------    \n\n",
    #           text)
    #     print("\n\n    ------------------------------- ADAPTED  TEXT -------------------------------    \n\n",
    #           beautified)

    return beautified, text_after_length_manipulation
def summarize(model,
              tokenizer,
              device,
              adaptation_dto,
              mean_measures,
              epsilon,
              debug=False):
    init_text = adaptation_dto.adapted_text()
    tokens = tokenize.word_tokenize(init_text)
    n_blocks = math.floor(len(tokens) / 512)
    blocks_tokens = []
    for i in range(n_blocks):
        blocks_tokens.append(tokens[i * 512:(i + 1) * 512])

    blocks_tokens.append(tokens[(n_blocks) * 512:])
    blocks = []
    for b in blocks_tokens:
        tmp = ""
        for t in b:
            tmp = tmp + t + " "
        blocks.append(tmp)
    summary = ""
    if debug:
        print(len(blocks))
    target_len = mean_measures["LEN"][adaptation_dto.target_pub_type()]
    a_tokens = round(math.floor((1 - epsilon) * target_len) / (n_blocks + 1))
    b_tokens = round(math.ceil((1 + epsilon) * target_len) / (n_blocks + 1))
    count = 0
    for text in blocks:
        if debug:
            print(count)
        count = count + 1
        preprocess_text = text.strip().replace("\n", "")
        t5_prepared_text = "summarize: " + preprocess_text

        tokenized_text = tokenizer.encode(t5_prepared_text,
                                          return_tensors="pt").to(device)

        summary_ids = model.generate(
            tokenized_text,
            num_beams=4,
            no_repeat_ngram_size=2,
            min_length=min(a_tokens, b_tokens),
            max_length=max(a_tokens, b_tokens),
            early_stopping=False,
            repetition_penalty=2.5,
            length_penalty=1.50)

        summary = summary + " " + tokenizer.decode(summary_ids[0],
                                                   skip_special_tokens=False)
        summary = com.beautify_text(summary, adaptation_dto)

        if debug:
            print("Summarization done")
        # print("\n\n    ------------------------------- ORIGINAL TEXT -------------------------------    \n\n",
        #       adaptation_dto.adapted_text())

    # if debug:
    # print("\n\n    ------------------------------- SUMMARY -------------------------------    \n\n", summary)

    adaptation_dto.adapted_text(summary)

    if debug:
        print("Text measures after summarization: ",
              adaptation_dto.text_measures())

    return summary
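A minimal usage sketch for `summarize`, assuming a stock T5 checkpoint (the "summarize: " prefix above is T5's summarization prompt) and objects shaped like those used above; the checkpoint name is an assumption:

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

device_sum = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer_sum = T5Tokenizer.from_pretrained("t5-base")  # assumed checkpoint
model_sum = T5ForConditionalGeneration.from_pretrained("t5-base").to(device_sum)

summary = summarize(model_sum, tokenizer_sum, device_sum, adaptation_dto,
                    mean_measures, epsilon=0.1, debug=True)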
def adapt_complexity_and_polarity(adaptation_dto, mean_measures, n_iterations,
                                  epsilon, nlp, dictionary,
                                  text_characteristics, debug):

    text = adaptation_dto.adapted_text()

    doc = nlp(text)

    _, entities = adaptation_dto.entities()

    stop_words = set(stopwords.words('english'))

    regex = re.compile('[^a-zA-Z]')
    filtered_tokens = set([
        w for w in doc if w.text not in stop_words
        and w.text == regex.sub('', w.text) and w.text not in entities
    ])

    polarity_readability = list(
        map(word_dictionary_item, list(filtered_tokens)))

    rel_polarity_readability = list(
        map(
            lambda x: relative_difference_dict_item(
                x, mean_measures, adaptation_dto), polarity_readability))

    sorted_rel_polarity_readability = sorted(rel_polarity_readability,
                                             key=lambda k: k['RP'],
                                             reverse=True)

    n_words = len(filtered_tokens)
    rel_polar = abs(
        (adaptation_dto.text_measures()['SENT_ANAL']['POLAR'] -
         mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
        /
        mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
    rel_read = abs((
        adaptation_dto.text_measures()['READ'] - mean_measures['READ'][adaptation_dto.target_pub_type()]) / \
               mean_measures['READ'][adaptation_dto.target_pub_type()])

    curr_diff = rel_polar + rel_read
    if debug:
        print("Starting difference sum: ", curr_diff)

    ix = 0
    while ix < n_iterations and ix < n_words and abs(curr_diff) > epsilon:
        word = sorted_rel_polarity_readability[ix]['WORD']
        tag = sorted_rel_polarity_readability[ix]['TAG']
        best_synonym = None
        best_synonym_diff = None

        synonyms = dictionary.synonym(word)
        if synonyms is None:
            synonyms = dictionary.synonym(word.lower())

        for syn in (synonyms if synonyms is not None else []):
            syn = try_fix_form((word, tag), nltk.pos_tag([syn])[0])
            if syn is None:
                continue

            text_with_syn = text
            if re.search('[^a-zA-Z]', syn) is None:
                text_with_syn = re.sub(r'([^\w]+)' + word + r'([^\w]+)',
                                       r'\1' + syn + r'\2', text_with_syn)
                text_with_syn = re.sub(
                    r'([^\w]+)' + word.lower() + r'([^\w]+)',
                    r'\1' + syn + r'\2', text_with_syn)
            else:
                text_with_syn = text_with_syn.replace(word, syn)
                text_with_syn = text_with_syn.replace(word.lower(), syn)

            curr_polar_with_syn = text_characteristics.calc_polarity_scores(
                text_with_syn)
            curr_read_with_syn = com.flesch_reading_ease(text_with_syn)

            rel_polar_with_syn = abs((
                curr_polar_with_syn - mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR']) / \
                                 mean_measures['SENT_ANAL'][adaptation_dto.target_pub_type()]['POLAR'])
            rel_read_with_syn = abs((curr_read_with_syn - mean_measures['READ'][adaptation_dto.target_pub_type()]) / \
                                mean_measures['READ'][adaptation_dto.target_pub_type()])

            curr_diff_with_syn = rel_polar_with_syn + rel_read_with_syn

            if best_synonym is None or (curr_diff_with_syn <
                                        best_synonym_diff):
                best_synonym = syn
                best_synonym_diff = curr_diff_with_syn

        if best_synonym is not None and best_synonym != word and curr_diff > best_synonym_diff:
            # We change all occurrences and set new current polarity

            if re.search('[^a-zA-Z]', best_synonym) is None:
                text = re.sub(r'([^\w]+)' + word + r'([^\w]+)',
                              r'\1' + best_synonym + r'\2', text)
                text = re.sub(r'([^\w]+)' + word.lower() + r'([^\w]+)',
                              r'\1' + best_synonym + r'\2', text)
            else:
                text = text.replace(word, best_synonym)
                text = text.replace(word.lower(), best_synonym)

            if debug:
                print("Replacing '", word, "' for '", best_synonym, "'")
                print("Relative difference after replacement: ",
                      best_synonym_diff)
            curr_diff = best_synonym_diff

        ix = ix + 1

    adaptation_dto.adapted_text(text)

    return adaptation_dto
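The relative-difference expression above is repeated for polarity and readability; a hypothetical helper (not part of the original code) that captures the formula |value - target| / target:

def relative_difference(value, target):
    # Absolute deviation of a measure from the target publication-type mean,
    # expressed relative to that mean
    return abs((value - target) / target)

# e.g. rel_read = relative_difference(
#     adaptation_dto.text_measures()['READ'],
#     mean_measures['READ'][adaptation_dto.target_pub_type()])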
Example #14
def adapt_readability(adaptation_dto, mean_measures, n_iterations, epsilon,
                      debug):
    dictionary = PyDictionary()
    nlp = spacy.load("en_core_web_sm")

    text = adaptation_dto.adapted_text()

    doc = nlp(text)

    _, entities = adaptation_dto.entities()

    stop_words = set(stopwords.words('english'))

    regex = re.compile('[^a-zA-Z]')
    filtered_tokens = set([
        w for w in doc if w.text not in stop_words
        and w.text == regex.sub('', w.text) and w.text not in entities
    ])

    readability = list(map(word_dictionary_item, list(filtered_tokens)))

    rel_readability = list(
        map(
            lambda x: relative_difference_dict_item(
                x, mean_measures, adaptation_dto), readability))

    sorted_rel_readability = sorted(rel_readability,
                                    key=lambda k: k['READ'],
                                    reverse=True)

    n_words = len(filtered_tokens)
    rel_read = abs((adaptation_dto.text_measures()['READ'] -
                    mean_measures['READ'][adaptation_dto.target_pub_type()]) /
                   mean_measures['READ'][adaptation_dto.target_pub_type()])

    curr_diff = rel_read
    if debug:
        print("Starting difference sum: ", curr_diff)

    ix = 0
    text_characteristics = tc(adaptation_dto.target_pub_type())

    while ix < n_iterations and ix < n_words and abs(curr_diff) > epsilon:
        word = sorted_rel_readability[ix]['WORD']
        tag = sorted_rel_readability[ix]['TAG']
        best_synonym = None
        best_synonym_diff = None
        best_synonym_read = None

        synonyms = dictionary.synonym(word)

        for syn in (synonyms if synonyms is not None else []):
            syn = try_fix_form((word, tag), nltk.pos_tag([syn])[0])
            if syn is None:
                continue

            text_with_syn = text

            text_with_syn = re.sub(r'\b' + word + r'\b', syn, text_with_syn)
            text_with_syn = re.sub(r'([^a-zA-Z]+)' + word + r'([^a-zA-Z]+)',
                                   r'\1' + syn + r'\2', text_with_syn)

            curr_read_with_syn = com.flesch_reading_ease(text_with_syn)

            rel_read_with_syn = abs((
                curr_read_with_syn - mean_measures['READ'][adaptation_dto.target_pub_type()]) / \
                                 mean_measures['READ'][adaptation_dto.target_pub_type()])

            curr_diff_with_syn = rel_read_with_syn

            if best_synonym is None or (curr_diff_with_syn <
                                        best_synonym_diff):
                best_synonym = syn
                best_synonym_diff = curr_diff_with_syn
                best_synonym_read = curr_read_with_syn

        if best_synonym is not None and best_synonym != word and curr_diff > best_synonym_diff:
            # We change all occurrences and set new current readability
            text = re.sub(r'\b' + word + r'\b', best_synonym, text)
            text = re.sub(r'([^a-zA-Z]+)' + word + r'([^a-zA-Z]+)',
                          r'\1' + best_synonym + r'\2', text)

            if debug:
                print("Replacing '", word, "' for '", best_synonym, "'")
                print("Relative difference after replacement: ",
                      best_synonym_diff)
                print("Readability after replacement: ", best_synonym_read)

            curr_diff = best_synonym_diff

        ix = ix + 1

    adaptation_dto.adapted_text(text)

    return adaptation_dto
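`com.flesch_reading_ease` is defined elsewhere; for reference, the standard Flesch Reading Ease score it presumably computes, with an illustrative (not original) syllable heuristic:

import re
from nltk import tokenize


def flesch_reading_ease(text):
    sentences = tokenize.sent_tokenize(text)
    words = [w for w in tokenize.word_tokenize(text) if w.isalpha()]
    # Crude heuristic: count groups of vowels as syllables
    syllables = sum(
        max(1, len(re.findall(r'[aeiouy]+', w.lower()))) for w in words)
    # 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    return (206.835 - 1.015 * (len(words) / max(1, len(sentences))) -
            84.6 * (syllables / max(1, len(words))))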