def text_decomposition(text, lang='de'):
    """Split ``text`` into a list of sentence strings.

    The sentence boundaries come from a spaCy pipeline selected by ``lang``:

    * ``'de'`` -- the ``de_core_news_md`` model,
    * ``'en'`` -- the ``en_core_web_md`` model,
    * ``'ru'`` -- a blank ``Russian`` pipeline with a ``sentencizer`` added.

    Args:
        text: The raw text to split.
        lang: Language code; one of ``'de'`` (default), ``'en'`` or ``'ru'``.

    Returns:
        A list with the text of every sentence found, or ``None`` (after
        printing a message) when ``lang`` is not one of the supported codes.
    """
    # Guard clause: bail out before any pipeline is built.
    if lang not in ('de', 'en', 'ru'):
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return None

    if lang == 'ru':
        # No model is loaded for Russian here, so sentence splitting is
        # provided by the sentencizer component added to a blank pipeline.
        pipeline = Russian()
        pipeline.add_pipe(pipeline.create_pipe("sentencizer"))
    else:
        model_name = 'de_core_news_md' if lang == 'de' else "en_core_web_md"
        pipeline = spacy.load(model_name)

    parsed = pipeline(text)
    return [sentence.text for sentence in parsed.sents]
# Example #2
# 0
    def spacy_sentence_scores(self) -> Dict[str, float]:
        """Score each sentence of ``self.text`` by the frequency of its words.

        The text is tokenised and split into sentences with a blank
        ``Russian`` pipeline plus a ``sentencizer``. Every token that is not
        in ``STOP_WORDS`` is reduced to its normal form via ``MORPH``; normal
        forms tagged as preposition, conjunction, particle or interjection
        are ignored. The remaining normal forms are counted and the counts
        are divided by the highest count. A sentence's score is the sum of
        these relative frequencies over its tokens.

        Returns:
            A mapping from sentence to score. Sentences that contain no
            counted word are absent. An empty dict is returned when no word
            in the text is counted at all (e.g. empty text).

            NOTE(review): the keys are the sentence objects yielded by
            ``docx.sents``, not plain strings as the annotation suggests --
            confirm what callers expect before changing either.
        """
        nlp = Russian()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        docx = nlp(self.text)

        # A set gives constant-time membership tests in the token loop.
        stopwords = frozenset(STOP_WORDS)
        function_word_tags = ('PREP', 'CONJ', 'PRCL', 'INTJ')

        # Both passes below need the normal form of every token; remember the
        # result per surface form so each distinct string is analysed once.
        lemma_cache = {}

        def content_lemma(token_text):
            """Return the normal form of ``token_text``, or None if its tag is excluded."""
            if token_text not in lemma_cache:
                parsed = MORPH.parse(token_text)[0].normalized
                excluded = any(tag in parsed.tag for tag in function_word_tags)
                lemma_cache[token_text] = None if excluded else parsed.word
            return lemma_cache[token_text]

        # Pass 1: count normal forms of all non-stop-word tokens.
        word_frequencies = {}
        for token in docx:
            if token.text in stopwords:
                continue
            lemma = content_lemma(token.text)
            if lemma is not None:
                word_frequencies[lemma] = word_frequencies.get(lemma, 0) + 1

        # Nothing countable: max() below would raise ValueError on an empty
        # sequence, and no sentence could receive a score anyway.
        if not word_frequencies:
            return {}

        maximum_frequency = max(word_frequencies.values())
        word_frequencies = {
            lemma: count / maximum_frequency
            for lemma, count in word_frequencies.items()
        }

        # Pass 2: sum the relative frequencies per sentence. Stop words are
        # deliberately not filtered here (matching the counting rules above
        # only through the frequency-table lookup).
        sentence_scores = {}
        for sent in docx.sents:
            for token in sent:
                lemma = content_lemma(token.text)
                if lemma in word_frequencies:
                    sentence_scores[sent] = (
                        sentence_scores.get(sent, 0.0) + word_frequencies[lemma]
                    )

        return sentence_scores