Example No. 1
def get_keywords(text):
    keywords = []

    term_extractor = TermExtractor()
    for term in term_extractor(text):
        keywords.append(term.normalized)
    return keywords
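
A minimal usage sketch for the example above; the import line and the sample sentence are illustrative assumptions, since the listing omits the surrounding module code:

from rutermextract import TermExtractor

print(get_keywords('Съешь ещё этих мягких французских булок да выпей же чаю.'))
# prints a list of normalized key phrases; the exact terms depend on the library version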
Example No. 2
def echo(update, context):
    term_extractor = TermExtractor()
    # definition_list: List[str] = list()
    definition_list: List[List] = list()
    for term in term_extractor(
            update.message.text,
            # nested=True
            nested=False):
        # definition_list.append(term.normalized)
        # definition_list.append([term.normalized, term.count])
        words: List[List] = list()
        for word in term.words:
            words.append([
                word.parsed.word,
                str(word.parsed.tag),
                word.parsed.normal_form,
                word.parsed.score,
                repr(word.parsed.methods_stack),
            ])
        definition_list.append([
            term.normalized,
            term.count,
            term.word_count,
            json.dumps(words),
        ])

    # print(definition_list)
    # repr_definition_list = repr(definition_list)
    repr_definition_list = json.dumps(definition_list)
    # repr_definition_list = json.dumps(definition_list, ensure_ascii=False).encode('utf8')

    # debugging
    # f = open('/usr/src/app/src/log.txt', 'w')
    # f.write('definition_list = ' + repr_definition_list + '\n')
    # f.close()

    link_slug = insert(links, repr_definition_list)
    text = 'Обработка текста завершена. Результат доступен по ссылке: %s/?link=%s' % (
        host, link_slug)

    context.bot.send_message(
        chat_id=update.effective_chat.id,
        # text=definition_list
        # text='Извлечение ключевых слов успешно завершено! Посмотреть результат Вы можете по данной ссылке: [ссылка]'
        # text=repr_definition_list
        # text=link_slug
        text=text)
Example No. 3
 def __init__(self, stopwords_file=None, stopwords=None):
     # stopwords_file [string] - path to file containing stopwords
     assert stopwords_file or stopwords
     
     stopwords_file = stopwords_file or STOPWORDS_FILE
     self.stopwords = stopwords or load_wordset(stopwords_file)
     self.term_extractor = TermExtractor()
     self.morph = self.term_extractor.parser.morph
     self._morph_parse_cache = {}
Example No. 4
def _detect_topic_from_caption(caption: str) -> list:
    term_extractor = TermExtractor()

    themes = []
    for term in term_extractor(caption, limit=3):
        if len(term.normalized) <= MAX_SYMBOLS_FOR_TOPIC:
            themes.append(term.normalized)

    return themes
Example No. 5
def get_lexemas_from_text(cursor, atext=""):
    term_extractor = TermExtractor()
    mystem = pymystem3.Mystem()
    lexemas = []
    for term in term_extractor(atext):
        for lexema in str(term.normalized).split(" "):
            lexema = mystem.analyze(lexema)[0]['analysis'][0]['lex']
            id_lexema = lexema_id_by_inf(cursor, lexema)
            lexemas += [id_lexema]
    return lexemas
Example No. 6
 def __init__(self):
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()
     self.emb = NewsEmbedding()
     self.morph_tagger = NewsMorphTagger(self.emb)
     self.syntax_parser = NewsSyntaxParser(self.emb)
     self.ner_tagger = NewsNERTagger(self.emb)
     self.names_extractor = NamesExtractor(self.morph_vocab)
     self.doc = []
     self.term_extractor = TermExtractor()
Example No. 7
def get_pict(text):	
	term_extractor = TermExtractor()
	for term in term_extractor(text, nested=True):
		norm_term = term.normalized
		print(norm_term)
		result = find_pict(norm_term)
		if result:
			return result, norm_term
	return
		
#get_pict('Съешь ещё этих мягких французских булок да выпей же чаю.')
Example No. 8
        def getRuCollocations(self, text, rules, number):
            collocations = []
            termExtractor = TermExtractor()
            for term in termExtractor(text):
                collocations.append(term)

            collocations = self.filterRuCollocations(collocations, rules, number)
            # filter collocations
            collocations = list(map(lambda x: x.normalized, collocations))

            return collocations
Example No. 9
def get_words_from_files(cid_list, media_path):
    term_extractor = TermExtractor()
    morph_analyzer = pymorphy2.MorphAnalyzer()
    inflector = PhraseInflector(morph_analyzer)
    futures_groups = []
    for cid in cid_list:
        course_path = os.path.join(media_path, str(cid))
        futures = []
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
            for module_name in os.listdir(course_path):
                module_path = os.path.join(course_path, module_name)
                for file_name in os.listdir(module_path):
                    file_path = os.path.join(module_path, file_name)
                    futures.append(
                        executor.submit(get_words_from_file, term_extractor,
                                        morph_analyzer, inflector, file_path))
        futures_groups.append(futures)
    """
    words_num = 0
    pages_num = 1
    phrases_stat = {}
    """

    words_groups = []
    phrases_groups = []
    for futures in futures_groups:
        words = []
        phrases = []
        text = ''
        for future in futures:
            w, p = future.result()
            """
            words_num += len(w)
            pages = words_num // 500
            if pages > 0:
                pages_num += pages
                phrases_stat = {k: v + [0 for _ in range(pages)] for k, v in phrases_stat.items()}
            
            for phrase in p:
                if phrase[0] not in phrases_stat:
                    phrases_stat[phrase[0]] = [0 for _ in range(pages_num)]
                for i in range(1, pages + 1):
                    phrases_stat[phrase[0]][-i] += phrase[1] / pages
            """

            # w, txt = future.result()
            words += w
            # text += txt
            phrases += p
        words_groups.append(words)
        phrases_groups.append(phrases)
        # phrases_groups.append(text)

    return words_groups, phrases_groups
Example No. 10
 def __idf__(self, textsList):
     korpDic = {}
     for text in textsList:
         term_extractor = TermExtractor()
         for term in term_extractor(text, nested=True):
             if term.normalized in korpDic:
                 korpDic[term.normalized] = korpDic[term.normalized] + 1
             else:
                 korpDic[term.normalized] = 1
     for key in korpDic:
         korpDic[key] = math.log2(len(textsList) / korpDic[key])
     return korpDic
Example No. 11
 def __tf__(self, text):
     wordDic = {}
     termsQuantity = 0
     term_extractor = TermExtractor()
     for term in term_extractor(text, nested=True):
         termsQuantity += term.count
     for term in term_extractor(
             text,
             nested=True,
             weight=lambda term: term.count / termsQuantity):
         wordDic[term.normalized] = term.count / termsQuantity
     return wordDic
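
The two methods above produce a per-document term-frequency dictionary and a corpus-level inverse-document-frequency dictionary; a minimal sketch of combining them into TF-IDF scores (the combine_tfidf helper is hypothetical, not part of the original class):

def combine_tfidf(tf_dict, idf_dict):
    # score(term) = tf(term, document) * idf(term, corpus);
    # terms absent from the corpus dictionary get a score of 0.0 in this sketch
    return {term: tf * idf_dict.get(term, 0.0) for term, tf in tf_dict.items()}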
Example No. 12
 def __simpliFrequency__(self, textsList):
     korpDic = {}
     for text in textsList:
         term_extractor = TermExtractor()
         for term in term_extractor(text, nested=True):
             if term.normalized in korpDic:
                 korpDic[term.normalized] = korpDic[term.normalized] + term.count
             else:
                 korpDic[term.normalized] = 1
     for key in korpDic:
         korpDic[key] = korpDic[key] / len(textsList)
     return korpDic
Example No. 13
 def __tf__(self, text):
     wordDic = {}
     termsQuantity = 0
     term_extractor = TermExtractor()
     for term in term_extractor(text, nested=True):
         termsQuantity += term.count
     for term in term_extractor(
             text,
             nested=True,
             weight=lambda term: term.count / termsQuantity):
         norm = re.sub(r'[^\w\s]+', r' ', term.normalized).strip()
         wordDic[norm] = term.count / termsQuantity
     return wordDic
Example No. 14
def get_key_words_list(text):
    """Получает из текста список всех ключевых слов"""
    # Проходим извлекателем ключевых слов
    term_extractor = TermExtractor()
    terms = term_extractor(text)
    # dataframe structure
    dataframe_structure = {'key_word': [], 'count': []}
    for term in terms:
        dataframe_structure['key_word'].append(term.normalized)
        dataframe_structure['count'].append(term.count)

    result = pd.DataFrame(dataframe_structure)
    return result
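
A short usage sketch for the function above; the sample text and the sorting step are illustrative assumptions:

df = get_key_words_list('Съешь ещё этих мягких французских булок да выпей же чаю.')
print(df.sort_values('count', ascending=False).head(10))  # top key words by count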
Example No. 15
def main():
    file_path = str(
        input(
            "Введите путь к текстовому файлу по следующему формату C:\\Users...\\FileName.txt:\n"
        ))
    if not os.path.exists(file_path):
        print("Указанный файл не существует")
    else:
        with open(file_path, "r") as file:
            content = file.read()  # read the file contents
        # use the rutermextract library: split the text into words,
        # normalize them and compute the key words
        term_extractor = TermExtractor()
        output(term_extractor, content)  # print the results
Example No. 16
def theme(text):
    """
    Extracts the key themes from a text.
    Input:
        text ------------ str, the text to extract topics from
    Output:
        theme_text ------ str, the three main topics of the text, joined by ' | '
    """

    term_extractor = TermExtractor()
    list_theme = []
    for term in term_extractor(text):
        list_theme.append(term.normalized)
    theme_text = ' | '.join(list_theme[:3])
    return theme_text
def key_Word(text):

    key_words = {}
    keys = []
    values = []
    term_extractor = TermExtractor()
    for term in term_extractor(text):
        keys.append(term.normalized)
        values.append(term.count)

    # Pair each key word with its count
    key_words = dict(zip(keys, values))

    return key_words
Example No. 18
 def __seasNotNormaliseIdf__(self, serList):
     korpDic = {}
     for ser in serList:
         term_extractor = TermExtractor()
         temporarDict = {}
         for term in term_extractor(ser.listOfTexts, nested=True):
             norm = re.sub(r'[^\w\s]+', r' ', term.normalized).strip()
             if norm not in temporarDict:
                 if norm in korpDic:
                     korpDic[norm] = korpDic[norm] + 1
                 else:
                     korpDic[norm] = 1
                 temporarDict[norm] = 1
     for key in korpDic:
         korpDic[key] = korpDic[key]
     return korpDic
Example No. 19
def get_similarity(arg1, arg2):

    term_extractor = TermExtractor()

    # try:
    #     subterms1 = term_extractor(arg1, nested=True)
    # except TypeError as exep:
    #     print exep.args
    #     x = exep
    #     print 'arg1 = ', x

    subterms1 = term_extractor(arg1, nested=True)
    subterms2 = term_extractor(arg2, nested=True)

    ratio = 0

    average_length = (len(subterms1) + len(subterms2)) / 2
    if average_length == 0:
        return 0

    set1 = set(subterms1)
    set2 = set(subterms2)

    intersection = set.intersection(set1, set2)

    ratio += len(intersection)

    #print "ratio: %f" % ratio
    # for term0 in intersection:
    #     print "intersection %s" % term0.normalized

    set1_ = set.symmetric_difference(set1, intersection)
    set2_ = set.symmetric_difference(set2, intersection)

    for term1 in set1_:
        for term2 in set2_:
            # rat = fuzz.ratio(term1.normalized, term2.normalized)
            rat = fuzz.partial_ratio(term1.normalized, term2.normalized)
            # print rat
            if rat > 30:
                ratio += rat * 0.01

    metric = ratio / average_length  # TODO: mean or smth else
    # metric = np.mean(ratio)
    # print "similarity: %f" % metric

    return metric
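
If Term objects do not define equality and hashing on their normalized text (an assumption about the library, not confirmed by the example), the set intersection above can stay empty even for overlapping inputs; a defensive sketch compares normalized strings instead:

def get_similarity_normalized(arg1, arg2):
    term_extractor = TermExtractor()
    # Compare normalized term strings so set operations do not depend on Term's __eq__/__hash__
    terms1 = {term.normalized for term in term_extractor(arg1, nested=True)}
    terms2 = {term.normalized for term in term_extractor(arg2, nested=True)}
    average_length = (len(terms1) + len(terms2)) / 2
    if average_length == 0:
        return 0
    return len(terms1 & terms2) / average_length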
Example No. 20
def tag_mystem(text, mapping, m):
    text = re.sub(r'[A-Za-z&=;]+', r'', text)
    text = ' '.join(dell_stopwords(text))

    term_extractor = TermExtractor()
    limit = 30
    new_text = ' '.join([term.normalized for term in term_extractor(text, limit)])

    tagged = []
    for w in new_text.split():
        p = morph.parse(w)[0]
        POS = p.tag.POS
        if POS in mapping:
            tagged.append(p.normal_form + '_' + mapping[POS])
        else:
            tagged.append(p.normal_form + '_X')

    return np.array(tagged)
Example No. 21
    def get_list_skills(self, vacancy):
        """Функция из записи получает список ключевых навыков"""

        # Extract the key words
        term_extractor = TermExtractor()
        skills = [
            term.normalized
            for term in term_extractor(vacancy['description'], limit=10)
        ]

        # If the 'key_skills' field is present, extract the key skills from it as well
        # and merge them with the skills obtained via rutermextract
        if (not pd.isnull(vacancy['key_skills'])):
            skills = list(
                set(skills) | set(vacancy['key_skills'].lower().split(' | '))
            )  # remove duplicates

        return skills
Example No. 22
def theme(theme):
    term_extractor = TermExtractor()
    themestr = term_extractor(theme, nested=True, strings=True)
    InFile = open(settings.MEDIA_ROOT + tm + 'Ref.txt', 'r')
    OutFile = open(settings.MEDIA_ROOT + tm + 'Refing.txt', 'w')
    prevLine = ''  # initialize so the first matching line cannot raise a NameError
    for line in InFile:
        ruse = 0
        for i in range (0,len(themestr)):
            for k in range(0, len(library['predOpr']), 1):
                if themestr[i].lower() in line.lower() and ruse == 0 and library['predOpr'][k] in line:
                    OutFile.write(3 * '  ' + prevLine.replace('\n', '') + line)
                    prevLine = ''
                    ruse+=1
                elif themestr[i].lower() in line.lower() and ruse == 0:
                    OutFile.write(line)
                    ruse += 1
        if ruse == 0:
            prevLine = line
    OutFile.close()
    InFile.close()
    os.remove(settings.MEDIA_ROOT + tm + 'Ref.txt')
Example No. 23
    def __init__(self, vacancy):
        """Конструктор класса Vacancy
        vacancy - запись вакансии
        Создает объект Vacancy на основе записи
        """

        term_extractor = TermExtractor()

        self.name = vacancy['name.lemm']
        self.name_pattern = re.compile("|".join(
            [term.normalized for term in term_extractor(self.name, limit=10)]))
        self.experience = vacancy['experience']
        self.skills = [
            Skill(skill, 'skill') for skill in self.get_list_skills(vacancy)
        ]

        experience_name = vacancy['experience.name']

        print(
            f'Вакансия "{self.name}" успешно создана. Опыт {experience_name}. Ключевые навыки: {["".join(str(x.name)) for x in self.skills]}'
        )
Example No. 24
def get_similarity(arg1, arg2):

    arg1 = str(arg1)
    arg2 = str(arg2)

    arg1 = arg1.replace('\n', '')
    arg2 = arg2.replace('\n', '')

    term_extractor = TermExtractor()

    subterms1 = term_extractor(arg1, nested=True)
    subterms2 = term_extractor(arg2, nested=True)

    ratio = 0

    average_length = (len(subterms1) + len(subterms2)) / 2
    if average_length == 0:
        return 0

    set1 = set(subterms1)
    set2 = set(subterms2)

    intersection = set.intersection(set1, set2)

    ratio += len(intersection)

    set1_ = set.symmetric_difference(set1, intersection)
    set2_ = set.symmetric_difference(set2, intersection)

    # for term1 in set1_:
    #     for term2 in set2_:
    #         # rat = fuzz.ratio(term1.normalized, term2.normalized)
    #         rat = fuzz.partial_ratio(term1.normalized, term2.normalized)
    #         if rat > 30:
    #             ratio += rat*0.01

    metric = ratio / average_length  # TODO: mean or smth else
    # metric = np.mean(ratio)

    return metric
Example No. 25
def keywords_extraction(text):
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    text = text.replace('-', '')
    lang = detect(text)
    link = ''
    if lang == 'ru':
        term_extractor = TermExtractor()
        for term in term_extractor(text):
            if link:
                link = link + ', ' + term.normalized
            else:
                link = term.normalized
    elif lang == 'en':
        blob = TextBlob(text)
        #for term in [stem(n) for n,t in blob.tags if t == 'NN' or t == 'NNS']:
        for term in [n for n, t in blob.tags if t == 'NN' or t == 'NNS']:
            if link:
                link = link + ', ' + term
            else:
                link = term
    return text, link
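
A minimal usage sketch for the function above; the imports reflect the libraries the snippet appears to rely on (langdetect, textblob, rutermextract) and are an assumption, since the listing does not show them:

from langdetect import detect
from textblob import TextBlob
from rutermextract import TermExtractor

cleaned_text, keywords = keywords_extraction('Съешь ещё этих мягких французских булок да выпей же чаю.')
print(keywords)  # comma-separated normalized key phrases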
Example No. 26
def tokenize(sentences):
    arr = []
    arr2 = []
    i = 0
    h = 1
    j = 0
    morph = pymorphy2.MorphAnalyzer()
    term_extractor = TermExtractor()
    words = nltk.word_tokenize(sentences)
    for term in term_extractor(sentences):
        arr.append(term.normalized)
    while i < len(words):
        n = morph.parse(words[i])[0]
        tagg = n.tag.POS
        if (tagg == 'NOUN') or (tagg == 'ADJF'):
            # inflect() can return None for some word forms; fall back to the normal form
            inflected = n.inflect({'sing', 'nomn'})
            norm = inflected.word if inflected else n.normal_form
        else:
            norm = n.normal_form
        h = 1
        while j < len(arr):
            if (norm in arr[j]) and (tagg != 'PREP') and (tagg != 'CONJ') and (tagg != 'INTJ'):
                arr2.append(arr[j])
                s = arr[j].split(' ')
                length = len(s)
                if (length > 1):
                    h = length
                else:
                    h = 1
            j += 1
        j = 0
        if tagg == 'VERB':
            arr2.append(words[i])
        i += h
    print("\n", 'Выделенные коллокации', "\n")
    print(arr2)
    return arr2
Example No. 27
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from natasha import NamesExtractor
from rutermextract import TermExtractor
import rutermextract
from stop_words import get_stop_words


term_ex = TermExtractor()
names_ex = NamesExtractor()
stop_words = get_stop_words('russian')


def sort_of_list_by_count(lst):
    d = {}
    for word in lst:
        d[word] = 1 if word not in d.keys() else d[word]+1
    sortedD = sorted(d.items(), key=lambda x: x[1], reverse=True)
    
    return [x[0] for x in sortedD]


def data_to_text(data):
    text_serie = data['text'].dropna()
    text_serie = text_serie.apply(lambda x: x.rstrip())
    text = text_serie.to_string()
    # text.lower()
    regex = re.compile('[^а-яА-Я]')
    text = regex.sub(' ', text)
Example No. 28
def make_desc_title(self, request, queryset):

    idf = dict()
    term_extractor = TermExtractor()

    for obj in queryset:
        #val1=strip_tags(obj.main)
        keyword = ''
        stattext = ''
        descripshion = ''
        descripshion_stat = ''
        mytags = ''
        count_tags = 0
        ttext = ''

        count_word = len(set(strip_tags(obj.main).split()))

        kw = []

        for term in term_extractor(strip_tags(obj.main),
                                   10,
                                   weight=lambda term: idf.get(
                                       term.normalized, 1.0) * term.count):
            stattext += term.normalized + '  :' + str(
                term.count) + ' тошнота- ' + str(
                    float('{:.2f}'.format(
                        term.count / count_word * 100 * 7))) + '%\n'
            kw.append(term.normalized)
            keyword += term.normalized + ','
            if count_tags < 5:
                mytags += term.normalized + ','
                count_tags = count_tags + 1

        text_page = list(
            sentenize(
                re.sub('\n', ".", (re.sub('\n+', "", strip_tags(obj.main))))))

        i = 0

        for t in text_page:
            #terms = TermExtractor()
            for term in term_extractor(t.text):

                if i < len(kw) and re.search(kw[i], term.normalized):

                    if len(kw) > i:
                        input1 = re.sub('\n+', " ", t.text)
                        input1 = re.sub(r'\[[0-9]*\]', "", input1)
                        input1 = re.sub('&nbsp;', " ", input1)
                        input1 = re.sub(' +', " ", input1)
                        descripshion += input1
                        descripshion_stat += descripshion + ' ( ' + kw[
                            i] + ' )  [' + str(len(kw) - i) + '] '
                        if t.text != '':
                            ttext += t.text + ' (-- ' + kw[i] + ' )\n'
                    i += 1
                    #if len(kw) > i   and  t.start > 1000 :
                    ##circle_txt(text_a[0:],W,F,i)
                    for t in text_page:
                        try:
                            if re.search(kw[i], term.normalized):
                                if len(kw) > i:
                                    descripshion += re.sub(' +', " ", t.text)
                                    descripshion_stat += descripshion + ' ( ' + kw[
                                        i] + ' )  [' + str(len(kw) - i) + '] '
                                    ttext = t.text
                                i += 1
                        except IndexError:
                            i = 1

        #obj.tags = TaggableManager(through=RuTaggedItem)
        obj.title = textwrap.shorten(re.sub('&nbsp;', "",
                                            strip_tags(obj.main)),
                                     width=150,
                                     placeholder="")
        obj.meta_description = textwrap.shorten(re.sub('&nbsp;', "",
                                                       descripshion),
                                                width=248,
                                                placeholder="")
        #obj.stattext='Всего слов: '+str(count_word)+'\n'+stattext+'\n--------------------\n'+descripshion_stat+'\n\n'+ttext
        obj.stattext='Всего слов: '+str(count_word)+'\n Титл:\n'+ \
        obj.title+'\n------------------------\nDescription:\n'+obj.meta_description+'\n-------------------- \n' +\
        stattext+'\n--------------------\n'+mytags+'\n\n'+ttext

        obj.meta_keywords = textwrap.shorten(keyword, width=248)
        #obj.save(commit=False)
        #obj.tags=mytags
        #obj.save_m2m(['tags'])
        #obj.save(update_fields=(['meta_keywords','title','meta_description','stattext']))
        obj.save(update_fields=([
            'title',
            'meta_keywords',
            'meta_description',
            'stattext',
        ]))

    self.message_user(request, "Заголовок страницы изменен " + str(count_word))
Example No. 29
def make_category_product(self, request, queryset):

    idf = dict()
    term_extractor = TermExtractor()

    for obj in queryset:
        #val1=strip_tags(obj.main)
        keyword = ''
        stattext = ''
        descripshion = ''
        descripshion_stat = ''
        mytags = ''
        count_tags = 0
        ttext = ''

        main = obj.description
        count_word = len(set(strip_tags(main).split()))

        kw = []

        for term in term_extractor(strip_tags(main),
                                   10,
                                   weight=lambda term: idf.get(
                                       term.normalized, 1.0) * term.count):
            stattext += term.normalized + '  :' + str(
                term.count) + ' тошнота- ' + str(
                    float('{:.2f}'.format(
                        term.count / count_word * 100 * 7))) + '%\n'
            kw.append(term.normalized)
            keyword += term.normalized + ','
            if count_tags < 5:
                mytags += term.normalized + ','
                count_tags = count_tags + 1

        text_page = list(
            sentenize(re.sub('\n', ".",
                             (re.sub('\n+', "", strip_tags(main))))))

        i = 0

        for t in text_page:
            #terms = TermExtractor()
            for term in term_extractor(t.text):

                if i < len(kw) and re.search(kw[i], term.normalized):

                    if len(kw) > i:
                        input1 = re.sub('\n+', " ", t.text)
                        input1 = re.sub(r'\[[0-9]*\]', "", input1)
                        input1 = re.sub('&nbsp;', " ", input1)
                        input1 = re.sub(' +', " ", input1)
                        descripshion += input1
                        descripshion_stat += descripshion + ' ( ' + kw[
                            i] + ' )  [' + str(len(kw) - i) + '] '
                        if t.text != '':
                            ttext += t.text + ' (-- ' + kw[i] + ' )\n'
                    i += 1
                    #if len(kw) > i   and  t.start > 1000 :
                    ##circle_txt(text_a[0:],W,F,i)
                    for t in text_page:
                        try:
                            if re.search(kw[i], term.normalized):
                                if len(kw) > i:
                                    descripshion += re.sub(' +', " ", t.text)
                                    descripshion_stat += descripshion + ' ( ' + kw[
                                        i] + ' )  [' + str(len(kw) - i) + '] '
                                    ttext = t.text
                                i += 1
                        except IndexError:
                            i = 1

        obj.title = textwrap.shorten(re.sub('&nbsp;', "", strip_tags(main)),
                                     width=150,
                                     placeholder="")
        obj.meta_description = textwrap.shorten(re.sub('&nbsp;', "",
                                                       descripshion),
                                                width=248,
                                                placeholder="")
        #obj.stattext='Всего слов: '+str(count_word)+'\n'+stattext+'\n--------------------\n'+descripshion_stat+'\n\n'+ttext
        obj.stattext='Всего слов: '+str(count_word)+'\n Титл:\n'+ \
        obj.title+'\n------------------------\nDescription:\n'+obj.meta_description+'\n-------------------- \n' +\
        stattext+'\n--------------------\n'+mytags+'\n\n'+ttext

        obj.meta_keywords = textwrap.shorten(keyword, width=248)

        obj.save(update_fields=(
            ['meta_keywords', 'title', 'meta_description', 'stattext']))
        #obj.save(update_fields=(['meta_keywords','meta_description','stattext',]))
        #obj.save(update_fields=(['stattext',]))

    self.message_user(request, "Заголовок страницы изменен " + str(count_word))


#def image_html_clean(body,size=''):
#soup = BeautifulSoup(str(body), "html.parser")

#whitelist = ['a','img']
#for tag in soup.find_all(True):
#if tag.name not in whitelist:
#tag.attrs = {}
#else:
#attrs = dict(tag.attrs)
#for attr in attrs:
#if attr not in ['src','href']:
#del tag.attrs[attr]

#for tag in soup.find_all('img'):
#attrs = dict(tag.attrs)
#for attr in attrs:
#if attr not in ['src','href']:
#del tag.attrs[attr]
#tag['class'] ='ui '+size+' floated image '

#return str(soup)
Example No. 30
import codecs
import os
import sys

import pymorphy2
from rutermextract import TermExtractor

max_terms = 20
# Check whether the text file exists in the directory (i.e. whether it was received)
if not os.path.isfile('text.txt'):
    print('text file not exist')
    sys.exit()
if not os.path.exists('term_out'):
    os.mkdir('term_out')
print(123)
morph = pymorphy2.MorphAnalyzer()
text_file = codecs.open('text.txt',encoding = 'utf-8', mode ='r')
text = text_file.read()
text = str(text)
text_file.close()
# Build the dictionary (thesaurus)
f = codecs.open(os.path.join('term_out', 'out.csv'), 'w', encoding='utf-8')
f.write(u'phrase,count,POS,case,number,gender,person,animacy,wordCount\n')
# Extract the key words and other parameters

term_extractor = TermExtractor()
for term in term_extractor(text, max_terms):
    f.write(term.normalized + ',' + str(term.count))
    if term.word_count == 1:
        t = morph.parse(term.normalized)[0]
        f.write(u',' + str(t.tag.POS) + ',' + str(t.tag.case) + ',' + str(t.tag.number) + ',' + str(t.tag.gender) + ',' + str(t.tag.person) + ',' + str(t.tag.animacy))
    else:
        f.write(u',phrase,None,None,None,None,None')
    f.write(u',' + str(term.word_count))  # append the word count
    f.write('\n')       
f.close()
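
A hedged variant of the CSV output above using the standard csv module, which quotes fields automatically so terms containing commas cannot break the file; it reuses term_extractor, morph, text and max_terms from the example, and everything else is an assumption:

import csv

with open(os.path.join('term_out', 'out.csv'), 'w', encoding='utf-8', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['phrase', 'count', 'POS', 'case', 'number', 'gender', 'person', 'animacy', 'wordCount'])
    for term in term_extractor(text, max_terms):
        if term.word_count == 1:
            t = morph.parse(term.normalized)[0]
            grammemes = [t.tag.POS, t.tag.case, t.tag.number, t.tag.gender, t.tag.person, t.tag.animacy]
        else:
            grammemes = ['phrase', None, None, None, None, None]
        writer.writerow([term.normalized, term.count] + [str(g) for g in grammemes] + [term.word_count])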