def getkeywords_key_bert(text):
    """Extract single-word keywords from *text* with KeyBERT.

    Loads a custom stop-word list from "stopwords.txt", then runs a
    distilbert-backed KeyBERT extractor with both Max-Sum and MMR
    candidate selection enabled. Prints the result before returning it.

    text: document to extract keywords from.
    return: list of (keyword, score) pairs as produced by KeyBERT.
    """
    custom_stops = stopwordslist("stopwords.txt")
    extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    extracted = extractor.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 1),
        stop_words=custom_stops,
        min_df=1,
        use_maxsum=True,
        use_mmr=True,
    )
    print("Keywords of article", extracted)
    return extracted
def extract_paper_keywords(input_csv, out_csv, keywordCount):
    """Extract KeyBERT keywords for every repository abstract in *input_csv*.

    Reads the CSV (expects 'nameWithOwner' and 'allTitleAndAbstract'
    columns), extracts up to *keywordCount* diverse single-word keywords
    per document, lemmatises them, and writes a two-column CSV
    ('nameWithOwner', 'content') to *out_csv*.

    input_csv: path to the input CSV file.
    out_csv: path of the CSV file to write.
    keywordCount: desired number of keywords per document.
    """
    list_dic = pd.read_csv(input_csv).to_dict(orient='list')
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    res = {}
    keyword_list_list = []
    total = len(list_dic['nameWithOwner'])
    for index, doc in enumerate(list_dic['allTitleAndAbstract']):
        print('-' * 100)
        print("index: " + str(index + 1) + "/" + str(total) + ", repo: " +
              str(list_dic['nameWithOwner'][index]))
        # MMR (Maximal Marginal Relevance) diversifies the cosine-similarity
        # based keywords; diversity=0.7 favours highly diverse results.
        tuple_list = model.extract_keywords(doc,
                                            keyphrase_ngram_range=(1, 1),
                                            stop_words='english',
                                            top_n=keywordCount,
                                            use_mmr=True,
                                            diversity=0.7)
        # If nothing came back, retry with progressively smaller top_n to
        # still collect as many keywords as possible.
        # BUGFIX: the original loop had no lower bound, so a persistently
        # empty result drove top_n negative and looped forever.
        tmp = keywordCount
        while len(tuple_list) == 0 and tmp > 5:
            tmp = tmp - 5
            tuple_list = model.extract_keywords(doc,
                                                keyphrase_ngram_range=(1, 1),
                                                stop_words='english',
                                                top_n=tmp,
                                                use_mmr=True,
                                                diversity=0.7)
        keyword_list_list.append([candidate[0] for candidate in tuple_list])
    pre_keyword_list_list = lemmatisation(keyword_list_list)
    res['nameWithOwner'] = list_dic['nameWithOwner']
    res['content'] = [' '.join(x) for x in pre_keyword_list_list]
    pd.DataFrame.from_dict(res, orient='columns').to_csv(out_csv, index=False)
def keybert_keyword_extractor(filename, keyphrase_range=(1, 2)):
    """Collect the unique KeyBERT keywords for each category in a JSON file.

    filename: path passed to load_as_json; records need 'category' and 'text'.
    keyphrase_range: ngram range forwarded to KeyBERT.
    return: dict mapping category -> list of unique keywords.
    """
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    records = load_as_json(filename)

    # Bucket the raw texts per category, initialising the result slots too.
    texts_by_category = dict()
    result_dict = dict()
    for record in records:
        cat = record['category']
        if cat not in texts_by_category:
            texts_by_category[cat] = []
            result_dict[cat] = []
        texts_by_category[cat].append(record['text'])

    # Extract keywords text-by-text and deduplicate within each category.
    for category, texts in texts_by_category.items():
        print(category)
        unique_words = set()
        for doc in texts:
            extracted = model.extract_keywords(
                doc,
                keyphrase_ngram_range=keyphrase_range,
                stop_words='english')
            for kw, _score in extracted:
                unique_words.add(kw)
        word_array = list(unique_words)
        print(len(word_array))
        result_dict[category] = word_array
    return result_dict
class NLP_Wuwana():
    def __init__(
        self,
        db,
        languages,
        weight_field,
        spacy_model,
        remove_words="./data/words_to_remove.txt",
        replace_words="./data/words_to_replace.txt",
        tags_alwaysmain="./data/finaltags_alwaysmain.txt",
        tags_toremove="./data/finaltags_toremove.txt",
        empha_words=False,
        empha_multi=1,
        desc_field="description",
        max_words=5,
    ):
        """Class defined to process wuwana description tags.
        It attacks db and uses 3 NLP Libraries:
        - Spacy as tokenizer.
        - Wordcloud as tag modeller.
        - Gensim as tag modeller.
        Parameters
        -----------
        languages: list with languages in format: ["es","fr","zh-cn"].
        remove_words: path to file of words to be removed.
        replace_words: path to file of words to be replaced.
        tags_alwaysmain: tags that will always be the main tag (just first ocurrence, in order of appeareance).
        tags_toremove: tags that will never appear.
        spacy_mode: pretrained Spacy model.
        max_words: max words to be extracted from description texts.
        desc_field: field where text is stored in company table.
        weight_field: Field in company table where weights will be stored.
        """
        # NOTE(review): the file handles opened below are never closed in
        # this class, and self.file_words is reused for three different
        # files; only the read contents are kept.
        #file with words to be removed from tags
        self.file_words = open(remove_words, "r", encoding="utf-8")
        self.remove_words = self.file_words.read().split(";")
        #file with words to emphasize
        self.empha_multi = empha_multi
        self.empha_words = empha_words
        if (empha_words):
            self.file_empha = open(empha_words, "r", encoding="utf-8")
            self.words_to_emphasize = self.file_empha.read().split(";")
        #tags to remove
        self.file_words = open(tags_toremove, "r", encoding="utf-8")
        self.tags_toremove = self.file_words.read().split(";")
        #tags always as main
        self.file_words = open(tags_alwaysmain, "r", encoding="utf-8")
        self.tags_alwaysmain = self.file_words.read().split(";")
        #bag of words that should be replaced, such as abbreviations
        with open(replace_words, "r", encoding="utf-8") as f_in:
            self.replace_words = json.load(f_in)
        self.translator = google_translator()
        self.db = db
        self.cursor_tag = self.db.cursor()
        self.max_words = max_words
        self.desc_field = desc_field
        self.languages = languages
        self.weight_field = weight_field
        # English pretrained Spacy model
        # NOTE(review): the `spacy_model` parameter is accepted but never
        # used; en_core_web_lg is hardcoded here — confirm intent.
        try:
            self.nlp = spacy.load("en_core_web_lg")
        except:
            sys.exit(
                "ERROR: You must download en_core_web_lg spacy model. Use 'python -m spacy download en_core_web_lg' "
            )

    ########
    #MAIN##
    ########
    def process_query_companies(self, lib, onlyid=False, column_pos=1):
        """ Function that launch a sql query and extract main tags from column
        Parameters
        -----------
        lib: NLP Lib to use (Gensim or Wordcloud)
        onlyid: Check if changes only applied to one company id
        column_pos: position of the first column to extract text
        """
        # NOTE(review): the queries below are built by string concatenation;
        # `onlyid` is interpolated unescaped (SQL injection risk if it ever
        # comes from untrusted input) — consider parameterized queries.
        if (onlyid):
            query = "select company.ID, company." + str(
                self.desc_field) + " from company where ID = '" + str(
                    onlyid) + "'"
        else:
            query = "select company.ID, company." + str(
                self.desc_field) + " from company "
        if ((lib != "gensim") & (lib != "wordcloud") & (lib != "keybert")):
            sys.exit("ERROR: Unknown library: " + str(lib))
        else:
            pass
        if (lib == "keybert"):
            self.model = KeyBERT('distilbert-base-nli-mean-tokens')
        updates_list = []
        try:
            # Execute the SQL command
            cursor = self.db.cursor()
            result = cursor.execute(query)
            self.db.commit()
        except Exception as e:
            # NOTE(review): if this fails, `result`/`cursor` are unbound and
            # the `if (result)` below raises NameError — confirm intended.
            print("ERROR LOADING DB ", str(e))
            pass
        n = 0  # NOTE(review): unused.
        if (result):
            rows = cursor.fetchall()
            print("Processing " + str(result) + " companies.")
            for row in rows:
                try:
                    text = row[column_pos]
                    # Clean/translate the description, then extract keywords.
                    nouns_ex = self.process_text(text, lib)
                    tags_english = self.get_keywords(nouns_ex,
                                                     self.max_words,
                                                     lib=lib)
                    if (tags_english):
                        tags_main = dict()
                        tags_all = dict()
                        tags_english_split = tags_english[1].split(";")
                        #remove predefined tags. insert main tags. (from file)
                        tags_english_split = self.remove_finaltags(
                            tags_english_split)
                        tags_english_split = self.put_maintags(
                            tags_english_split)
                        # Translate the main tag and the full tag list into
                        # every configured language.
                        for l in self.languages:
                            tags_main[l] = self.get_first_text(
                                self.get_translation(tags_english[0], lang=l))
                            tags_all[l] = self.get_first_text(
                                self.get_translation(tags_english[1], lang=l))
                        tag_list = []
                        main_tag = False
                        second_tag = False
                        other_tags = False
                        # Slot tags into main/second/other and register each
                        # tag (with its translations) in the tag table.
                        for x in range(0, len(tags_english_split)):
                            if (len(tags_english_split[x]) > 0):
                                for s in tags_all.keys():
                                    try:
                                        tag_split = tags_all[s].split(
                                            ";")[x].strip()
                                    except:
                                        tag_split = "-"
                                    tag_list.append(tag_split)
                                self.check_and_insert_tag(
                                    tags_english_split[x], tag_list)
                                tag_list = []
                                if (x == 0):
                                    main_tag = tags_english_split[x].strip()
                                elif (x == 1):
                                    second_tag = tags_english_split[x].strip()
                                elif (x == 2):
                                    other_tags = tags_english_split[x].strip()
                                else:
                                    other_tags += ";" + tags_english_split[
                                        x].strip()
                        updates_list.append(
                            self.update_company_tags(main_tag, row[0],
                                                     tags_english[2],
                                                     second_tag, other_tags))
                    else:
                        print("WARNING: No tags extracted for ID", row[0],
                              "with text:", text)
                except Exception as e:
                    print("ERROR Processing query row: ", str(e))
        else:
            print("WARNING: NO rows for that ID.")
        #update tags - execute querys
        for i in updates_list:
            try:
                cursor.execute(i)
                self.db.commit()
            except Exception as e:
                print("Error ", str(e))

    def process_text(self, text, lib):
        """ Function that processes a text with a pipeline of tasks, and returns transformed and cleaned text to be used by NLP libs
        Parameters
        -----------
        text: text to extract tags
        lib: NLP library to be used afterwards (wordcloud or gensim)
        return: cleaned and transformed text
        """
        #remove hastags, mentions, and links. Comment this line to let hastags and metions appear.
        text = self.strip_all_entities(self.strip_links(text))
        #remove special chars.
        text = self.remove_special_characters(text)
        #remove emojis.
        text = self.remove_emojis(text)
        #detect source lang and translate to english if necessary.
        source_lang = self.detect_lang(text)
        #print("ORIG TEXT:", text)
        if source_lang:
            if source_lang != 'en':
                text = self.get_translation(text)
        else:
            print(
                "WARNING: No specific language detected. Translating sentences (slow)"
            )
            text = self.translate_sentence_by_sentence(text)
        #to lowercase.
        text = text.lower()
        #emphasize words if required. It repeats certain words in text (from file).
        if (self.empha_words):
            text = self.emphasize_words(text)
        # Spacy model and custom tokenizer
        self.nlp.tokenizer = self.custom_tokenizer()
        sentence = ''
        # Extract sentences
        text_lines = text.split(".")
        if (lib == "wordcloud"):
            #get nouns longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ['NOUN']) & (len(word.text) > 1)):
                    sentence += word.text + ' '
            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words and lemmatize
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            #last nouns filter
            fin_sent = ''
            for word in self.nlp(sentence):
                if word.pos_ in ["NOUN"]:
                    fin_sent += word.text + ' '
        elif ((lib == "gensim")):
            #get nouns and adjetives longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                    sentence += word.text + ' '
            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence
        elif ((lib == "keybert")):
            # For keybert, keep sentence boundaries: filter each sentence to
            # its nouns/adjectives and re-join with ". ".
            new_lines = []
            for line in text_lines:
                new_line = []
                #get nouns and adjetives longer than 1 char
                for word in self.nlp(line):
                    if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                        new_line.append(word.text)
                new_lines.append(" ".join(new_line))
            sentence = ". ".join(new_lines)
            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence
            #print("SENTENCE:",sentence)
        else:
            sys.exit("ERROR: LIB NOT FOUND: " + str(lib))
        return fin_sent

    ########
    #MYSQL##
    ########
    def update_company_tags(self,
                            first_tag,
                            idcomp,
                            weights,
                            second_tag=False,
                            other_tags=False):
        """Creates SQL Query for update the tag table
        Parameters
        -----------
        first_tag: main tag
        idcomp: id of company
        weights: weights of every tag
        second_tag: the second tag most relevant
        other_tags: rest of tags
        return: sql query to update
        """
        weights = self.get_weight_string(weights)
        print("\nID:", idcomp, "\nFIRST:", first_tag, "\nSECOND:", second_tag,
              "\nOTHERS:", other_tags, "\nWEIGHTS:", weights)
        # NOTE(review): queries are format-built, not parameterized.
        if (other_tags):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '{2}', {5} = '{4}' where ID = {3}".format(
                first_tag, second_tag, other_tags, idcomp, weights,
                self.weight_field)
        elif (second_tag):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '', {4} = '{3}' where ID = {2}".format(
                first_tag, second_tag, idcomp, weights, self.weight_field)
        else:
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='', OtherTags = '', {3} = '{2}' where ID = {1}".format(
                first_tag, idcomp, weights, self.weight_field)
        return (sql_upd)

    def check_and_insert_tag(self, eng_tag, tags):
        """Checks if tag exists in table tag and creates if not
        Parameters
        -----------
        eng_tag: Tag in english
        tags: Rest languages tags
        return: main tag
        """
        tag_compo = ""
        for i in tags:
            tag_compo += i + ";"
        try:
            sql_tag = "Select * from tag where ID = '{0}'".format(eng_tag)
            count = self.cursor_tag.execute(sql_tag)
            if (count == 0):
                #not exists
                sql_tag = "Insert into tag (ID, Names) values ('{0}', '{1}') ".format(
                    eng_tag.lower().replace("'", ""),
                    tag_compo.lower().replace("'", ""))
                self.cursor_tag.execute(sql_tag)
                self.db.commit()
            return eng_tag
        except Exception as e:
            print("ERROR: check_and_insert_tag ", str(e))

    ##########
    ###NLP####
    ##########
    def detect_lang(self, text):
        """ Function that detects the language of a text
        Parameters
        -----------
        text: Text to be detected
        return: lang detected
        """
        try:
            lang = self.translator.detect(text)[0]
            return lang
        except:
            print("WARNING: No language detected in text")
            return False

    def get_translation(self, text, lang="en"):
        """ Function that translate text to english
        Parameters
        -----------
        text: Text to be translated
        return: translated text
        """
        max_len = 4900  #library limit 5000
        if (len(text) > max_len):
            # NOTE(review): this branch only re-concatenates the chunks —
            # the per-chunk translate call is commented out, so texts longer
            # than max_len are returned untranslated. Confirm intent.
            sub_text = ""
            for i in range(0, math.ceil(len(text) / max_len)):
                start = i * max_len
                end = (i + 1) * (max_len)
                sub_text += text[
                    start:
                    end]  #translator.translate(text[start:end], lang_tgt='en')
            text = sub_text
        else:
            text = self.translator.translate(text, lang_tgt=lang)
        # Commas would break the ';'-separated tag lists, so replace them.
        if (isinstance(text, list)):
            text = text[0].replace(",", ";")
        else:
            text = text.replace(",", ";")
        time.sleep(0.5)  #1 second delay in order to avoid ip blocking
        return text

    def translate_sentence_by_sentence(self, text):
        """ Function that translate sentece by sentence a string to english. Separated by '.'
        Parameters
        -----------
        text: Text to be translated
        return: translated text
        """
        sub_text = ""
        sentences = text.split(".")
        for s in sentences:
            sub_text += self.translator.translate(s, lang_tgt='en')
        return sub_text

    def replace_dict(self, sentence):
        """ Function that replace words in a sentence according to a dictionary or words (replace_words)
        Parameters
        -----------
        sentence: Text to be modified
        return: cleaned text
        """
        sentence = sentence.lower()  # convert to lower case
        for word, abbr in self.replace_words.items():
            sentence = sentence.replace(word.lower(), abbr)
        return sentence

    def remove_common(self, sentence):
        """ Function that remove words in a sentence according to a dictionary or words (remove_words)
        Parameters
        -----------
        sentence: Text to be modified
        return: cleaned text
        """
        final_sentence = ""
        stops = [" ", ".", ",", "-", ";"]
        # common_words to remove
        for word in sentence.split(" "):
            tmp = word.lower()
            # Strip separator chars before comparing against the stop list.
            for i in stops:
                tmp = tmp.replace(i, "")
            if tmp not in self.remove_words:
                final_sentence += word.lower() + " "
        return final_sentence

    def lemmatize(self, sentence):
        """ Function that extract lemmas from sentence
        Parameters
        -----------
        sentence: Text to be analysed
        return: transformed text
        """
        self.nlp.tokenizer = self.custom_tokenizer()
        final_sentence = ''
        # common_words to remove
        for word in self.nlp(sentence):
            final_sentence += word.lemma_.lower() + ' '
        return final_sentence

    def get_weight_string(self, weights):
        """ Function that transform weight object to string.
        Parameters
        -----------
        weights: Weight object returned by nlp
        return: weight transformed to string
        """
        if (isinstance(weights, dict)):
            #gensim
            weights = json.dumps(weights).replace("'", "")
        elif (isinstance(weights, list)):
            #wordcloud
            weights = ', '.join(str(e).replace(",", ":")
                                for e in weights).replace("'", '"').replace(
                                    "(", '').replace(")", '')
            weights = "{" + weights + "}"
        return weights

    def custom_tokenizer(self):
        """ Function that defines a tokenizer in order to be used
        Parameters
        -----------
        nlp: spacy loaded object
        return: prepared tokenizer
        """
        # Same as spaCy's default infixes except hyphen splitting is
        # disabled (the HYPHENS rule is commented out below).
        infixes = (
            LIST_ELLIPSES + LIST_ICONS + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ])
        infix_re = compile_infix_regex(infixes)
        return Tokenizer(self.nlp.vocab,
                         prefix_search=self.nlp.tokenizer.prefix_search,
                         suffix_search=self.nlp.tokenizer.suffix_search,
                         infix_finditer=infix_re.finditer,
                         token_match=self.nlp.tokenizer.token_match,
                         rules=self.nlp.Defaults.tokenizer_exceptions)

    def remove_special_characters(self, text):
        """ Function that removes special characters from a text
        Parameters
        -----------
        text: text to be modified
        return: cleaned text
        """
        bad_chars = [';', ':', '!', "*", "¿", "?", "¡"]
        for i in bad_chars:
            text = text.replace(i, ' ')
        return text

    def remove_emojis(self, text):
        """ Function that removes emojis from a text
        Parameters
        -----------
        text: text to be modified
        return: cleaned text
        """
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def emphasize_words(self, text):
        """ Function that repeats emphasize_words if found in text
        text: text to find words and modify.
        """
        # Appending a word empha_multi times boosts its frequency for the
        # downstream frequency-based extractors.
        for i in self.words_to_emphasize:
            if i.lower() in text.lower():
                if (len(i) > 0):
                    for x in range(0, self.empha_multi):
                        text += ". " + i.lower()
        return text

    def get_keywords(self, words, amount=3, lib="wordcloud", sep=";"):
        """ Function that extract main keywords from processed text
        Parameters
        -----------
        words: bag of words to extract tags
        amount: amount of of words to be extracted. 3 max words for gensim
        lib: lib to be used - gensim, wordcloud, keybert
        sep: separator for returned words
        return: main tag, list with all tags, weighted tags
        """
        if (len(words) > 0):
            if (lib == "gensim"):
                tmp = keywords(words, words=min(amount, 3), split=True)
                info = keywords(words, words=min(amount, 3), scores=True)
                if (tmp):
                    return tmp[0], sep.join(tmp), info
                else:
                    return False
            elif (lib == "wordcloud"):
                listw = ""
                wcloud = wordcloud.WordCloud().generate(words)
                n = 0
                if (wcloud.words_):
                    for i in wcloud.words_:
                        if (n == 0):
                            main = i
                            listw += i + sep
                        else:
                            if (n < amount):
                                listw += i + sep
                        n += 1
                    return main, listw, wcloud.words_
                else:
                    return False
            elif (lib == "keybert"):
                # NOTE(review): keyphrase_ngram_range=(0, 2) has a zero
                # lower bound, and sep.join(tags) assumes extract_keywords
                # returns plain strings (recent KeyBERT returns
                # (keyword, score) tuples) — confirm the pinned version.
                tags = self.model.extract_keywords(words,
                                                   keyphrase_ngram_range=(0,
                                                                          2),
                                                   stop_words='english',
                                                   use_mmr=True,
                                                   diversity=0.2,
                                                   top_n=amount)
                if (len(tags) > 0):
                    return tags[0], sep.join(tags), ""
                else:
                    return "", "", ""
        else:
            #print("Warning: No words to extract tags: ", words)
            return False

    def strip_links(self, text):
        """ Removes urls from text
        Parameters
        -----------
        text: String to remove urls
        return: cleaned text
        """
        link_regex = re.compile(
            '((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)',
            re.DOTALL)
        links = re.findall(link_regex, text)
        for link in links:
            text = text.replace(link[0], ', ')
        return text

    def strip_all_entities(self, text):
        """ Removes rrss hastags and mentions from text
        Parameters
        -----------
        text: String to remove hastags
        return: cleaned text
        """
        entity_prefixes = ['@', '#']
        for separator in string.punctuation:
            if separator not in entity_prefixes:
                text = text.replace(separator, ' ')
        words = []
        for word in text.split():
            word = word.strip()
            if word:
                if word[0] not in entity_prefixes:
                    words.append(word)
        return ' '.join(words)

    def get_first_text(self, obj):
        """Extracts from abn objet:
        - first occurrence if array
        - text if string
        Parameters
        -----------
        obj: object to extract text (array or str)
        return: first ocurrence
        """
        if (isinstance(obj, list)):
            if (isinstance(obj[0], list)):
                # NOTE(review): this branch discards the stripped value and
                # falls through returning None — a `return` is likely
                # missing here; confirm against callers.
                obj[0][0].strip()
            else:
                return obj[0].strip()
        else:
            return obj.strip()

    def remove_finaltags(self, tags):
        """Remove tags from final processing
        Parameters
        -----------
        tags: list to be cleaned
        return: cleaned tag list
        """
        tmp_list = []
        for i in tags:
            if i not in self.tags_toremove:
                tmp_list.append(i)
        return tmp_list

    def put_maintags(self, tags):
        """Pririze some tags as main tag
        Parameters
        -----------
        tags: list to be modified
        return: modified tag list
        """
        # Only the first matching always-main tag is promoted (the early
        # return below stops after the first hit, per the class docstring).
        for i in self.tags_alwaysmain:
            if (i in tags):
                pos = (tags.index(i))
                tmp = tags[0]
                tags[pos] = tmp
                tags[0] = i
                return tags
        return tags
# Demo script: multilingual KeyBERT keyword extraction on a Portuguese
# Wikipedia-style paragraph about machine learning.
from keybert import KeyBERT

doc = """O aprendizado automático (português brasileiro) ou a aprendizagem automática (português europeu) ou também aprendizado de máquina (português brasileiro) ou aprendizagem de máquina (português europeu) (em inglês: machine learning) é um subcampo da Engenharia e da ciência da computação que evoluiu do estudo de reconhecimento de padrões e da teoria do aprendizado computacional em inteligência artificial[1]. Em 1959, Arthur Samuel definiu aprendizado de máquina como o "campo de estudo que dá aos computadores a habilidade de aprender sem serem explicitamente programados"[2](livre tradução). O aprendizado automático explora o estudo e construção de algoritmos que podem aprender de seus erros e fazer previsões sobre dados[3]. Tais algoritmos operam construindo um modelo a partir de inputs amostrais a fim de fazer previsões ou decisões guiadas pelos dados ao invés de simplesmente seguindo inflexíveis e estáticas instruções programadas. Enquanto que na inteligência artificial existem dois tipos de raciocínio (o indutivo, que extrai regras e padrões de grandes conjuntos de dados, e o dedutivo), o aprendizado de máquina só se preocupa com o indutivo."""

# Alternative multilingual embedding models that were tried:
#paraphrase-xlm-r-multilingual-v1
#bert-base-multilingual-cased
model = KeyBERT('bert-base-multilingual-cased')
keywords = model.extract_keywords(doc)
# NOTE(review): the two calls below recompute keywords with explicit ngram
# ranges but their results are not assigned — presumably exploratory
# (notebook-style) calls; the return values are discarded.
model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)
model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
# NOTE(review): this fragment is the tail of a definition that starts before
# this chunk (`el`, `dict_`, `key`, `value` are bound there) — indentation
# reconstructed; confirm against the full file.
if el == 'error':
    dict_.update({key: [value]})
else:
    el.append(value)


def common_words(key_words: list) -> list:
    # Keep only the extracted keywords that also occur in the (module-level)
    # `data` collection.
    return [el for el in key_words if el in data]


# Tokenize the source text into sentences and accumulate KeyBERT keywords
# (1- and 2-word phrases, Max-Sum selection) into the `key_words` set.
with open(PATH_TO_TEXT, 'r') as code_book:
    for sent_index, sent in enumerate(sent_tokenize(code_book.read())):
        try:
            n1 = model.extract_keywords(sent.lower(),
                                        keyphrase_length=1,
                                        use_maxsum=True)
            n2 = model.extract_keywords(sent.lower(),
                                        keyphrase_length=2,
                                        use_maxsum=True)
            key_words = key_words.union(set(n1 + n2))
        except Exception as e:
            e = e  # NOTE(review): no-op; exception is deliberately ignored.
            # Fallback handler for sentences KeyBERT could not process.
            # NOTE(review): placement inside the except block is an
            # assumption from the mangled source — confirm.
            main(sent, sent_index)

with open(PATH_TO_SAVE, 'w') as fp:
    json.dump(result, fp)

bert_words = common_words(key_words=key_words)
# %% start = time.time() logging.info('Getting Happy Elements...') query_hm = args.query_hm top_n = args.num_he df = pd.read_csv(args.hm_file) df = df[df['cleaned_up_hm'] != query_hm].sample(args.num_hms) all_hms = [query_hm, *list(df['cleaned_up_hm'].values)] df = pd.DataFrame({'happy_moment': all_hms}) happy_elements = kb_model.extract_keywords(all_hms, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=1, top_n=1) input_events = [he[0] for hes in happy_elements for he in hes] df['happy_element'] = input_events logging.info('Total Time Used: ' + str(time.time() - start) + "s") # %% [markdown] # input happy elements to conceptnet and get the outputs # %% relation = ['AtLocation', 'HasPrerequisite', 'Desires', 'UsedFor'] sampling_algorithm = args.sampling_algorithm
class KeyBertExtractor():
    """KeyBERT-backed keyphrase extractor that maps each extracted candidate
    back to its exact surface form in the original text.
    https://github.com/MaartenGr/KeyBERT"""

    #TODO there really are many many configs and I think changing these changes a great deal! see https://github.com/MaartenGr/KeyBERT and try out stuff!!
    #TODO there is a minimum-frequency-argument!! https://github.com/MaartenGr/KeyBERT/blob/master/keybert/_model.py#L83-L101
    #TODO does this use the phrase_in_text function? SHOULD IT?

    def __init__(self, is_multilan, faster=False, max_ngram=1):
        """available models: https://github.com/MaartenGr/KeyBERT#25-embedding-models"""
        from keybert import KeyBERT  #lazily loaded as it needs tensorflow which takes some time to init
        # `faster` and `is_multilan` are mutually exclusive model choices.
        assert not (is_multilan and faster)
        if faster:
            self.model_name = "paraphrase-MiniLM-L6-v2"
        elif is_multilan:
            self.model_name = "paraphrase-multilingual-MiniLM-L12-v2"
        else:
            self.model_name = "paraphrase-mpnet-base-v2"
        print(f"Using model {self.model_name}")
        self.kw_model = KeyBERT(self.model_name)
        self.max_ngram = max_ngram

    def _fix_hyphenated(self, cand, comparedtext):
        # it may be the case that the candiate is something like "particle systems", however the text only has "many-particle systems".
        # if so, then `(not phrase_in_text(cand, without_stops)) and cand in without_stops == True`
        # Returns the candidate extended to cover the hyphenated word it is
        # part of, or the sentinel string "NOPE" if no case matches.
        words_before_onset = comparedtext[:comparedtext.find(cand)].count(" ")
        chars_before_onset = len(" ".join(
            comparedtext.split(" ")[:words_before_onset]))
        if chars_before_onset > 0 and chars_before_onset + 1 != comparedtext.find(
                cand):
            # then the first word is hyphenated
            return comparedtext[chars_before_onset +
                                1:comparedtext.find(cand) + len(cand)]
        elif words_before_onset == 0 and bool(
                re.fullmatch(WORD_NUM_REGEX,
                             comparedtext[:comparedtext.find(cand)])):
            return comparedtext[:comparedtext.find(cand) + len(cand)]
        else:
            # then not the first word is hyphenated, but the last
            chars_after_hyphen = comparedtext[comparedtext.find(cand) +
                                              len(cand):].find(" ")
            if chars_after_hyphen > 0:
                return comparedtext[comparedtext.find(cand):comparedtext.
                                    find(cand) + len(cand) +
                                    chars_after_hyphen]
            elif re.fullmatch(
                    WORD_NUM_REGEX,
                    comparedtext[comparedtext.find(cand) + len(cand):]):
                return comparedtext[comparedtext.find(cand):]
        print("hm?!")
        return "NOPE"

    def extract_candidate(self, cand, text, without_stops, inds_without_stops,
                          only_words, inds_only_words):
        # Tries, in order: fixing hyphenation against the stop-word-free
        # text, re-expanding via token indices, the words-only text, and
        # finally a punctuation-insensitive character-level match.
        # Returns the in-text phrase, or None (implicitly) on failure.
        #TODO not sure if this version can also correct hyphenated stuff like the old one ARGH!!
        if (not phrase_in_text(cand, without_stops)) and cand in without_stops:
            cand = self._fix_hyphenated(cand, without_stops)
            if phrase_in_text(cand, text):
                #maybe we're already done here
                return cand
        #now the cand is fixed and you can continue to checking phrase_in_text
        if phrase_in_text(cand, without_stops):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = without_stops.find(cand)
            start_ind = without_stops[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + without_stops[startpos:stoppos].count(" ")
            # Map the stop-word-free span back onto the full token stream.
            actual_phrase = " ".join(tokenized_with_stops[
                inds_without_stops[start_ind]:inds_without_stops[stop_ind] +
                1])
            if phrase_in_text(actual_phrase, text):
                if actual_phrase.split(" ")[0] == cand.split(
                        " ")[0] and actual_phrase.split(" ")[-1] == cand.split(
                            " ")[-1]:
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                else:
                    print()
                    return
            print()
            return
        if (not phrase_in_text(cand, only_words)) and cand in only_words:
            cand = self._fix_hyphenated(cand, only_words)
        #now the cand is fixed and you can continue to checking phrase_in_text
        if phrase_in_text(cand, only_words):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = only_words.find(cand)
            start_ind = only_words[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + only_words[startpos:stoppos].count(" ")
            actual_phrase = " ".join(tokenized_with_stops[
                inds_only_words[start_ind]:inds_only_words[stop_ind] + 1])
            if any(i in actual_phrase[:-1] for i in list("?!") + ['"']):
                #if the phrase is not an actual phrase but split by punctuation
                print(
                    f"{cand} is not an actual phrase - in the text it is `{actual_phrase}`"
                )
                return None
            if phrase_in_text(actual_phrase, text):
                if actual_phrase.split(" ")[0] == cand.split(
                        " ")[0] and actual_phrase.split(" ")[-1] == cand.split(
                            " ")[-1]:
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                else:
                    print()
                    return
            print()
            return
        if cand in without_stops:
            print("In without_stops")
            return
        if cand in only_words:
            print("in only_words")
            return
        #another thing: cand is "internship self organization", but in the text it's "internship self-organization". Maybe remove everything but letters and then re-apply?
        c2 = re.sub(re.compile(r'[\W\d]', re.U), "|", cand)
        t2 = re.sub(re.compile(r'[\W\d]', re.U), "|", text).lower()
        if c2 in t2:
            cand = text[t2.find(c2):t2.find(c2) + len(c2)]
            if phrase_in_text(cand, text):
                return cand
            else:
                print("whatever.")
        w2 = re.sub(re.compile(r'[\W\d]', re.U), "|", without_stops)
        if c2 in w2:
            cand = without_stops[w2.find(c2):w2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops,
                                          inds_without_stops, only_words,
                                          inds_only_words)
        o2 = re.sub(re.compile(r'[\W\d]', re.U), "|", only_words)
        if c2 in o2:
            cand = only_words[o2.find(c2):o2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops,
                                          inds_without_stops, only_words,
                                          inds_only_words)
        print(f"This does not work: {cand}")

    def __call__(self, text, lang="en"):
        #TODO lang shouldn't be en!!!
        """see scripts/notebooks/proof_of_concept/proofofconcept_keyBERT.ipynb for why this is like this"""
        #TODO so extract_keywords can be passed a `vectorizer`, and that is by default Sklearn's CountVectorizer.
        # You can ALSO pass `candidates`, "to use instead of extracting them from the document(s)"!!!
        # Put a breakpoint in /home/chris/.local/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:395 for details
        # TODO also why do I get this ^ warning ("Your stop_words may be inconsistent with your preprocessing") ??
        # Does KeyBERT need already preprocessed descriptions?! if so, how much preprocessed, and how do I know this??!
        stopwords = get_stopwords(lang)
        candidates = set()
        # NOTE(review): range(1, self.max_ngram) excludes max_ngram itself —
        # with the default max_ngram=1 this loop never runs and no
        # candidates are extracted. Suspected off-by-one; confirm.
        for nwords in range(1, self.max_ngram):
            n_candidates = self.kw_model.extract_keywords(
                text, keyphrase_ngram_range=(1, nwords), stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)
        #TODO: what if there are special chars in the candidates? is everything ok then with the word-splitting?
        #TODO does this work for numbers?!
        inds_without_stops, without_stops = tokenize_text(text, stopwords)
        ind_word_list = [
            (ind, word)
            for ind, word in zip(inds_without_stops, without_stops)
            if WORD_NUM_REGEX.fullmatch(word)
        ]
        inds_only_words, only_words = list(zip(
            *ind_word_list)) if ind_word_list else ([], [])
        without_stops = " ".join(without_stops)
        only_words = " ".join(only_words)
        actual_keyphrases = []
        used_candidates = []
        n_immediateworking = n_fixed = n_errs = 0
        for cand in candidates:
            # if not all(WORD_REGEX.fullmatch(i) for i in cand.split(" ")):
            #     print(f"The candidate `{cand}` is not purely textual!")
            if phrase_in_text(cand, text):
                actual_keyphrases.append(cand)
                used_candidates.append(cand)
                n_immediateworking += 1
            else:
                intextcand = self.extract_candidate(cand, text, without_stops,
                                                    inds_without_stops,
                                                    only_words,
                                                    inds_only_words)
                #TODO wenn in candidate ne zahl oder so ist die entfernen und es neu versuchen
                if intextcand:
                    if phrase_in_text(intextcand, text):
                        actual_keyphrases.append(intextcand)
                        used_candidates.append(cand)
                        n_fixed += 1
                        continue
                    else:
                        print(
                            "The extracted candidate is STILL not in the text!"
                        )
                n_errs += 1
        return actual_keyphrases, used_candidates, (n_immediateworking,
                                                    n_fixed, n_errs)
def train(self, documents, **kwargs):
    """Extract the training keyword pool from *documents* with KeyBERT.

    Joins all documents into one text, extracts 1-5 gram keyphrases, and
    keeps the first `self.total_keywords_in_training` of them in
    `self.the_total_keywords`.

    documents: iterable of document strings.
    kwargs: supports 'stop_words' (default 'english'), forwarded to KeyBERT.
    """
    extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    stop_words = kwargs.get('stop_words', 'english')
    # BUGFIX: stop_words was read from kwargs but never passed to
    # extract_keywords (the sibling set_keyword_score_list does pass it).
    self.the_total_keywords = extractor.extract_keywords(
        ' '.join(documents),
        keyphrase_ngram_range=(1, 5),
        stop_words=stop_words)[:self.total_keywords_in_training]
# Baseline 1: TopicRank keyword extraction on news vs. tweets, scored by
# keyword frequency in the news text.
t_keys = topic_rank_kw_extraction(
    os.path.join(TEMP_DIR, "news_temp.txt"), news)
t_kw_freq = nr_keywords_in_text(t_keys, news, news)
tw_keys = topic_rank_kw_extraction(
    os.path.join(TEMP_DIR, "news_temp.txt"), tweets)
tw_kw_freq = nr_keywords_in_text(tw_keys, news, tweets)
evaluate_baseline(t_kw_freq,
                  tw_kw_freq,
                  fill_results_dict=True,
                  base_name="TopicRank")
# Pre-trained KeyBert
# Baseline 2: same evaluation with KeyBERT 1-2 gram keywords (scores are
# dropped; only the keyword strings are kept).
topic_keywords = key_bert.extract_keywords(news,
                                           top_n=TOP_N_KEYWORDS,
                                           keyphrase_ngram_range=(1, 2))
t_keys = [kw for kw, _ in topic_keywords]
t_kw_freq = nr_keywords_in_text(t_keys, news, news)
tweets_keywords = key_bert.extract_keywords(tweets,
                                            top_n=TOP_N_KEYWORDS,
                                            keyphrase_ngram_range=(1, 2))
tw_keys = [kw for kw, score in tweets_keywords]
tw_kw_freq = nr_keywords_in_text(tw_keys, news, tweets)
evaluate_baseline(t_kw_freq,
                  tw_kw_freq,
                  fill_results_dict=True,
                  base_name="KeyBert")
def set_keyword_score_list(self, **kwargs):
    """Populate self._keyword_score_list with KeyBERT (keyword, score) pairs.

    Extracts 1-4 gram keyphrases from self._document, honouring an
    optional 'stop_words' kwarg (default 'english'), and truncates the
    result to len(self._document) entries.
    """
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    chosen_stop_words = kwargs.get('stop_words', 'english')
    scored_keywords = model.extract_keywords(
        self._document,
        keyphrase_ngram_range=(1, 4),
        stop_words=chosen_stop_words)
    self._keyword_score_list = scored_keywords[:len(self._document)]
d = json.loads(line) texts.append("\n ".join([porterStem(p) for p in d['targetParagraphs']])) titles.append(d['targetTitle']) labels.append(dictLabels[d['id']]) index += 1 if (index == 10): break sim = scipy.spatial.distance.cosine n = 2 k = len(titles[n].split()) print(titles[n], "\nClickbait average: ", labels[n]) keyWords = " ".join(model.extract_keywords(texts[n], top_n=k)) u = vectorizer.encode(titles[n]) v = vectorizer.encode(keyWords) print(keyWords, sim(u, v)) keyWords = " ".join(model.extract_keywords(texts[n], use_maxsum=True, top_n=k)) v = vectorizer.encode(keyWords) print(keyWords, sim(u, v)) keyWords = " ".join( model.extract_keywords(texts[n], use_mmr=True, diversity=0.2, top_n=k)) v = vectorizer.encode(keyWords) print(keyWords, sim(u, v)) keyWords = " ".join( model.extract_keywords(texts[n], use_mmr=True, use_maxsum=True, top_n=k))