Example #1
def createListOfFrequentTerms(df, column, max_ngram_size=2, numOfKeywords=40):
    concat_string = ''

    for val in df[column]:
        if is_nan(val) or val is None or val == "":
            continue
        concat_string = concat_string + ', ' + str(val)

    # text = """spaCy is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython. The library is published under the MIT license and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
    text = concat_string
    language = "en"
    deduplication_threshold = 0.9
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)

    sorted_list = []
    for kw in keywords:
        sorted_list.append(kw[0])
        print(kw)
    return sorted_list
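A minimal usage sketch for the function above. The DataFrame, column name, and the is_nan helper (which the function calls but does not define) are assumptions:

import pandas as pd
import yake

def is_nan(val):
    # assumed helper: NaN is the only value that is not equal to itself
    return isinstance(val, float) and val != val

df = pd.DataFrame({"notes": ["keyword extraction with YAKE", None,
                             "unsupervised keyword extraction"]})
print(createListOfFrequentTerms(df, "notes", max_ngram_size=2, numOfKeywords=10))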
Example #2
    def extract_keywords(self):
        content = self.get_cleantext(self.scrape_submission())
        kwextractor1 = yake.KeywordExtractor(n=1)
        keywords1 = kwextractor1.extract_keywords(content)
        kwextractor2 = yake.KeywordExtractor(n=2)
        keywords2 = kwextractor2.extract_keywords(content)
        final = keywords1[:5] + keywords2[:5]
        return final
Example #3
def extractKeyWords(transcript):
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    numOfKeywords = 20
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(transcript)
    return keywords
Example #4
def getKeywords(text):
    #text= '''Essentially Marvel's flagship TV show, Agents of S.H.I.E.L.D. launched in 2013 and has already been renewed for an (abbreviated) seventh season for 2019-2020. Clark Gregg starred as Agent Coulson, with a two-season-long plot exploring the mystery of just how Coulson was resurrected after his death in The Avengers. Little by little, though, Chloe Bennet has become the series star, S.H.I.E.L.D.'s very own superhero, an Inhuman with the potential to literally tear the Earth apart. The rest of the cast is stellar, and each character has the kind of nuance and depth that's only possible when an actor really inhabits their role. The show's greatest strength is the fact that it can essentially be anything it wants to be; S.H.I.E.L.D. can plunge into a supernatural thriller alongside a new version of Ghost Rider, or be trapped in a dystopian future timeline in a hard sci-fi plot. Over the years, it's developed a mythology all of its own, one that allows it to stand a little bit more separate to the movies nowadays.'''
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 5

    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    output = []
    for key in keywords:
        output.append(key[0])
    # th = TextHighlighter(max_ngram_size = 3, highlight_pre = "<span class='my_class' >", highlight_post= "</span>")
    # th.highlight(text, keywords)
    #
    # for kw in keywords:
    #     print(kw)
    return output
Example #5
def searchKeywords(startString, keywordSearch, language, topNumber):

    ignored = set(["conseil communal", "conseil général"])
    kw_extractor = yake.KeywordExtractor(lan=language, top=topNumber)

    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "rapport", "data", "txt")
    files = os.listdir(data_path)

    kwResult = []
    for f in sorted(files):
        if f.startswith(startString):
            print("try open" + f)
            text = open(data_path + f, encoding="latin-1").read()
            try:
                keywords = kw_extractor.extract_keywords(text)
                kept = []
                for kw, score in keywords:
                    words = kw.split()
                    if len(words) > 0 and kw not in ignored:
                        kept.append(kw)
                for k in kept:
                    for w in keywordSearch:
                        print(k + " - " + w)
                        if w in k:
                            kwResult.append(f)
                            print("add "+ f)
            except Exception as ex:
                print("Impossible to extract keywords from file " + f + ":")
                print(ex)
            #print(f"{f} mentions these keywords: {', '.join(kept)}...")
    
    print(len(kwResult))
    return kwResult
Example #6
def filterKeywords(results, raw_query):
    """
    filter for keywords by checking if any of the query keywords are in the article titles
    Function is hardwired to the search query formatting currently used
    """
    # partition('(') keeps only the part of the query before the parenthesized filter
    actual_query = raw_query.partition('(')[0]
    words = actual_query.split()  # the query as a list of words

    if len(words) >= 4:
        # keyword analysis; note: running YAKE on the query is slow
        kw_extractor = yake.KeywordExtractor()
        keywords = kw_extractor.extract_keywords(actual_query)
        for kw in keywords:
            if len(kw[0].split()) == 1:  # keep only unigram keywords
                words.append(kw[0])
    # iterate over a copy: removing from a list while iterating over it skips elements
    for i in list(results):
        if not any(word.lower() in i.get('name').lower() for word in words):
            results.remove(i)
    return results
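A usage sketch under the structure the function assumes: results is a list of dicts with a 'name' key, and raw_query carries a parenthesized suffix that partition('(') strips. All values here are hypothetical:

articles = [{"name": "Deep Learning for Keyword Extraction"},
            {"name": "Gardening Tips for Spring"}]
kept = filterKeywords(articles, "deep learning keyword extraction (filter:title)")
# articles whose names contain none of the query words are dropped
print(kept)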
Example #7
def getanalysis(df):
    lis = []
    language = "en"
    max_ngram_size = 1
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 2

    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    for i in df["Description"]:
        if i == '':
            continue
        keywords = custom_kw_extractor.extract_keywords(i)
        temp = []
        for j in keywords:
            temp.append(j[0])
        lis.append(temp)

    return lis
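A usage sketch; the only assumption the loop makes about the input frame is a 'Description' column of strings:

import pandas as pd
df = pd.DataFrame({"Description": ["Keyword extraction with the YAKE library.",
                                   "",
                                   "Unsupervised learning on raw text."]})
print(getanalysis(df))  # one short list of unigram keywords per non-empty description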
Example #8
    def train(self, documents, **kwargs):
        """Unsupervised train the keyword extractor on a list of documents
        Arguments:
            documents {List[str]} -- list of raw text documents to learn keywords from
        """

        # each CSV row becomes a (keyword, int) tuple, matching the shape of yake's output
        with open('indexList.csv', newline='') as f:
            reader = csv.reader(f)
            index_keywords_ = list(reader)

        index_keywords = []
        for item in index_keywords_:
            index_keywords.append((item[0], int(item[1])))

        total_data = ' '.join(documents)
        language = kwargs.get('language', 'en')
        max_ngram_size = self.n_gram
        deduplication_threshold = 0.4
        deduplication_algo = 'seqm'
        windowSize = 2
        numOfKeywords = self.total_keywords_in_training

        custom_kw_extractor = yake.KeywordExtractor(
            lan=language,
            n=max_ngram_size,
            dedupLim=deduplication_threshold,
            dedupFunc=deduplication_algo,
            windowsSize=windowSize,
            top=numOfKeywords,
            features=None)

        self.the_total_keywords = index_keywords + custom_kw_extractor.extract_keywords(
            total_data)
Example #9
    def train(self, documents, **kwargs):
        """Unsupervised train the keyword extractor on a list of documents
        Arguments:
            documents {List[str]} -- list of raw text documents to learn keywords from
        """

        total_data = ' '.join(documents)
        language = kwargs.get('language', 'en')
        max_ngram_size = self.n_gram
        deduplication_threshold = 0.7
        deduplication_algo = 'seqm'
        windowSize = 1
        numOfKeywords = self.total_keywords_in_training

        custom_kw_extractor = yake.KeywordExtractor(
            lan=language,
            n=max_ngram_size,
            dedupLim=deduplication_threshold,
            dedupFunc=deduplication_algo,
            windowsSize=windowSize,
            top=numOfKeywords,
            features=None)

        self.the_total_keywords = custom_kw_extractor.extract_keywords(
            total_data)
Example #10
def extract_keywords():
    workshops_file = 'all-workshops-2021-02-04.csv'
    workshops_df = get_workshops_df(workshops_file)
    
    # Write the choose-workshop prompt as a subheader above a selectbox of workshop titles
    st.subheader("Choose a workshop title to see the top three Yake extracted keywords from the workshop's description")
    title_list = workshops_df["title"].to_list()
    selected_workshop = st.selectbox("Select workshop", title_list)

    # Write the title of the chosen workshop
    st.write(selected_workshop)

    # Keep every workshop whose title matches the selection (copy to avoid pandas' SettingWithCopyWarning)
    filtered_workshops = workshops_df[workshops_df["title"] == selected_workshop].copy()

    # Apply the custom yake helper to the body (description) of each matching workshop
    custom_kw_extractor = yake.KeywordExtractor(top=3)
    filtered_workshops["keywords_yake"] = filtered_workshops["body"].apply(get_top_three, args=(custom_kw_extractor,))
    #st.dataframe(filtered_workshops[["body","keywords_yake"]], width=600)

    # Format the returned workshop descriptions and their keywords into two columns
    # (st.beta_columns was Streamlit's pre-1.0 name; newer releases call this st.columns)
    col_description, col_keywords = st.beta_columns(2)
    with col_description:
        st.write(filtered_workshops["body"])
    with col_keywords:
        st.write(filtered_workshops["keywords_yake"])
        
Example #11
def find_keywords(text_in):
    # parameters
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'jaro'
    windowSize = 1
    numOfKeywords = 150

    # yake extractor initialze with parameters
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)

    # extract keywords based on the parameters
    keywords_output = custom_kw_extractor.extract_keywords(text_in)
    # create dataframe of keywords output
    keywords_df = pd.DataFrame(keywords_output, columns=["word", "score"])
    del keywords_df['score']
    # make a list of just the words (exclude the scores)
    keywords = keywords_df['word'].to_list()
    # since every bi/trigram is going to include a unigram already....
    # lets just create the basis of our word search based on unigrams
    unigram_df = keywords_df[keywords_df['word'].apply(
        lambda x: len(x.split()) == 1)].copy()  # copy to avoid SettingWithCopyWarning
    # create column showing bi and trigrams that include the unigrams (for context)
    unigram_df['associated_phrases'] = unigram_df['word'].apply(
        word_in_list, args=(keywords, ))

    return unigram_df
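word_in_list is not defined in this snippet; from its use it likely gathers the longer keyphrases that contain a given unigram. A sketch under that assumption:

def word_in_list(word, keywords):
    # hypothetical reconstruction: bi/trigrams that contain this unigram, for context
    return [kw for kw in keywords if word in kw.split() and kw != word]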
Example #12
def extract_keywords(data):
    special_inputs = [':)', '.']
    kw_extractor = yake.KeywordExtractor(n=n_gram)
    num_convs = len(list(data.keys()))
    print('Extracting keywords for all utterances of {} conversations ...'.
          format(num_convs))
    new_data = {}

    for c in tqdm(range(num_convs)):
        conv = list(data.values())[c]['content']
        new_conv = list(data.values())[c]
        num_utts = len(conv)
        for u in range(num_utts):
            utterance = conv[u]['message']
            if utterance in special_inputs:
                # replace trivial inputs with a placeholder token so YAKE gets usable text
                utterance = 'i'
            kws = kw_extractor.extract_keywords(utterance)
            #print(kws)
            kws_1 = get_kwd_1(kws)
            kws_2 = get_kwd_2(kws)
            kws_3 = get_kwd_3(kws)
            #print('kw1: {}'.format(kws_1))
            #print('kw2: {}'.format(kws_2))
            #print('kw3: {}'.format(kws_3))
            new_conv['content'][u]['keywords_1'] = kws_1
            new_conv['content'][u]['keywords_2'] = kws_2
            new_conv['content'][u]['keywords_3'] = kws_3

        new_data[list(data.keys())[c]] = new_conv
    print('Done!')
    return new_data
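get_kwd_1, get_kwd_2 and get_kwd_3 are defined elsewhere; their names suggest they split the extractor output by n-gram length. One plausible sketch:

def _kwds_of_len(kws, n):
    # hypothetical helper: keyword strings that are exactly n tokens long
    return [kw for kw, score in kws if len(kw.split()) == n]

def get_kwd_1(kws):
    return _kwds_of_len(kws, 1)

def get_kwd_2(kws):
    return _kwds_of_len(kws, 2)

def get_kwd_3(kws):
    return _kwds_of_len(kws, 3)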
Example #13
def get_keyword(docs):
    """
    Function to extract keywords using YAKE from the given list of strings.
    
    :param docs: Strings to extract keywords from
    
    <Returns a list of strings where each string contains keywords separated by ','>
    """
    # Parameters for the YAKE keyword extractor
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    numOfKeywords = 1000

    # Initialization
    list_of_keys = list()
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        top=numOfKeywords,
        features=None)

    # Iterate over each document and get keywords
    for each_article in docs:
        keywords = custom_kw_extractor.extract_keywords(each_article)
        article_keys = [kw for kw, score in keywords]
        list_of_keys.append(",".join(article_keys))
    return list_of_keys
Example #14
def add_metadata(text):
    print("| **** Extracting metadata from text: Done")
    language = "pt"
    max_ngram_size = 1
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 4

    #######################################################################
    # Extracting four keywords from the metadata content
    #######################################################################

    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    # print(len(keywords))
    # print(keywords)
    print("| **** Adding Metadata to instances: Done")
    return keywords
Example #15
def yake_keyword_extraction(
    text: str, parameters: Dict = parameters
) -> List[Tuple[str, float]]:
    custom_kw_extractor = yake.KeywordExtractor(**parameters)
    keywords = custom_kw_extractor.extract_keywords(text)

    return keywords
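The module-level parameters dict used as the default argument is not shown; any mapping of yake.KeywordExtractor keyword arguments fits, for example:

# hypothetical default; each key is a real yake.KeywordExtractor argument
parameters = {
    "lan": "en",
    "n": 3,
    "dedupLim": 0.9,
    "dedupFunc": "seqm",
    "windowsSize": 1,
    "top": 20,
}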
Example #16
def yake_keywords(
        texts: List[str],
        language: str = "English",
        max_len: int = 1,
        progress_callback: Callable = None) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using YAKE!.

    Parameters
    ----------
    texts : list
        List of documents.
    language : str
        Selected language.
    max_len : int
        Maximum number of tokens.
    progress_callback : callable
        Function for reporting progress.

    Returns
    -------
    keywords : list
    """
    if progress_callback is None:
        progress_callback = dummy_callback

    language = YAKE_LANGUAGE_MAPPING[language]
    extractor = yake.KeywordExtractor(lan=language, n=max_len)

    keywords = []
    n_docs = len(texts)
    for i, text in enumerate(texts):
        progress_callback(i / n_docs)
        keywords.append(extractor.extract_keywords(text))
    return keywords
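YAKE_LANGUAGE_MAPPING and dummy_callback come from the surrounding module; a minimal stand-in with the mapping reduced to an illustrative subset:

# illustrative subset; yake expects short language codes such as "en" or "pt"
YAKE_LANGUAGE_MAPPING = {"English": "en", "Portuguese": "pt", "Greek": "el"}

def dummy_callback(progress):
    # no-op progress hook used when the caller supplies none
    pass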
Example #17
    def __init__(self,
                 path: str,
                 language: str = "en",
                 max_ngram_size: int = 3,
                 numOfKeywords: int = 50,
                 deduplication_threshold: float = 0.9,
                 deduplication_algo: str = 'seqm',
                 windowSize=1,
                 k=3):

        self.language = language
        self.max_ngram_size = max_ngram_size
        self.numOfKeywords = numOfKeywords
        self.deduplication_threshold = deduplication_threshold
        self.deduplication_algo = deduplication_algo
        self.windowSize = windowSize

        self.custom_kw_extractor = yake.KeywordExtractor(
            lan=language,
            n=max_ngram_size,
            dedupLim=deduplication_threshold,
            dedupFunc=deduplication_algo,
            windowsSize=windowSize,
            top=numOfKeywords,
            features=None)

        self.nlp = spacy.load("en_core_web_lg")
        self.vec_len = len(self.nlp("cosine").vector)
        self.k = k
        with open(path, "r", encoding="UTF-8") as f:
            self.text = f.read()
Example #18
def yake_keyword(doc):
    """
    Extracts keywords from the given text using yake.

    Args:
        doc: Paragraph from which keywords need to be extracted.

    Returns:
        Returns Keywords extracted from the text document passed.
    """

    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 20

    extractor = yake.KeywordExtractor(lan=language,
                                      n=max_ngram_size,
                                      dedupLim=deduplication_threshold,
                                      dedupFunc=deduplication_algo,
                                      windowsSize=windowSize,
                                      top=numOfKeywords,
                                      features=None)
    keywords = extractor.extract_keywords(doc)
    keywords = [kw for kw, score in keywords]
    return keywords
Example #19
def run_yake(path, key_phrase_dict, no_of_words):
    print('running Yake')
    with open(path, 'r') as f:
        text = f.read()
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = no_of_words

    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)

    key_phrase_dict['YAKE'] = [word[0] for word in keywords]
    return key_phrase_dict


#print(run_yake('/home/admindell/Documents/topic-modelling/sampletext.txt',{}))
Example #20
def keywords1(texts_2):
	global text4, keywords_yake
	keywords_yake = []
	# Rake setup with a stopword directory (commented out below)
	text4 = str(texts_2)

	# # Using Rake
	# stop_dir = "SmartStoplist.txt"
	# rake_object = RAKE.Rake(stop_dir)
	# # Extract keywords
	# keywords = rake_object.run(text4)
	# print ("keywords: ", keywords[0:20])

	#Using Yake
	language = "en"
	max_ngram_size = 3
	deduplication_threshold = 0.9
	deduplication_algo = 'seqm'
	windowSize = 1
	numOfKeywords = 20

	custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
	keywords = custom_kw_extractor.extract_keywords(text4)

	for kw in keywords:
		print(kw)
		# print(kw[0])
		keywords_yake.append(kw[0])
	return keywords_yake
Example #21
def test_phraseless_example():
    text_content = "- not yet"

    pyake = yake.KeywordExtractor()

    result = pyake.extract_keywords(text_content)
    assert len(result) == 0
Example #22
def test_n3_PT():
    text_content = '''
    "Conta-me Histórias." Xutos inspiram projeto premiado. A plataforma "Conta-me Histórias" foi distinguida com o Prémio Arquivo.pt, atribuído a trabalhos inovadores de investigação ou aplicação de recursos preservados da Web, através dos serviços de pesquisa e acesso disponibilizados publicamente pelo Arquivo.pt . Nesta plataforma em desenvolvimento, o utilizador pode pesquisar sobre qualquer tema e ainda executar alguns exemplos predefinidos. Como forma de garantir a pluralidade e diversidade de fontes de informação, esta são utilizadas 24 fontes de notícias eletrónicas, incluindo a TSF. Uma versão experimental (beta) do "Conta-me Histórias" está disponível aqui.
    A plataforma foi desenvolvida por Ricardo Campos investigador do LIAAD do INESC TEC e docente do Instituto Politécnico de Tomar, Arian Pasquali e Vitor Mangaravite, também investigadores do LIAAD do INESC TEC, Alípio Jorge, coordenador do LIAAD do INESC TEC e docente na Faculdade de Ciências da Universidade do Porto, e Adam Jatwot docente da Universidade de Kyoto.
    '''

    pyake = yake.KeywordExtractor(lan="pt", n=3)
    result = pyake.extract_keywords(text_content)
    res = [('Conta-me Histórias', 0.006225012963810038),
           ('LIAAD do INESC', 0.01899063587015275),
           ('INESC TEC', 0.01995432290332246),
           ('Conta-me', 0.04513273690417472),
           ('Histórias', 0.04513273690417472),
           ('Prémio Arquivo.pt', 0.05749361520927859),
           ('LIAAD', 0.07738867367929901), ('INESC', 0.07738867367929901),
           ('TEC', 0.08109398065524037),
           ('Xutos inspiram projeto', 0.08720742489353424),
           ('inspiram projeto premiado', 0.08720742489353424),
           ('Adam Jatwot docente', 0.09407053486771558),
           ('Arquivo.pt', 0.10261392141666957),
           ('Alípio Jorge', 0.12190479662535166),
           ('Ciências da Universidade', 0.12368384021490342),
           ('Ricardo Campos investigador', 0.12789997272332762),
           ('Politécnico de Tomar', 0.13323587141127738),
           ('Arian Pasquali', 0.13323587141127738),
           ('Vitor Mangaravite', 0.13323587141127738),
           ('preservados da Web', 0.13596322680882506)]
    assert result == res

    keywords = [kw[0] for kw in result]
    th = TextHighlighter(max_ngram_size=3)
    textHighlighted = th.highlight(text_content, keywords)
    print(textHighlighted)

    assert textHighlighted == '"<kw>Conta-me Histórias</kw>." <kw>Xutos inspiram projeto</kw> premiado. A plataforma "<kw>Conta-me Histórias</kw>" foi distinguida com o <kw>Prémio Arquivo.pt</kw>, atribuído a trabalhos inovadores de investigação ou aplicação de recursos <kw>preservados da Web</kw>, através dos serviços de pesquisa e acesso disponibilizados publicamente pelo <kw>Arquivo.pt</kw> . Nesta plataforma em desenvolvimento, o utilizador pode pesquisar sobre qualquer tema e ainda executar alguns exemplos predefinidos. Como forma de garantir a pluralidade e diversidade de fontes de informação, esta são utilizadas 24 fontes de notícias eletrónicas, incluindo a TSF. Uma versão experimental (beta) do "<kw>Conta-me Histórias</kw>" está disponível aqui.     A plataforma foi desenvolvida por <kw>Ricardo Campos investigador</kw> do <kw>LIAAD do INESC</kw> <kw>TEC</kw> e docente do Instituto <kw>Politécnico de Tomar</kw>, <kw>Arian Pasquali</kw> e <kw>Vitor Mangaravite</kw>, também investigadores do <kw>LIAAD do INESC</kw> <kw>TEC</kw>, <kw>Alípio Jorge</kw>, coordenador do <kw>LIAAD do INESC</kw> <kw>TEC</kw> e docente na Faculdade de <kw>Ciências da Universidade</kw> do Porto, e <kw>Adam Jatwot docente</kw> da Universidade de Kyoto.'
Example #23
def test_n3_EN():
    text_content = '''
    Google is acquiring data science community Kaggle. Sources tell us that Google is acquiring Kaggle, a platform that hosts data science and machine learning   competitions. Details about the transaction remain somewhat vague , but given that Google is hosting   its Cloud Next conference in San Francisco this week, the official announcement could come as early   as tomorrow.  Reached by phone, Kaggle co-founder CEO Anthony Goldbloom declined to deny that the   acquisition is happening. Google itself declined 'to comment on rumors'.   Kaggle, which has about half a million data scientists on its platform, was founded by Goldbloom   and Ben Hamner in 2010. The service got an early start and even though it has a few competitors   like DrivenData, TopCoder and HackerRank, it has managed to stay well ahead of them by focusing on its   specific niche. The service is basically the de facto home for running data science  and machine learning   competitions.  With Kaggle, Google is buying one of the largest and most active communities for   data scientists - and with that, it will get increased mindshare in this community, too   (though it already has plenty of that thanks to Tensorflow and other projects).   Kaggle has a bit of a history with Google, too, but that's pretty recent. Earlier this month,   Google and Kaggle teamed up to host a $100,000 machine learning competition around classifying   YouTube videos. That competition had some deep integrations with the Google Cloud Platform, too.   Our understanding is that Google will keep the service running - likely under its current name.   While the acquisition is probably more about Kaggle's community than technology, Kaggle did build   some interesting tools for hosting its competition and 'kernels', too. On Kaggle, kernels are   basically the source code for analyzing data sets and developers can share this code on the   platform (the company previously called them 'scripts').  Like similar competition-centric sites,   Kaggle also runs a job board, too. It's unclear what Google will do with that part of the service.   According to Crunchbase, Kaggle raised $12.5 million (though PitchBook says it's $12.75) since its   launch in 2010. Investors in Kaggle include Index Ventures, SV Angel, Max Levchin, Naval Ravikant,   Google chief economist Hal Varian, Khosla Ventures and Yuri Milner'''

    pyake = yake.KeywordExtractor(lan="en", n=3)

    result = pyake.extract_keywords(text_content)
    print(result)
    res = [('Google', 0.02509259635302287), ('Kaggle', 0.027297150442917317),
           ('CEO Anthony Goldbloom', 0.04834891465259988),
           ('data science', 0.05499112888517541),
           ('acquiring data science', 0.06029572445726576),
           ('Google Cloud Platform', 0.07461585862381104),
           ('data', 0.07999958986489127),
           ('San Francisco', 0.0913829662674319),
           ('Anthony Goldbloom declined', 0.09740885820462175),
           ('science', 0.09834167930168546),
           ('science community Kaggle', 0.1014394718805728),
           ('machine learning', 0.10754988562466912),
           ('Google Cloud', 0.1136787749431024),
           ('Google is acquiring', 0.114683257931042),
           ('acquiring Kaggle', 0.12012386507741751),
           ('Anthony Goldbloom', 0.1213027418574554),
           ('platform', 0.12404419723925647),
           ('co-founder CEO Anthony', 0.12411964553586782),
           ('CEO Anthony', 0.12462950727635251),
           ('service', 0.1316357590449064)]
    assert result == res

    keywords = [kw[0] for kw in result]
    th = TextHighlighter(max_ngram_size=3)
    textHighlighted = th.highlight(text_content, keywords)
    print(textHighlighted)
    assert textHighlighted == "<kw>Google</kw> is acquiring <kw>data science</kw> community <kw>Kaggle</kw>. Sources tell us that <kw>Google</kw> is acquiring <kw>Kaggle</kw>, a <kw>platform</kw> that hosts <kw>data science</kw> and <kw>machine learning</kw>   competitions. Details about the transaction remain somewhat vague , but given that <kw>Google</kw> is hosting   its Cloud Next conference in <kw>San Francisco</kw> this week, the official announcement could come as early   as tomorrow.  Reached by phone, <kw>Kaggle</kw> co-founder <kw>CEO Anthony Goldbloom</kw> declined to deny that the   acquisition is happening. <kw>Google</kw> itself declined 'to comment on rumors'.   <kw>Kaggle</kw>, which has about half a million <kw>data</kw> scientists on its <kw>platform</kw>, was founded by Goldbloom   and Ben Hamner in 2010. The <kw>service</kw> got an early start and even though it has a few competitors   like DrivenData, TopCoder and HackerRank, it has managed to stay well ahead of them by focusing on its   specific niche. The <kw>service</kw> is basically the de facto home for running <kw>data science</kw>  and <kw>machine learning</kw>   competitions.  With <kw>Kaggle</kw>, <kw>Google</kw> is buying one of the largest and most active communities for   <kw>data</kw> scientists - and with that, it will get increased mindshare in this community, too   (though it already has plenty of that thanks to Tensorflow and other projects).   <kw>Kaggle</kw> has a bit of a history with <kw>Google</kw>, too, but that's pretty recent. Earlier this month,   <kw>Google</kw> and <kw>Kaggle</kw> teamed up to host a $100,000 <kw>machine learning</kw> competition around classifying   YouTube videos. That competition had some deep integrations with the <kw>Google</kw> Cloud <kw>Platform</kw>, too.   Our understanding is that <kw>Google</kw> will keep the <kw>service</kw> running - likely under its current name.   While the acquisition is probably more about Kaggle's community than technology, <kw>Kaggle</kw> did build   some interesting tools for hosting its competition and 'kernels', too. On <kw>Kaggle</kw>, kernels are   basically the source code for analyzing <kw>data</kw> sets and developers can share this code on the   <kw>platform</kw> (the company previously called them 'scripts').  Like similar competition-centric sites,   <kw>Kaggle</kw> also runs a job board, too. It's unclear what <kw>Google</kw> will do with that part of the <kw>service</kw>.   According to Crunchbase, <kw>Kaggle</kw> raised $12.5 million (though PitchBook says it's $12.75) since its   launch in 2010. Investors in <kw>Kaggle</kw> include Index Ventures, SV Angel, Max Levchin, Naval Ravikant,   <kw>Google</kw> chief economist Hal Varian, Khosla Ventures and Yuri Milner"
Example #24
def get_keywords(text):  # TODO: experiment to find best parameters
    """Extracts keywords from given text

    Extracts keywords from given text using keyword extraction algorithm YAKE. Currently uses basic parameters
    for algorithm, which can be optimized. For explanation of parameters, see YAKE documentation.

    :type text: str
    :param text: text to be processed, most likely created by get_text_for_character() or get_text_for_chapter()
    :return: extracted keywords as a list of (keyword, score) tuples, e.g. [('keyword', 0.042), ...]
    :rtype: list
    """
    max_ngram_size = 1
    deduplication_threshold = 0.9
    deduplication_algo = "eqm"
    window_size = 1
    num_of_keywords = 20
    kw_extractor = yake.KeywordExtractor(
        lan="en",
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=window_size,
        top=num_of_keywords,
        features=None,
    )  # FIXME stopwords don't seem to be working
    keywords = kw_extractor.extract_keywords(text)
    return keywords
Example #25
def test_null_and_blank_example():
    pyake = yake.KeywordExtractor()
    
    result = pyake.extract_keywords("")
    assert len(result) == 0

    result = pyake.extract_keywords(None)
    assert len(result) == 0
Example #26
def test_extraction_with_YAKE():
    yake_extractor = yake.KeywordExtractor(lan="el", top=5)
    while True:
        input_doc = input()
        if input_doc == "end":
            break
        output = extract_keywords_YAKE(yake_extractor, input_doc)
        print(output)
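extract_keywords_YAKE is defined elsewhere; given how it is called here, a minimal sketch:

def extract_keywords_YAKE(extractor, doc):
    # hypothetical wrapper: run the prebuilt extractor on a single document
    return extractor.extract_keywords(doc)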
Example #27
def keywordService(text):
    kw_extractor = yake.KeywordExtractor()
    keywords = kw_extractor.extract_keywords(text)
    keyword_priority_list = []
    for kw in keywords:
        keyword_priority_list.append(kw)

    return keyword_priority_list
Example #28
def compute_keywords(df):

    import yake

    n_keywords = 2
    custom_kw_extractor = yake.KeywordExtractor(lan="en",
                                                n=2,
                                                dedupLim=0.9,
                                                top=n_keywords,
                                                features=None)

    keywords = {}
    for i, dx in df.groupby(df["cluster"]):
        text = "\n".join(dx["text"].values)
        kw = [x[0] for x in custom_kw_extractor.extract_keywords(text)]
        keywords[i] = "; ".join(kw)
    return keywords
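A usage sketch; the frame needs 'cluster' and 'text' columns, and all values below are hypothetical:

import pandas as pd
df = pd.DataFrame({
    "cluster": [0, 0, 1],
    "text": ["yake extracts keywords from raw text",
             "keyword extraction is unsupervised",
             "pandas groups rows by cluster label"],
})
print(compute_keywords(df))  # e.g. {0: "kw1; kw2", 1: "kw1; kw2"}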
Example #29
def extract_tags(text):
    simple_kwextractor = yake.KeywordExtractor()
    post_keywords = simple_kwextractor.extract_keywords(text)
    # yake output is already deduplicated and ranked, so no set() pass is needed here
    sentence_output = ""
    for word, number in post_keywords[:2]:
        sentence_output += word + " "

    return sentence_output
Example #30
    def __init__(self, max_ngram_size=3, window_size=1, **kwargs):
        super().__init__(**kwargs)
        self.name = kwargs.get('name', 'Yake')
        self.max_ngram_size = max_ngram_size
        self.window_size = window_size
        self.kw_extractor = yake.KeywordExtractor(
            n=self.max_ngram_size,
            windowsSize=self.window_size,
        )