def word_detector(blob_text):
    """Extract keywords from raw text and return only the keyword strings.

    Args:
        blob_text: Text to analyse; it is passed to ``find_keywords`` with
            ``input_type="text"``.

    Returns:
        list: The first element of every pair produced by ``find_keywords``,
        in the order the detector yielded them.
    """
    # ``hyperparameters`` is not defined in this function; it has to exist at
    # module level before the first call.
    keyword_detector = RakunDetector(hyperparameters)
    # The flag is set *before* detection: the detector is local and discarded
    # on return, so assigning it after ``find_keywords`` could never matter.
    # NOTE(review): presumably this silences the detector's progress output
    # during detection - confirm against the mrakun version in use.
    keyword_detector.verbose = False
    keywords = keyword_detector.find_keywords(blob_text, input_type="text")
    return [keyword for keyword, _ in keywords]
Пример #2
0
def test_basic_keywords1():
    """Smoke test: run keyword detection on a single document file.

    Nothing is asserted; the test passes as long as building the detector
    and calling ``find_keywords`` do not raise.
    """
    document_path = "./datasets/wiki20/docsutf8/7183.txt"
    settings = {
        "distance_threshold": 3,
        "num_keywords": 10,
        "pair_diff_length": 2,
        "distance_method": "editdistance",
        "stopwords": stopwords.words('english'),
        "bigram_count_threshold": 2,
        "num_tokens": [1],
    }
    # The returned keywords are not inspected.
    RakunDetector(settings).find_keywords(document_path)
Пример #3
0
def test_basic_visualization():
    """Smoke test: detect keywords in one document, then draw the network.

    Nothing is asserted; the test passes as long as detection and
    ``visualize_network`` do not raise.
    """
    document_path = "./datasets/wiki20/docsutf8/7183.txt"
    settings = {"edit_distance_threshold": 3,
                "num_keywords": 10,
                "pair_diff_length": 2,
                "stopwords": stopwords.words('english'),
                "bigram_count_threshold": 2,
                "lemmatizer": WordNetLemmatizer(),
                "num_tokens": [1]}

    detector = RakunDetector(settings)
    # Detection runs first; its return value is not used here.
    detector.find_keywords(document_path)
    detector.visualize_network()
Пример #4
0
def test_basic_keywords2(infile):
    """Run detection over several files, each with randomly drawn settings.

    ``infile`` is accepted but never read. The files are taken from at most
    the first ten entries of ``all_relevant``, a name that is not defined
    inside this function. Nothing is asserted about the detected keywords.
    """
    for document_path in all_relevant[:10]:
        # A fresh random configuration is drawn for every document.
        settings = {
            "distance_threshold": randrange(5),
            "num_keywords": randrange(10),
            "pair_diff_length": randrange(5),
            "distance_method": "editdistance",
            "stopwords": stopwords.words('english'),
            "bigram_count_threshold": randrange(5),
            "max_occurrence": randrange(10),
            "max_similar": randrange(4),
            "num_tokens": [1, 2, 3],
        }
        RakunDetector(settings).find_keywords(document_path)
Пример #5
0
## detect keywords from text directly!

from mrakun import RakunDetector
from nltk.corpus import stopwords

blob_of_text = "Brexit (/ˈbrɛksɪt, ˈbrɛɡzɪt/;[1] a portmanteau of \"British\" and \"exit\") is the scheduled withdrawal of the United Kingdom (UK) from the European Union (EU). Following a June 2016 referendum, in which 51.9% voted to leave, the UK government formally announced the country's withdrawal in March 2017, starting a two-year process that was due to conclude with the UK withdrawing on 29 March 2019. As the UK parliament thrice voted against the negotiated withdrawal agreement, that deadline has been extended twice, and is currently 31 October 2019.[2][3] An Act of Parliament requires the government to seek a third extension if no agreement is reached before 19 October. Withdrawal is advocated by Eurosceptics and opposed by pro-Europeanists, both of whom span the political spectrum. The UK joined the European Communities (EC) in 1973, with continued membership endorsed in a 1975 referendum. In the 1970s and 1980s, withdrawal from the EC was advocated mainly by the political left, e.g. in the Labour Party's 1983 election manifesto. From the 1990s, the eurosceptic wing of the Conservative Party grew, and led a rebellion over ratification of the 1992 Maastricht Treaty that established the EU. In parallel with the UK Independence Party (UKIP), and the cross-party People's Pledge campaign, it pressured Conservative Prime Minister David Cameron to hold a referendum on continued EU membership. Cameron, who had campaigned to remain, resigned after the result and was succeeded by Theresa May. On 29 March 2017, the UK government invoked Article 50 of the Treaty on European Union, formally starting the withdrawal. May called a snap general election in June 2017, which resulted in a Conservative minority government supported by the Democratic Unionist Party. UK–EU withdrawal negotiations began later that month. The UK negotiated to leave the EU customs union and single market. 
This resulted in the November 2018 withdrawal agreement, but the UK parliament voted against ratifying it three times. The Labour Party wanted any agreement to maintain a customs union, while many Conservatives opposed the agreement's financial settlement on the UK's share of EU financial obligations, as well as the Irish backstop designed to prevent border controls in Ireland. The Liberal Democrats, Scottish National Party and others seek to reverse Brexit through a second referendum. The EU has declined a re-negotiation that omits the backstop. In March 2019, the UK parliament voted for May to ask the EU to delay Brexit until October. Having failed to pass her agreement, May resigned as Prime Minister in July and was succeeded by Boris Johnson. He sought to replace parts of the agreement and vowed to leave the EU by the new deadline, with or without an agreement."

# Detector settings for this example.
hyperparameters = {"distance_threshold": 4,
                   "distance_method": "editdistance",
                   "num_keywords": 10,
                   "pair_diff_length": 3,
                   "stopwords": stopwords.words('english'),
                   "bigram_count_threshold": 2,
                   "num_tokens": [1, 2]}

# Run detection on the in-memory text (input_type="text") and show the result.
keyword_detector = RakunDetector(hyperparameters)
keywords = keyword_detector.find_keywords(blob_of_text, input_type="text")
print(keywords)

# First pass: visualize with display=True.
keyword_detector.visualize_network(display=True)

# Second pass: open a figsize=(3, 3), dpi=350 figure, visualize with
# display=False and write the current figure to HighResFigure.png.
# NOTE(review): presumably visualize_network draws onto the current
# matplotlib figure - confirm against mrakun.
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

figure(
    num=None,
    figsize=(3, 3),
    dpi=350,
    facecolor='w',
    edgecolor='k',
)
keyword_detector.visualize_network(display=False)
plt.savefig("HighResFigure.png")
Пример #6
0
from mrakun import RakunDetector
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from os import path

# Single definition of the model location, so the existence check and the
# detector configuration below cannot drift apart.
pretrained_embedding_path = "../pretrained_models/fasttext/wiki.en.bin"

if not path.exists(pretrained_embedding_path):
    print(
        "Please, load a fasttext english pretrained model binary into folder ../pretrained_models."
    )
    # Stop here instead of continuing: the configuration below points the
    # detector at exactly this file, which has just been found missing.
    raise SystemExit(1)

hyperparameters = {
    "distance_threshold": 0.2,
    "distance_method": "fasttext",
    "pretrained_embedding_path": pretrained_embedding_path,
    "num_keywords": 10,
    "pair_diff_length": 2,
    "stopwords": stopwords.words('english'),
    "bigram_count_threshold": 2,
    "lemmatizer": WordNetLemmatizer(),
    "num_tokens": [1]
}

# Detect keywords in the example document, print them and draw the network.
keyword_detector = RakunDetector(hyperparameters)
example_data = "../datasets/wiki20/docsutf8/7183.txt"
keywords = keyword_detector.find_keywords(example_data)
print(keywords)
keyword_detector.visualize_network()
Пример #7
0
#blob_of_text = "Slovenska nogometna reprezentanca se je s porazom 2:3 na gostovanju na Poljskem poslovila od kvalifikacij za evropsko prvenstvo 2020. Poljska je na razprodanem stadionu v Varšavi proslavila prestižno zmago v skupini G in četrto zaporedno uvrstitev na največje tekmovanje stare celine. Slovenija znova ni izpolnila cilja – kvalifikacije je končala na skromnem četrtem mestu, daleč od uvrstitve na veliko tekmovanje.Slovenija v Varšavi ni zgolj sklenila neuspešnih kvalifikacij, ampak je hkrati odprla priprave za ligo narodov 2020. V ta namen je selektor Matjaž Kek iskal odgovore o selekciji igralcev že danes na Poljskem, ko je na igrišče poslal premešano začetno postavo in osvežil različne položaje. V obrambi je kaznovanega štoperja Aljaža Struno zamenjal Miha Blažič, na levem boku pa je namesto Bojana Jokića zaigral Jure Balkovec. Napadalec Tim Matavž je iz udarne enajsterice izrinil Andraža Šporarja, priložnost od prve minute je na sredini igrišča dobil Jaka Bijol. Prenovljena izbrana vrsta se je tako kot v celotnih kvalifikacijah tudi v Varšavi predstavila z veliko nihanji.Začelo se je z neodgovornim prekrškom Kurtića za rumeni karton že po minuti igre in hitrim vodstvom Poljske po podaji iz kota. Nič ni pomagalo, da je slovenska vrsta protestirala, da je bil v vratarjevem prostoru storjen prekršek nad Janom Oblakom, ki je začutil bolečine v ramenu. Slovenija je po zaostanku poskušala igrati in izenačila po domiselni akciji Kurtića in Iličića, ki sta izigrala Poljake, da je Matavž hladnokrvno zadel za izenačenje. Nasprotnika sta se do konca polčasa izmenjavala v premoči, na stadionu pa je najbolj završalo pred odmorom, ko je na vnaprej dogovorjeno menjavo odšel poljski branilec Piszczek in se čustveno poslovil od nastopanja za reprezentanco.Tekma v Varšavi je minila brez posebnega tekmovalnega naboja na najvišji ravni. 
Poteza večera je uspela prvemu zvezdniku Poljske Robertu Lewandowskemu, ki je na začetku drugega polčasa zlahka preigral vso slovensko obrambo in zadel za drugo vodstvo domačih. Slovenija je popustila v disciplini, a se je po hitri akciji natančnih podaj in zadetku Iličića vrnila v igro. V odprtem boju za zmago so bili drznejši Poljaki, ki so v zmagoviti akciji izigrali gostujočo obrambo in se oddolžili navijačem za edini poraz kvalifikacij v Stožicah. Slovenija je tekmo končala z deseterico, saj je drugi rumeni karton prejel Kurtić, Keku pa se je ponesrečila menjava Bijola z Zajcem.»V Varšavi sem videl veliko dobrih stvari, a tudi veliko neodločnosti in neodgovornosti. Prelahko smo dopustili, da so Poljaki zabili tri gole. Posamezniki bodo morali razčistiti sami pri sebi. Preslabi smo bili v hitrih reakcijah. Škoda zapravljene točke,« je za nacionalno televizijo povedal selektor Matjaž Kek."

blob_of_text = "Prehrambni trendi se v zadnjih letih obračajo h koreninam. Vedno večja skupina ozaveščenih potrošnikov goji pričakovanja po naravnih, lokalno pridelanih in preprostih izdelkih, ki ne vsebujejo dodanih sladkorjev, barvil, arom ali drugih aditivov. Potrošniki se kot odgovor na svoj življenjski slog na trgu ozirajo po avtentičnih in naravnih živilih, strokovnjaki pa to obliko nakupnega vedenja opredeljujejo kot enega najpomembnejših prehranskih trendov v letu 2019. V tujini trend opisujejo z izrazom clean label oziroma čisto označevanje: »Jasno je, da gre za potrošniško zahtevo po čistih izdelkih, ki se nanaša na preglednost označb v smislu razumljivosti. Čedalje več potrošnikov si namreč želi uživati izdelke iz naravnih in enostavnih sestavin, ki jih razumejo in jim zaupajo. Clean label pomeni kratek seznam sestavin in njihovo razumljivost. Pomeni odsotnost umetnih dodatkov,« pojasnjujejo v Ljubljanskih mlekarnah.Razvoj izdelkov mora slediti zahtevam potrošnikov V Ljubljanskih mlekarnah so pred tremi leti pričeli s projektom clean label, v okviru katerega so med drugim prenovili vse sadne jogurte blagovne znamke Mu. Pred dobrim letom pa so storili še korak naprej in predstavili novo linijo izdelkov, jogurte Mu Natur. Njihova edinstvena prednost je, da so sestavljeni zgolj iz dveh sestavin – jogurta in sadja in so trenutno edini jogurti s sadjem brez dodanega sladkorja na slovenskem tržišču. Vendar pa je razvoj izdelkov po načelu manj je več zelo zahteven. Kot pojasnjujejo v Ljubljanskih mlekarnah, je bilo ključno vprašanje pri razvoju prvega tovrstnega izdelka, kako združiti samo dve sestavini, naravni jogurt in sadje brez vseh ostalih dodatkov, hkrati pa potrošnikom predstaviti kakovosten in okusen izdelek s primerno dolgim rokom uporabe. Da jim je to uspelo, je na koncu potrdila tudi Biotehniška fakulteta Univerze v Ljubljani s certifikatom 100 % naravno. 
Ugledni certifikat predstavlja jamstvo potrošnikom, da bodo zaužili pristne in kakovostne izdelke, ki vsebujejo le naravno prisotna sladkorja fruktozo in galaktozo, sestavini v sadju in mleku. Biotehniška fakulteta pa je vključena tudi pri sami proizvodnji jogurtov Mu Natur, in sicer v obliki rednega preverjanja ustreznosti izdelkov. Slovenke in Slovenci izbrali nov okus jogurtov Mu Natur breskev/mango Pozitiven odziv potrošnikov na jogurte Mu Natur je na Ljubljanskih mlekarnah botroval odločitvi, da na trgu predstavijo nov okus, ki se bo pridružil jagodi in borovnici. Pri razvoju okusa so želeli narediti korak naprej, zato so vanj vključili tudi potrošnike in jih povabili, da pomagajo pri sprejemu ključne odločitve v fazi razvoja izdelka – kakšen naj bo nov okus jogurta Mu Natur? Da bi preverili mnenje potrošnikov, so v Ljubljanskih mlekarnah konec avgusta v različnih slovenskih krajih organizirali degustacije, na katerih so imeli potrošniki možnost poskusiti tri okuse jogurta Mu Natur in glasovati za najljubšega. V sproščenem vzdušju so mladi in mladi po srcu okušali vzorce, med seboj delili mnenja in se zabavali ob stand-up vložkih komika Jana Kreuzerja z ekipo. »Pripravljenost ljudi, da nam pomagajo sprejeti odločitev glede novega okusa jogurta Mu Natur, je bila nad vsemi pričakovanji. To priložnost bi izkoristili, da se zahvalimo vsem sodelujočim in jim sporočimo, da smo njihovo mnenje seveda spoštovali. Največ glasov je prejel okus Mu Natur breskev/mango, ki je v teh dneh že na voljo na prodajnih mestih po Sloveniji,« so še sporočili iz Ljubljanskih mlekarn."

# Detector settings for this example; no "stopwords" entry is supplied here.
hyperparameters = {"distance_threshold": 5,
                   "distance_method": "editdistance",
                   "num_keywords": 20,
                   "pair_diff_length": 2,
                   "bigram_count_threshold": 1,
                   "max_occurrence": 1,
                   "max_similar": 1,
                   "num_tokens": [1, 2, 3]}

# Run detection on the in-memory text and print the result.
keyword_detector = RakunDetector(hyperparameters)
keywords = keyword_detector.find_keywords(blob_of_text, input_type="text")
print(keywords)

# Replace the detector's visualization parameters. The visualization call
# itself is left disabled below.
keyword_detector.default_visualization_parameters = {"top_n": 15,
                                                     "max_node_size": 12,
                                                     "min_node_size": 2,
                                                     "label_font_size": 10,
                                                     "text_color": "red",
                                                     "num_layout_iterations": 100,
                                                     "edge_width": 0.08,
                                                     "alpha_channel": 0.5}
# keyword_detector.visualize_network()
Пример #8
0
## multi term keyword example

from mrakun import RakunDetector
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Lemmatizer instance handed to the detector via the "lemmatizer" hyperparameter.
lemmatizer = WordNetLemmatizer()

blob_of_text = "Brexit (/ˈbrɛksɪt, ˈbrɛɡzɪt/;[1] a portmanteau of \"British\" and \"exit\") is the scheduled withdrawal of the United Kingdom (UK) from the European Union (EU). Following a June 2016 referendum, in which 51.9% voted to leave, the UK government formally announced the country's withdrawal in March 2017, starting a two-year process that was due to conclude with the UK withdrawing on 29 March 2019. As the UK parliament thrice voted against the negotiated withdrawal agreement, that deadline has been extended twice, and is currently 31 October 2019.[2][3] An Act of Parliament requires the government to seek a third extension if no agreement is reached before 19 October. Withdrawal is advocated by Eurosceptics and opposed by pro-Europeanists, both of whom span the political spectrum. The UK joined the European Communities (EC) in 1973, with continued membership endorsed in a 1975 referendum. In the 1970s and 1980s, withdrawal from the EC was advocated mainly by the political left, e.g. in the Labour Party's 1983 election manifesto. From the 1990s, the eurosceptic wing of the Conservative Party grew, and led a rebellion over ratification of the 1992 Maastricht Treaty that established the EU. In parallel with the UK Independence Party (UKIP), and the cross-party People's Pledge campaign, it pressured Conservative Prime Minister David Cameron to hold a referendum on continued EU membership. Cameron, who had campaigned to remain, resigned after the result and was succeeded by Theresa May. On 29 March 2017, the UK government invoked Article 50 of the Treaty on European Union, formally starting the withdrawal. May called a snap general election in June 2017, which resulted in a Conservative minority government supported by the Democratic Unionist Party. UK–EU withdrawal negotiations began later that month. The UK negotiated to leave the EU customs union and single market. 
This resulted in the November 2018 withdrawal agreement, but the UK parliament voted against ratifying it three times. The Labour Party wanted any agreement to maintain a customs union, while many Conservatives opposed the agreement's financial settlement on the UK's share of EU financial obligations, as well as the Irish backstop designed to prevent border controls in Ireland. The Liberal Democrats, Scottish National Party and others seek to reverse Brexit through a second referendum. The EU has declined a re-negotiation that omits the backstop. In March 2019, the UK parliament voted for May to ask the EU to delay Brexit until October. Having failed to pass her agreement, May resigned as Prime Minister in July and was succeeded by Boris Johnson. He sought to replace parts of the agreement and vowed to leave the EU by the new deadline, with or without an agreement."

# Detector settings for the multi-term example: "num_tokens" is [2] and the
# module-level lemmatizer is passed in.
hyperparameters = {"distance_threshold": 4,
                   "distance_method": "editdistance",
                   "num_keywords": 10,
                   "pair_diff_length": 3,
                   "stopwords": stopwords.words('english'),
                   "bigram_count_threshold": 0,
                   "lemmatizer": lemmatizer,
                   "num_tokens": [2]}

# Run detection on the in-memory text (input_type="text") and print the result.
keyword_detector = RakunDetector(hyperparameters)
keywords = keyword_detector.find_keywords(blob_of_text, input_type="text")
print(keywords)
Пример #9
0
from mrakun import RakunDetector
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Detector settings for this example.
hyperparameters = {
    "edit_distance_threshold": 3,
    "num_keywords": 10,
    "pair_diff_length": 2,
    "stopwords": stopwords.words('english'),
    "bigram_count_threshold": 2,
    "lemmatizer": WordNetLemmatizer(),
    "num_tokens": [1],
}

# Detect keywords in a single document, print them and draw the network.
example_data = "../datasets/wiki20/docsutf8/7183.txt"
keyword_detector = RakunDetector(hyperparameters)
keywords = keyword_detector.find_keywords(example_data)
print(keywords)
keyword_detector.visualize_network()

# verbose is switched off only at this point, i.e. after the single-document
# run and before the corpus validation.
keyword_detector.verbose = False
keyword_detector.validate_on_corpus("../datasets/Schutz2008")