def get_spam_keywords(spam_features, ham_features):
    # Populate the spam and ham text blobs.
    text_spam = ''
    text_ham = ''
    for pr in spam_features:
        text_spam += get_keywords(pr)
    for pr in ham_features:
        text_ham += get_keywords(pr)
    # Keep only alphanumerics, whitespace and periods.
    text_spam = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', text_spam)
    text_ham = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', text_ham)

    # Initialise RAKE for popular words.
    rake = Rake(max_words=2, min_freq=5)

    # Extract popular keywords for spam and ham.
    keywords_spam = rake.apply(text_spam.lower())
    keywords_ham = rake.apply(text_ham.lower())

    spam = [spam_keyword[0] for spam_keyword in keywords_spam[:50]]
    ham = [ham_keyword[0] for ham_keyword in keywords_ham[:50]]

    # Keep keywords present in spam which are not present in ham.
    spam_final = [word for word in spam if word not in ham]
    return spam_final
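# Hedged usage sketch (added, not from the original source): get_spam_keywords
# relies on a module-level get_keywords(record) that returns a text blob per
# record, so the loader below is a hypothetical stand-in, not a real helper.
#
# spam_records, ham_records = load_labelled_records()  # hypothetical loader
# print(get_spam_keywords(spam_records, ham_records))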
class TextPreprocessor(object):

    def __init__(self, lang=None):
        self.lang = lang
        self.rake = Rake(language_code=self.lang, max_words=5)

    def key_words(self, text):
        return self.rake.apply(text)
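# Minimal usage sketch (added, not from the original source); the 'en'
# language code and the sample sentence are illustrative assumptions.
preprocessor = TextPreprocessor(lang='en')
print(preprocessor.key_words('Compatibility of systems of linear constraints '
                             'over the set of natural numbers.'))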
def get(self, language_code=None):
    rake = Rake(language_code=language_code)
    text = request.form.get('text')
    if text:
        return rake.apply(text)
    return 'No text given', 400
def post():
    posted_data = request.get_json()
    text = posted_data['text']
    rake = Rake()
    keywords = rake.apply(text)
    # Keep only the keyword strings, dropping the scores.
    keyword_strings = [i[0] for i in keywords]
    return jsonify({'Keywords': keyword_strings})
def get_RAKE(article):
    rake = Rake()
    keywords = rake.apply(article)
    # Take up to the top 10 keyword strings; slicing avoids an IndexError
    # when fewer than 10 keywords are found.
    topKeywords = [keyword[0] for keyword in keywords[:10]]
    return topKeywords
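# Usage sketch (added, not from the original source): rake.apply returns
# (keyword, score) pairs sorted by score, so get_RAKE yields the best keyword
# strings. The sample article text is an illustrative assumption.
sample_article = ('Criteria of compatibility of a system of linear '
                  'Diophantine equations, strict inequations, and nonstrict '
                  'inequations are considered.')
print(get_RAKE(sample_article))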
def getKeyWords(text):
    rake = Rake()
    keywords = rake.apply(text)
    # Keep only keywords whose score exceeds the RANK threshold.
    sortedKw = [keyword[0] for keyword in keywords if keyword[1] > RANK]
    return sortedKw
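# Usage sketch (added, not from the original source): RANK is a module-level
# score threshold that the snippet above leaves undefined; 1.0 is an assumed
# value for illustration only.
RANK = 1.0
print(getKeyWords('Upper bounds for components of a minimal set of '
                  'solutions are given.'))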
def get_keywords(self, article):
    """
    Find the keywords in article and return them in a convenient way.

    :params: article, list of sentences, sentences are lists of strings
    :returns: keywords, list of strings -- extracted keywords
    """
    # Here we save the labels that will NOT be changed.
    labeltype = set()
    if self.label is not None:
        if self.label == 'neutral':
            labeltype.add('B-SPAN')
            labeltype.add('I-SPAN')
        elif self.label == 'propaganda':
            labeltype.add('0')

    # Additional variable that holds parts of speech that will NOT be changed.
    pos_shortcuts = {'NN': 'n', 'JJ': 'adj', 'RB': 'adv', 'VB': 'v'}
    wordtypes = set()
    if self.postype is not None:
        wordtypes = {'n', 'adj', 'adv', 'v'}
        for fig in self.postype:
            wordtypes.discard(fig)

    text = self.get_text(article)
    text = ''.join(c for c in text if c not in '\'"')
    rake = Rake()
    try:
        fig = rake.apply(text)
    except Exception:
        print("Couldn't find keywords, falling back to all words.")
        return self.get_words(article)

    raw_keywords = []
    for string, _ in fig:
        raw_keywords += string.split()
    raw_keywords = set(raw_keywords)

    keywords = {}
    for i, sentence in enumerate(article):
        for comb in sentence:
            word, label = comb.split()
            word = word.lower()
            pos = nltk.pos_tag([word])[0][1]
            if pos in pos_shortcuts:
                pos = pos_shortcuts[pos]
            if word in raw_keywords and label not in labeltype and pos not in wordtypes:
                keywords[word] = i
    return keywords
def _fetch_all_sentences_keywords(self):
    """
    For each sentence object in self.video.sentences, the method gets the
    keywords and saves them in sentence.keywords.

    :return: None
    """
    rake = Rake()
    for sentence in self.video.sentences:
        keywords_result = rake.apply(sentence.text, text_for_stopwords=None)
        # Keep just the keyword strings, dropping the scores.
        keywords = [keyword[0] for keyword in keywords_result]
        sentence.keywords = keywords
def getKeywords(text):
    tokens = text.split()
    processedTokens = []
    # RAKE needs some context; for very short texts just return the tokens.
    if len(tokens) < 3:
        processedTokens = tokens
    else:
        rake = Rake()
        keywords = rake.apply(text)
        # Split each keyword phrase back into individual tokens.
        for keyword, _score in keywords:
            processedTokens.extend(keyword.split())
    return processedTokens
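# Usage sketch (added, not from the original source): texts shorter than
# three tokens skip RAKE and come back as plain tokens; longer texts are
# split into keyword tokens. Both sample strings are assumptions.
print(getKeywords('two words'))  # -> ['two', 'words']
print(getKeywords('systems of linear constraints over natural numbers'))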
def extract_keywords(text):
    # POS-tag the text and collect the singular nouns.
    nltk_text = word_tokenize(text)
    val = nltk.pos_tag(nltk_text)
    rake = Rake()
    keywords = rake.apply(text)
    nouns = [token for token, tag in val if tag == 'NN']
    # Return the last RAKE keyword that is also a noun, or 'nothing'.
    keyword = 'nothing'
    for kw, _score in keywords:
        if kw in nouns:
            keyword = kw
    return keyword
def extract_keywords(self, max_words=1, min_freq=5, num_top_words=10):
    stop_words = get_stop_words('fr')
    rake = Rake(max_words=max_words, min_freq=min_freq,
                language_code="fr", stopwords=stop_words)
    for label in np.unique(self.labels):
        # Concatenate all documents belonging to this cluster.
        corpus_fr = ' '.join(self.data[self.labels == label])
        keywords = rake.apply(corpus_fr)
        top_words = np.array(keywords[:num_top_words])[:, 0]
        self.keywords["Cluster {0}".format(label)] = top_words
    return self.keywords
def process():
    keywords = []
    text = request.form['text_to_process']
    max_kw_length = int(request.form['max_kw_length'])
    if not text:
        abort(404)
    if request.method == 'POST':
        # Load the Indonesian stopword list, one word per line.
        with open("data/stopwords.txt", "r") as f:
            sw = f.read()
        rake = Rake(language_code='id', max_words=max_kw_length,
                    stopwords=set(sw.split("\n")))
        keywords = rake.apply(text)
    return render_template('process.html', keywords=keywords, text=text,
                           max_kw_length=max_kw_length)
def get_keywords(text):
    rake = Rake(
        min_chars=3,
        max_words=1,
        min_freq=1,
        language_code='es',
        stopwords=None,
        lang_detect_threshold=100,
        max_words_unknown_lang=10,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=10,
        generated_stopwords_min_freq=2,
    )
    keywords = rake.apply(text, text_for_stopwords=None)
    return keywords
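# Usage sketch (added, not from the original source): each result is a
# (keyword, score) tuple; the Spanish sample sentence is an assumption.
print(get_keywords('La inteligencia artificial es muy interesante.'))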
from multi_rake import Rake

# text = input()
text = "please tell me the good"
rake = Rake()
keywords = rake.apply(text)

# good = 1, bad = 0
words = dict(good=1, bad=0)
totalcount = 0
goodcount = 0
for word in keywords:
    if word[0] in words:
        totalcount += 1
        # Count the keyword as good when its label in `words` is 1;
        # the original checked the RAKE score (word[1]) instead, which is
        # nearly always truthy and looks like a bug.
        if words[word[0]]:
            goodcount += 1
print(goodcount / totalcount)
pdftxt = "" #The while loop will read each page. while count < num_pages: pageObj = pdfReader.getPage(count) count +=1 pdftxt += pageObj.extractText() txt = pdftxt st.write("File Upload Successful") lang = detect(txt) str1 = "Detected Origin of language : " + ilc.language_name(lang) st.write(str1) #----- RAKE rake = Rake(language_code='es', max_words=1) rakekeywords = rake.apply(txt) if len(rakekeywords) > 25 : rakekeywords = rakekeywords[:25] #----- YAKE max_ngram_size = 3 deduplication_thresold = 0.9 deduplication_algo = 'seqm' windowSize = 1 numOfKeywords = 25 custom_kw_extractor = yake.KeywordExtractor(lan=lang, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None) yakekeywords = custom_kw_extractor.extract_keywords(txt) st.write("Extracting keywords now ...\n")
def getKey(text):
    rake = Rake()
    keywords = rake.apply(text)
    # Return at most the top 7 (keyword, score) pairs.
    return keywords[:7]
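# Usage sketch (added, not from the original source): the sample text is an
# illustrative assumption.
print(getKey('Compatibility of systems of linear constraints over the set '
             'of natural numbers.'))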
print(len(wordsData))
new_dict[dt]['words'] = wordsData
important_words = {}
if len(wordsData) < 5000000:
    # First pass: python-rake style extractor (rake_object.run).
    keyw = rake_object.run(wordsData)
    important_words['rake1'] = keyw[:10]
    print(dt)
    print("Normal Rake: ", keyw[:10])

    # gensim TextRank keywords.
    gen1 = keywords(wordsData, words=10, scores=True, lemmatize=True)
    important_words['gen1'] = gen1
    print("Gen: ", gen1)

    # Second pass: multi_rake extractor.
    rake_keys = rake.apply(wordsData)
    important_words['rake2'] = rake_keys[:10]
    print("Second Rake:", rake_keys[:10])

new_dict[dt]['key_words'] = important_words
with open('raked_dict_upssdated.json', 'w') as f:
    json.dump(new_dict, f)

# Alternative kept from the original, commented out:
# cv = CountVectorizer(max_df=0.85, stop_words=stopwords, max_features=10000)
# dt_mat = cv.fit_transform(wordsData)
# print(list(cv.vocabulary_.keys())[:10])
import logging

from gensim.summarization import keywords
from multi_rake import Rake

from helper import Recipes

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')

# Load recipes.
logging.info("Loading recipes from mongodb")
recipes = Recipes(limit=10000)
recipes = recipes.load()
recipes_text = [recipe["text"] for recipe in recipes]

# Get keywords; renamed so the result does not shadow the gensim import.
logging.info("Extracting keywords.")
# top_keywords = keywords(text=" ".join(recipes_text), words=10, scores=True,
#                         pos_filter=("NN", "NNS"))
rake = Rake(max_words=1, language_code="de", min_freq=500)
top_keywords = rake.apply(" ".join(recipes_text))[:20]
print(top_keywords)
def test_rake():
    rake = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    text_en = (
        'Compatibility of systems of linear constraints over the set of '
        'natural numbers. Criteria of compatibility of a system of linear '
        'Diophantine equations, strict inequations, and nonstrict inequations '
        'are considered. Upper bounds for components of a minimal set of '
        'solutions and algorithms of construction of minimal generating sets '
        'of solutions for all types of systems are given. These criteria and '
        'the corresponding algorithms for constructing a minimal supporting '
        'set of solutions can be used in solving all the considered types of '
        'systems and systems of mixed types.'
    )
    result = rake.apply(text_en)
    result = _postprocess_result(result)
    expected = [
        ('minimal generating sets', 8.666666666666666),
        ('linear diophantine equations', 8.5),
        ('minimal supporting set', 7.666666666666666),
        ('minimal set', 4.666666666666666),
        ('linear constraints', 4.5),
        ('natural numbers', 4.0),
        ('strict inequations', 4.0),
        ('nonstrict inequations', 4.0),
        ('upper bounds', 4.0),
        ('mixed types', 3.666666666666667),
        ('considered types', 3.166666666666667),
        ('set', 2.0),
        ('types', 1.6666666666666667),
        ('considered', 1.5),
        ('compatibility', 1.0),
        ('systems', 1.0),
        ('criteria', 1.0),
        ('system', 1.0),
        ('components', 1.0),
        ('solutions', 1.0),
        ('algorithms', 1.0),
        ('construction', 1.0),
        ('constructing', 1.0),
        ('solving', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    rake_en = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        language_code='en',
    )
    result = rake_en.apply(text_en)
    result = _postprocess_result(result)
    assert result == expected

    rake_with_stopwords = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        stopwords={'of', 'the', 'a', 'and'},
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    result = rake_with_stopwords.apply(text_en)
    result = _postprocess_result(result)
    expected = [
        ('linear constraints over', 9.0),
        ('linear diophantine equations', 9.0),
        ('minimal generating sets', 8.666666666666666),
        ('minimal supporting set', 7.666666666666666),
        ('systems are given', 7.5),
        ('minimal set', 4.666666666666666),
        ('natural numbers', 4.0),
        ('strict inequations', 4.0),
        ('considered types', 4.0),
        ('mixed types', 4.0),
        ('these criteria', 3.5),
        ('set', 2.0),
        ('systems', 1.5),
        ('criteria', 1.5),
        ('compatibility', 1.0),
        ('system', 1.0),
        ('solutions', 1.0),
        ('algorithms', 1.0),
        ('construction', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_esperanto = (
        'Liberalismo estas politika filozofio aŭ mondrigardo konstruita en '
        'ideoj de libereco kaj egaleco. Liberaluloj apogas larĝan aron de '
        'vidpunktoj depende de sia kompreno de tiuj principoj, sed ĝenerale '
        'ili apogas ideojn kiel ekzemple liberaj kaj justaj elektoj, '
        'civitanrajtoj, gazetara libereco, religia libereco, libera komerco, '
        'kaj privata posedrajto. Liberalismo unue iĝis klara politika movado '
        'dum la Klerismo, kiam ĝi iĝis populara inter filozofoj kaj '
        'ekonomikistoj en la okcidenta mondo. Liberalismo malaprobis heredajn '
        'privilegiojn, ŝtatan religion, absolutan monarkion kaj la Didevena '
        'Rajto de Reĝoj. La filozofo John Locke de la 17-a jarcento ofte '
        'estas meritigita pro fondado de liberalismo kiel klara filozofia '
        'tradicio. Locke argumentis ke ĉiu homo havas naturon rekte al vivo, '
        'libereco kaj posedrajto kaj laŭ la socia '
        'kontrakto, registaroj ne rajtas malobservi tiujn rajtojn. '
        'Liberaluloj kontraŭbatalis tradician konservativismon kaj serĉis '
        'anstataŭigi absolutismon en registaroj per reprezenta demokratio kaj '
        'la jura hegemonio.'
    )
    result = rake.apply(text_esperanto)
    result = _postprocess_result(result)
    expected = [
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    rake_max_words_unknown_lang_none = Rake(
        min_chars=3,
        max_words=3,
        min_freq=1,
        lang_detect_threshold=50,
        max_words_unknown_lang=None,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    result = rake_max_words_unknown_lang_none.apply(text_esperanto)
    result = _postprocess_result(result)
    expected = [
        ('filozofo john locke', 9.0),
        ('serĉis anstataŭigi absolutismon', 9.0),
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_for_stopwords = 'de en la kaj al' * 20
    result = rake.apply(text_esperanto, text_for_stopwords)
    result = _postprocess_result(result)
    expected = [
        ('vidpunktoj depende', 4.0),
        ('sia kompreno', 4.0),
        ('tiuj principoj', 4.0),
        ('justaj elektoj', 4.0),
        ('libera komerco', 4.0),
        ('okcidenta mondo', 4.0),
        ('ŝtatan religion', 4.0),
        ('absolutan monarkion', 4.0),
        ('didevena rajto', 4.0),
        ('socia kontrakto', 4.0),
        ('jura hegemonio', 4.0),
        ('gazetara libereco', 3.5),
        ('religia libereco', 3.5),
        ('privata posedrajto', 3.5),
        ('libereco', 1.5),
        ('posedrajto', 1.5),
        ('ideoj', 1.0),
        ('egaleco', 1.0),
        ('civitanrajtoj', 1.0),
        ('klerismo', 1.0),
        ('ekonomikistoj', 1.0),
        ('reĝoj', 1.0),
        ('vivo', 1.0),
        ('laŭ', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected

    text_numbers = '123, 123, 123, 123'
    result = rake.apply(text_numbers)
    assert result == [('123', 0)]

    rake_min_freq2 = Rake(
        min_chars=3,
        max_words=3,
        min_freq=2,
        lang_detect_threshold=50,
        max_words_unknown_lang=2,
        generated_stopwords_percentile=80,
        generated_stopwords_max_len=3,
        generated_stopwords_min_freq=2,
    )
    text_starts_with_stopword = 'and keywords... keywords are the best words'
    result = rake_min_freq2.apply(text_starts_with_stopword)
    assert result == [('keywords', 1.0)]

    with pytest.raises(NotImplementedError):
        Rake(language_code='xxx')

    rake_uk = Rake(
        min_chars=3,
        max_words=4,
        min_freq=1,
        language_code='uk',
    )
    text_en_uk = (
        'Compatibility of systems of linear constraints над the set of '
        'natural numbers. Criteria of compatibility of a system of linear '
        'Diophantine equations, strict inequations, та nonstrict inequations '
        'are considered. Upper bounds для components of a minimal set of '
        'solutions та algorithms of construction of minimal generating sets '
        'of solutions для всіх types of systems are given. Ці criteria та '
        'the corresponding algorithms для constructing a minimal supporting '
        'set of solutions може бути used в solving всіх the considered types '
        'of systems та systems of mixed types.'
    )
    result = rake_uk.apply(text_en_uk)
    result = _postprocess_result(result)
    expected = [
        ('minimal set of solutions', 15.6),
        ('systems of mixed types', 15.6),
        ('nonstrict inequations are considered', 15.0),
        ('criteria of compatibility of', 13.7),
        ('the corresponding algorithms', 9.0),
        ('components of', 5.6),
        ('strict inequations', 5.0),
        ('upper bounds', 4.0),
        ('criteria', 2.5),
        ('constructing', 1.0),
        ('used', 1.0),
        ('solving', 1.0),
    ]
    expected = _postprocess_result(expected)
    assert result == expected
def keyword_extraction(transcript):
    rake = Rake(max_words=2, min_freq=2)
    keywords = rake.apply(transcript)
    # Return the top 5 keyword strings.
    return [item[0] for item in keywords[:5]]
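# Usage sketch (added, not from the original source): with min_freq=2 a
# keyword must occur at least twice in the transcript, so the sample below
# repeats its key phrase on purpose. The text is an illustrative assumption.
sample_transcript = ('machine learning needs data. '
                     'machine learning needs compute.')
print(keyword_extraction(sample_transcript))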
def reviewscore(request):
    rake = Rake()
    positive = request.POST['positive']
    negative = request.POST['negative']
    if len(positive) == 0:
        positive = "No positive"
    if len(negative) == 0:
        negative = "No negative"

    positiveResult = rake.apply(positive)
    negativeResult = rake.apply(negative)

    # Sum the RAKE scores of each half of the review.
    positiveScore = sum(result[1] for result in positiveResult)
    negativeScore = sum(result[1] for result in negativeResult)

    totalScore = positiveScore - negativeScore
    expectedReviewScore = 0.18 * totalScore + 8.31
    # Limit expected score range from 0 to 10.
    if expectedReviewScore > 10.0:
        expectedReviewScore = 10.0
    elif expectedReviewScore < 0.0:
        expectedReviewScore = 0.0
    else:
        expectedReviewScore = round(expectedReviewScore, 2)

    # Import actual data for the actual user score.
    reviewsRawData = pd.read_csv(
        "../data/Hotel_Reviews.csv",
        usecols=['Positive_Review', 'Negative_Review', 'Reviewer_Score'])
    resultTuple = reviewsRawData[
        reviewsRawData["Positive_Review"].str.contains(positive)]
    actual = ' | This review is not from database'
    analysis = ''
    # Handle the case where the review doesn't exist in the database.
    if len(resultTuple["Reviewer_Score"].values) > 0:
        resultVal = resultTuple["Reviewer_Score"].values
        if resultVal[0] > 0.0:
            tempVal = resultVal[0]
            actual = ' | Actual ' + str(resultVal[0]) + ' | Accuracy: '
            if expectedReviewScore > resultVal[0]:
                diff = expectedReviewScore - resultVal[0]
                analysis = str(
                    round(100 - (diff / expectedReviewScore * 100), 2))
            else:
                diff = resultVal[0] - expectedReviewScore
                analysis = str(round(100 - (diff / tempVal * 100), 2))
            analysis = analysis + '%'

    result = ("User Rating: Predicted " + str(expectedReviewScore)
              + actual + analysis)
    return HttpResponse(result)
# keywords = rake.apply(clear_text)
# print(keywords)

with open('skills.txt') as f:
    num_lines = f.read().count('\n')
bar = IncrementalBar('Processing', max=num_lines)

keywords_hash = {}
with open('skills.txt') as input_file:
    for line in input_file:
        bar.next()
        keywords = []
        # Extract keywords per comma-separated entry; the original passed
        # the whole line to preprocess_text here, which looks like a bug.
        for enum in line.split(','):
            clear_line = preprocess_text(enum)
            enum_keywords = rake.apply(clear_line)
            keywords += enum_keywords
        for keyword, score in keywords:
            occurence_data = {'score': score, 'text': line.strip()}
            keywords_hash.setdefault(keyword, []).append(occurence_data)
bar.finish()

keywords = [{
    "normal": keyword,
    "variants": variants
def keywords():
    # entry = {
    #     "text": "La inteligencia artifical es muy interesante. Los elefantes son animales grandes. La ballena es el "
    #             "mamífero más grande del mundo. la jirafa tiene cuello largo. Plutón no es un planeta, "
    #             "Saturno tampoco. Dios es real. Aguante Perón carajo. "
    stopwords = {
        'a', 'á', 'acerca', 'además', 'adonde', 'al', 'algo', 'algún',
        'alguna', 'algunas', 'alguno', 'algunos', 'allende', 'ambos', 'amén',
        'ampleamos', 'ante', 'antes', 'aquel', 'aquella', 'aquellas',
        'aquellos', 'aqui', 'arriba', 'atras', 'aun', 'bajo', 'bastante',
        'bien', 'cabe', 'cabo', 'cada', 'cierta', 'ciertas', 'cierto',
        'ciertos', 'circa', 'como', 'con', 'conmigo', 'connosco',
        'conseguimos', 'conseguir', 'consigo', 'consigue', 'consiguen',
        'consigues', 'contigo', 'contra', 'convosco', 'convusco', 'cual',
        'cuando', 'de', 'dejante', 'del', 'delas', 'denominada',
        'denominadas', 'denominado', 'denominados', 'dentro', 'desde',
        'después', 'donde', 'dos', 'durante', 'e', 'el', 'él', 'ella',
        'ellas', 'ellos', 'empleais', 'emplean', 'emplear', 'empleas',
        'empleo', 'en', 'encima', 'entonces', 'entre', 'era', 'erais',
        'eramos', 'éramos', 'eran', 'erar', 'eras', 'eres', 'es', 'esa',
        'esas', 'ese', 'eso', 'esos', 'esta', 'está', 'estaba', 'estabais',
        'estábamos', 'estaban', 'estabas', 'estad', 'estada', 'estadas',
        'estado', 'estados', 'estais', 'estáis', 'estamos', 'estan',
        'están', 'estando', 'estar', 'estará', 'estarán', 'estarás',
        'estaré', 'estaréis', 'estaremos', 'estaría', 'estaríais',
        'estaríamos', 'estarían', 'estarías', 'estas', 'estás', 'este',
        'esté', 'estéis', 'estemos', 'estén', 'estés', 'esto', 'estos',
        'estoy', 'estuve', 'estuviera', 'estuvierais', 'estuviéramos',
        'estuvieran', 'estuvieras', 'estuvieron', 'estuviese',
        'estuvieseis', 'estuviésemos', 'estuviesen', 'estuvieses',
        'estuvimos', 'estuviste', 'estuvisteis', 'estuvo', 'excepto',
        'existente', 'existentes', 'fin', 'fue', 'fuera', 'fuerais',
        'fuéramos', 'fueran', 'fueras', 'fueron', 'fuerza', 'fuese',
        'fueseis', 'fuésemos', 'fuesen', 'fueses', 'fui', 'fuimos',
        'fuiste', 'fuisteis', 'gueno', 'ha', 'habéis', 'haber', 'había',
        'habíais', 'habíamos', 'habían', 'habías', 'habida', 'habidas',
        'habido', 'habidos', 'habiendo', 'habrá', 'habrán', 'habrás',
        'habré', 'habréis', 'habremos', 'habría', 'habríais', 'habríamos',
        'habrían', 'habrías', 'hace', 'haceis', 'hacemos', 'hacen',
        'hacer', 'haces', 'hacia', 'hago', 'han', 'has', 'hasta', 'hay',
        'haya', 'hayáis', 'hayamos', 'hayan', 'hayas', 'haz', 'he', 'hemo',
        'hemos', 'hube', 'hubiera', 'hubierais', 'hubiéramos', 'hubieran',
        'hubieras', 'hubieron', 'hubiese', 'hubieseis', 'hubiésemos',
        'hubiesen', 'hubieses', 'hubimos', 'hubiste', 'hubisteis', 'hubo',
        'incluso', 'intenta', 'intentais', 'intentamos', 'intentan',
        'intentar', 'intentas', 'intento', 'ir', 'la', 'largo', 'las',
        'le', 'les', 'lo', 'los', 'más', 'me', 'mediante', 'menos', 'mi',
        'mí', 'mía', 'miar', 'mías', 'mientras', 'mio', 'mío', 'míos',
        'mis', 'modode', 'mucho', 'muchos', 'muy', 'na', 'nada', 'ni',
        'no', 'nos', 'nosotras', 'nosotros', 'nuestra', 'nuestras',
        'nuestro', 'nuestros', 'nunca', 'o', 'os', 'otra', 'otras', 'otro',
        'otros', 'pa', 'pa\'', 'par', 'para', 'pero', 'poco', 'podeis',
        'podemos', 'poder', 'podria', 'podriais', 'podriamos', 'podrian',
        'podrias', 'por', 'porque', 'primero', 'pro', 'puede', 'pueden',
        'puedo', 'pues', 'que', 'qué', 'quien', 'quienes', 'sabe',
        'sabeis', 'sabemos', 'saben', 'saber', 'sabes', 'salvo', 'se',
        'sea', 'seáis', 'seamos', 'sean', 'seas', 'según', 'sentid',
        'sentida', 'sentidas', 'sentido', 'sentidos', 'sentir', 'ser',
        'será', 'serán', 'serás', 'seré', 'seréis', 'seremos', 'sería',
        'seríais', 'seríamos', 'serían', 'serías', 'si', 'sí', 'sido',
        'siendo', 'siente', 'sin', 'sintiendo', 'so', 'sobre', 'sois',
        'solamente', 'solo', 'somos', 'son', 'soy', 'su', 'sus', 'suya',
        'suyas', 'suyo', 'suyos', 'también', 'tanto', 'te', 'tendrá',
        'tendrán', 'tendrás', 'tendré', 'tendréis', 'tendremos', 'tendría',
        'tendríais', 'tendríamos', 'tendrían', 'tendrías', 'tened',
        'teneis', 'tenéis', 'tenemos', 'tener', 'tenga', 'tengáis',
        'tengamos', 'tengan', 'tengas', 'tengo', 'tenía', 'teníais',
        'teníamos', 'tenían', 'tenías', 'tenida', 'tenidas', 'tenido',
        'tenidos', 'teniendo', 'ti', 'tiempo', 'tiene', 'tienen', 'tienes',
        'todo', 'todos', 'trabaja', 'trabajais', 'trabajamos', 'trabajan',
        'trabajar', 'trabajas', 'trabajo', 'tras', 'tu', 'tú', 'tus',
        'tuve', 'tuviera', 'tuvierais', 'tuviéramos', 'tuvieran',
        'tuvieras', 'tuvieron', 'tuviese', 'tuvieseis', 'tuviésemos',
        'tuviesen', 'tuvieses', 'tuvimos', 'tuviste', 'tuvisteis', 'tuvo',
        'tuya', 'tuyas', 'tuyo', 'tuyos', 'ultimar', 'ultimo', 'un',
        'una', 'unas', 'uno', 'unos', 'usa', 'usais', 'usamos', 'usan',
        'usar', 'usas', 'utilizando', 'uso', 'va', 'vais', 'valor',
        'vamos', 'van', 'vaya', 'verdad', 'verdadera', 'verdadero',
        'versus', 'vía', 'vosostras', 'vosostros', 'vosotras', 'vosotros',
        'voy', 'vuestra', 'vuestras', 'vuestro', 'vuestros', 'vusco', 'y',
        'ya', 'yo', 'optimizando', 'actualmente', 'llevar', 'manera',
        'podrán', 'reduciendo', 'brindar'
    }

    text_es = request.json['text']
    stopwords_list = request.json['stopwords']
    stopwords_list_clean = re.sub(r'[\[\]]', '', stopwords_list).split(", ")
    stopwords.update(stopwords_list_clean)
    print(stopwords)

    # Convert markdown to plain text and strip URLs and markup noise.
    html = markdown.markdown(text_es)
    plain_text = html2text.html2text(html)
    plain_text = re.sub(r'[!@·_•*\[\]/#$]', '', plain_text)
    plain_text = plain_text.replace('\\n', '. ')
    plain_text = re.sub(r'http\S+', '', plain_text, flags=re.MULTILINE)
    plain_text = re.sub(r'www\S+', '', plain_text, flags=re.MULTILINE)

    result = []

    # Two-word keywords.
    rake = Rake(language_code='es', max_words=2,
                generated_stopwords_max_len=20, stopwords=stopwords)
    key_words_2 = rake.apply(plain_text)
    keys2 = key_words_2[:20]
    for key in keys2:
        item = {"name": key[0], "score": key[1]}
        result.append(item)

    # One-word keywords.
    rake = Rake(language_code='es', max_words=1,
                generated_stopwords_max_len=20, stopwords=stopwords)
    key_words_1 = rake.apply(plain_text)
    keys1 = key_words_1[:50]
    for key in keys1:
        if not resultContains(result, key[0]):
            item = {"name": key[0], "score": key[1]}
            if len(result) < 40:
                result.append(item)

    response = {"keywords": result}
    print(response)
    return response
def home():
    score = 0
    results_score = ''
    results_feedback = ''
    reaction = ''
    temp = ''
    temp2 = ''
    terms_form = ''
    res_good = []
    res_bad = []

    # Phrases that signal user-hostile terms; values count occurrences.
    bad_words = dict.fromkeys([
        "proprietary notice language", "reasonable attorneys’ fees",
        "assume total responsibility", "communication line failure",
        "attorneys’ fees", "similar fees", "applicable prices",
        "publicly displayed", "manipulate identifiers", "losses incurred",
        "injuries caused", "irreparable harm", "computer virus",
        "apple’s failure", "apple’s control", "governmental request",
        "out-of-pocket expenses", "oral agreements", "destructive features",
        "punitive damages", "monetary damages",
        "third-party applications connected", "re-export control laws",
        "modified additional terms", "stop providing services",
        "expressly override", "constantly changing", "non-exclusive license",
        "remove functionalities", "apply retroactively",
        "alleged infringing material", "affiliated companies",
        "manual process", "mail lists", "reverse engineer", "trade secret",
        "accounting fees", "lost data", "external websites",
        "fully responsible", "password information", "post advertisements",
        "conditions waive", "remove communications",
    ], 0)

    # Phrases that signal user-friendly terms; values count occurrences.
    good_words = dict.fromkeys([
        "intellectual property rights", "account information secure",
        "completely private", "good faith", "accessible worldwide",
        "equitable relief", "relief granted", "competent jurisdiction",
        "reasonable time", "copyrights rights", "information secure",
        "apple’s liability", "reasonable advance notice",
        "party beneficiary rights", "open source license",
        "open source software", "legal notices displayed", "safety laws",
        "password confidential", "malware detection", "privacy",
        "worldwide license", "submit feedback",
        "reasonable requests assisting", "good faith belief",
        "limitation security-related features", "legally binding agreement",
        "license rights granted", "copyright owner's behalf",
        "license includes access", "royalty-free license",
        "confidential information",
    ], 0)

    rake = Rake()
    if request.method == "POST":
        terms_form = request.form.get("input")
        kw = rake.apply(terms_form)
        for word in kw:
            if word[0] in good_words:
                good_words[word[0]] += 1
                res_good.append(word[0])
            if word[0] in bad_words:
                bad_words[word[0]] += 1
                res_bad.append(word[0])
        # Guard against division by zero when no known phrases matched.
        if res_good or res_bad:
            score = round(
                len(res_good) / (len(res_good) + len(res_bad)) * 175, 2)
        results_score = "The score is {}%".format(score)
        if 0 <= score <= 50:
            results_feedback = "Poor"
        elif 50 < score <= 65:
            results_feedback = "Average"
        elif 65 < score <= 80:
            results_feedback = "Good"
        elif 80 < score <= 100:
            results_feedback = "Excellent"
        temp = ", ".join(res_good)
        temp2 = ", ".join(res_bad)

    return render_template('index.html', results_score=results_score,
                           results_feedback=results_feedback,
                           reaction=reaction, output=temp, output2=temp2)
    generated_stopwords_percentile=80,
    generated_stopwords_max_len=3,
    generated_stopwords_min_freq=2,
)

docs = []  # to store the opened trainFiles
for path in trainFiles:
    # Open each file in trainFiles and store its text in docs.
    with open(path, encoding="latin-1") as raw:
        docs.append(raw.read())

wordsByRake = []  # keywords from each doc after applying Rake
for doc in docs:
    wordsByRake.append(rake.apply(doc))

def listOfLists(lst):
    # Keep keywords scoring above 4.0; the threshold can be changed
    # according to the required number of tags.
    temp = []
    for keyword, score in lst:
        if score > 4.0:
            temp.append(keyword)
    return temp

for i in range(len(wordsByRake)):
    trainDocs.insert(i, listOfLists(wordsByRake[i]))

wordsByRake = []  # release memory
docs = []  # release memory

nlp = en_core_web_sm.load()  # loading the spaCy model
        ans.append(sentences[idx + 1])
sentences = []

# Further refine question and answer sentences.
del questions[1::2]
del ans[1::2]

# Trim data to the defined size.
questions = questions[:size]
ans = ans[:size]

# Create a keyword for each question using RAKE; fall back to the first
# token when RAKE finds nothing.
key = []
for idx, q in enumerate(questions):
    keyword = rake.apply(" ".join(re.findall(r"\w+", q.lower())))
    if len(keyword) != 0:
        key.append(keyword[0][0])
    else:
        key.append(line.split()[0])

try:
    with open("data.pickle", "rb") as f:
        words, labels, training, output = pickle.load(f)
except Exception:
    words = []
    labels = []
    docs_x = []
    docs_y = []