def get_phrases(text=''):
    """Return at most the five highest-ranked RAKE phrases found in *text*.

    ``text`` may be a string or an iterable of strings; it is concatenated
    with ``''.join`` before extraction.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(''.join(text))
    ranked = extractor.get_ranked_phrases()
    # A slice already copes with fewer than five phrases.
    return ranked[:5]
def phrase(ques):
    """Return the lower-cased words of every ranked RAKE phrase in *ques*.

    Phrases are visited in ranking order and each one is split on
    whitespace, so the result is a flat list of words.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(ques)
    return [
        word
        for ranked_phrase in extractor.get_ranked_phrases()
        for word in ranked_phrase.lower().split()
    ]
# Pull every non-null pre-processed response for the 'Meeting' experience and
# run RAKE over the concatenation of all of them.
qry = """ select b.EXPERIENCE,b.QuestionTitle, a.Preprocessed_Responses TextForAnalysis from [CUS].[t_New_Preprocessed_Responses] a join [CUS].[t_SurveyQuestions_FY18H2GESS_Hierarchy] b on a.questionid=b.QuestionID where b.experience='Meeting' and a.Preprocessed_responses is not null """

try:
    cur.execute(qry)
    rows = cur.fetchall()

    # One document for the whole result set (not one per row).
    doc = ' '.join(row.TextForAnalysis for row in rows)

    r = Rake()
    r.extract_keywords_from_text(doc)
    keywords = r.get_ranked_phrases_with_scores()
    print(keywords)

    # NOTE(review): an earlier per-row variant (truncate
    # [CUS].[t1_ProcessingTextRank], then insert ID/keywords/summary for each
    # row) was left commented out here; nothing is written in this version,
    # so the commit below is effectively a no-op.
    con.commit()
finally:
    # Always release the connection, even when the query or extraction fails.
    con.close()
print("Completed!")
# Optional preprocessing selected on the command line.
if args["preprocess"] == "thresh":
    # Otsu thresholding to separate foreground text from background.
    gray = cv2.threshold(gray, 0, 255,
                         cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
elif args["preprocess"] == "blur":
    # Median blurring to remove noise.
    gray = cv2.medianBlur(gray, 3)

# Write the grayscale image to disk as a temporary file so we can apply OCR
# to it; the process id keeps the name unique per running process.
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)

# Load the image as a PIL/Pillow image and apply OCR. The temporary file is
# removed even when OCR fails, and the image handle is closed before the
# removal (an open handle blocks deletion on some platforms).
try:
    with Image.open(filename) as ocr_image:
        text = pytesseract.image_to_string(ocr_image).encode('utf-8')
finally:
    os.remove(filename)
print(text)

# Get keywords from the recognised text.
r = Rake()
r.extract_keywords_from_text(text.decode('utf-8'))
keywords = r.get_ranked_phrases_with_scores()
print("\n\nKEYWORDS:\n")
for word in keywords:
    print(word)

# show the output images
#cv2.imshow("Image", image)
#cv2.imshow("Output", gray)
#cv2.waitKey(0)
def get_meta_keywords(self):
    """Return up to ten top-ranked RAKE phrases extracted from ``self.body``."""
    extractor = Rake()
    extractor.extract_keywords_from_text(self.body)
    ranked = extractor.get_ranked_phrases()
    return ranked[:10]
def keyword_processing(transcript) -> list:
    """Return the ranked RAKE phrases of *transcript*.

    The extractor is configured with ``min_length=2`` and ``max_length=4``.
    """
    extractor = Rake(min_length=2, max_length=4)
    extractor.extract_keywords_from_text(transcript)
    return extractor.get_ranked_phrases()
import operator
from rake_nltk import Rake
from rake_nltk import Metric
import diffbot

# Read the newline-separated word list; the context manager closes the file
# even when the read fails.
with open('finalList.txt', 'r', encoding='utf-8') as f:
    words = f.read()
stop = words.split('\n')
# NOTE(review): `stop` is never passed to Rake below - presumably it was
# meant to be used as `stopwords=stop`; confirm before changing.

r = Rake(ranking_metric=Metric.WORD_FREQUENCY)

url = 'https://www.cnn.com/2020/08/21/politics/peter-rafael-dzibinski-debbins-green-beret-russia/index.html'
urlNoNames = 'https://www.britannica.com/science/influenza'

# NOTE(review): the API token is committed in source; it should be rotated
# and loaded from configuration instead.
json_result = diffbot.article(urlNoNames, token='d656578220cbf622d16575aba331d47d')

# Rank phrases in the article text returned for the first extracted object.
words = (json_result['objects'][0]['text'])
r.extract_keywords_from_text(words)
result = r.get_ranked_phrases_with_scores()
print(result)
def keywords(x):
    """Return the words RAKE scored in *x*, joined by single spaces.

    The words are the keys of ``Rake.get_word_degrees()``.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(x)
    word_degrees = extractor.get_word_degrees()
    # Iterating the mapping yields its keys.
    return ' '.join(word_degrees)
def extract_key_word_rank(text):
    """Print the ranked RAKE phrases of *text*, a separator, then the word degrees."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases()
    print(ranked_phrases)
    print("==============================")
    word_degrees = extractor.get_word_degrees()
    print(word_degrees)
mc_client = base.Client(('127.0.0.1', 11211), allow_unicode_keys=True)

# NOTE(review): the nesting below was reconstructed from statement order; in
# particular, whether the two trailing prints ran once per experience or once
# at the end cannot be told from the source - confirm.
with open('results.txt', 'a+') as result_file:
    for experience in experienceData:
        experienceText = experience[2]
        # Skip float values (presumably NaN for missing text). isinstance
        # also covers float subclasses such as numpy.float64, which the
        # former `type(...) == float` comparison let through.
        if isinstance(experienceText, float):
            continue

        r = Rake(stopwords=stop_words)
        # Strip lines that start with a link.
        experienceText = re.sub(r'^https?://.*[\r\n]*', '', experienceText, flags=re.MULTILINE)
        r.extract_keywords_from_text(experienceText)
        phraselist = r.get_ranked_phrases()

        # get_skills() takes no arguments, so it presumably reads these
        # module-level names; they are reset for every experience.
        experienceduration = []
        setofskills = []
        setofphrases = []
        found_skills = get_skills()
        print(found_skills)

        for result in found_skills:
            result_file.write(f"{result}\n")
            result_file.flush()

        complete_found_skill_list.append(experienceText)
        print("==================================================")
        print(complete_found_skill_list)
def getMovies(title):
    """Return the ten movies most similar to *title*.

    Reads ``Movies.csv``, builds one bag-of-words string per movie from its
    genre, director, first three actors and the RAKE keywords of its plot,
    and ranks movies by cosine similarity of the resulting count vectors.

    Args:
        title: Value of the ``Title`` column to find recommendations for.

    Returns:
        A list of up to ten titles in decreasing similarity order. The first
        entry of the ranking (expected to be the movie itself) is skipped.

    Raises:
        IndexError: if *title* does not occur in the ``Title`` column.
    """
    df = pd.read_csv('Movies.csv')
    # Title, genre, director, actors and plot drive the recommendation.
    df = df[['Title', 'Genre', 'Director', 'Actors', 'Plot']].copy()

    # Column-wise transforms: assigning to `row` inside df.iterrows() is not
    # guaranteed to write back to the frame, so nothing is mutated per row.
    # First and last names are merged so that different people sharing a
    # first name do not count as a match.
    df['Actors'] = df['Actors'].map(
        lambda x: [name.lower().replace(' ', '') for name in x.split(',')[:3]])
    df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))
    df['Director'] = df['Director'].map(lambda x: ''.join(x.split(' ')).lower())

    rake = Rake()

    def plot_keywords(plot):
        # Unique words RAKE scored in the plot text.
        rake.extract_keywords_from_text(plot)
        return list(rake.get_word_degrees().keys())

    df['Key_words'] = df['Plot'].map(plot_keywords)
    df = df.drop(columns=['Plot']).set_index('Title')

    # Same token order as before: genre, director, actors, plot keywords.
    bag_of_words = [
        ' '.join(genre) + ' ' + director + ' ' + ' '.join(actors) + ' '
        + ' '.join(key_words) + ' '
        for genre, director, actors, key_words in zip(
            df['Genre'], df['Director'], df['Actors'], df['Key_words'])
    ]

    # Count matrix and pairwise cosine similarity.
    count_matrix = CountVectorizer().fit_transform(bag_of_words)
    cosineSimilarityMatrix = cosine_similarity(count_matrix, count_matrix)

    titleIndex = pd.Series(df.index)
    matches = titleIndex[titleIndex == title]
    if matches.empty:
        raise IndexError('title not found in Movies.csv: {!r}'.format(title))
    idx = matches.index[0]

    # Highest scores first; position 0 is skipped as the unit-similarity
    # entry of the movie with itself.
    scoreSeries = pd.Series(cosineSimilarityMatrix[idx]).sort_values(ascending=False)
    topMovies = scoreSeries.iloc[1:11].index

    titles = list(df.index)
    return [titles[i] for i in topMovies]
def get_text_keyword(text):
    """Return the RAKE phrases of *text*, ranked highest first."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text=text)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases
def jsonToTxt(jsonString):
    """Convert a JSON string into plain text for keyword analysis.

    The input is split on the literal ``"text": `` marker. For every piece
    the first character is dropped and everything up to the next double
    quote is kept; each non-empty result is appended followed by a space.
    The piece before the first marker is processed the same way.
    """
    pieces = []
    for chunk in jsonString.split('"text": '):
        value = chunk[1:].split('"', 1)[0]
        if value:
            pieces.append(value + " ")
    return "".join(pieces)


def justDoIt():
    """Run handwriting recognition on an uploaded image and return its top keyword phrase.

    Picks the last entry of ``os.listdir`` for the XAMPP upload directory,
    submits the file to the RecognizeText REST endpoint, waits ten seconds,
    fetches the result and returns the highest-ranked RAKE phrase of the
    recognised text.

    Returns:
        The top-ranked phrase, or ``None`` when the REST calls fail or no
        phrase is found.

    Raises:
        SystemExit: when the first REST call does not answer with status 202.
        IndexError: when the upload directory is empty.
    """
    from rake_nltk import Rake

    # NOTE(review): the subscription key is committed in source; it should be
    # rotated and loaded from configuration instead.
    subscription_key = 'd101f6aafa5c44208ead247cfb3d8b32'

    # The region must be the one the subscription key was obtained for.
    uri_base = 'eastus2.api.cognitive.microsoft.com'

    headers = {
        # The image bytes are sent directly in the request body.
        'Content-Type': 'application/octet-stream',
        'Ocp-Apim-Subscription-Key': subscription_key,
    }

    filename1 = '/Applications/XAMPP/xamppfiles/htdocs/Uploads/'
    yaga = os.listdir(filename1)
    # NOTE(review): os.listdir order is arbitrary, so "last" is not
    # necessarily the most recent upload - confirm the intent.
    filename3 = filename1 + yaga[-1]
    print(filename3)

    with open(filename3, 'rb') as k:
        body = k.read()

    # For printed text, set "handwriting" to false.
    params = urllib.urlencode({'handwriting': 'true'})

    jsonInput = None
    try:
        # Two REST calls are required: one to submit the image for
        # processing, one to retrieve the text found in the image.
        conn = httplib.HTTPSConnection(uri_base)
        conn.request("POST", "/vision/v1.0/RecognizeText?%s" % params, body, headers)
        response = conn.getresponse()

        # Success is indicated by a status of 202.
        if response.status != 202:
            # Display JSON data and exit if the first call was not successful.
            parsed = json.loads(response.read())
            print ("Error:")
            print (json.dumps(parsed, sort_keys=True, indent=2))
            conn.close()
            exit()

        # 'Operation-Location' holds the URI to retrieve the recognised text.
        operationLocation = response.getheader('Operation-Location')
        parsedLocation = operationLocation.split(uri_base)
        answerURL = parsedLocation[1]

        # Recognition is asynchronous; the result may not be ready
        # immediately, so wait a fixed time before fetching it.
        time.sleep(10)

        conn = httplib.HTTPSConnection(uri_base)
        conn.request("GET", answerURL, '', headers)
        response = conn.getresponse()
        data = response.read()

        parsed = json.loads(data)
        print ("Response:")
        jsonInput = json.dumps(parsed, sort_keys=True, indent=2)
        conn.close()
    except Exception as e:
        print('Error:')
        print(e)

    # Without a response there is nothing to analyse; previously this path
    # failed with a NameError on the undefined result.
    if jsonInput is None:
        return None

    stank = jsonToTxt(jsonInput)

    # Find the keywords in the recognised text.
    r = Rake()
    r.extract_keywords_from_text(stank)
    b = r.get_ranked_phrases()
    if not b:
        return None
    print(b[0])
    return b[0]
os.remove(filename)

# Print the OCR results: the question, then the three answers.
print(fixed_question)
print()
for ocr_answer in (ocr_a1, ocr_a2, ocr_a3):
    print(ocr_answer)
print()

# Reduce the OCR'd question to its keyword phrases. Rake() defaults to the
# English NLTK stopwords and all punctuation characters.
r = Rake()
r.extract_keywords_from_text(fixed_question)
phrases = r.get_ranked_phrases()  # ranked highest to lowest
quoted_phrases = ['"{0}"'.format(w) for w in phrases]
phrases_clean = ' '.join(quoted_phrases)
print("Original Question: ", fixed_question)
print("Extracted Phrases: ", phrases_clean)
print()

# Google query: the phrases, requiring the first answer and excluding the
# other two.
search = (phrases_clean, " +", "\"", ocr_a1, "\"", " -", "\"", ocr_a2, "\"",
          " -", "\"", ocr_a3, "\"")
searchclean1 = phrases_clean + ' +"' + ocr_a1 + '" -"' + ocr_a2 + '" -"' + ocr_a3 + '"'
r1 = requests.get("https://www.google.com/search", params={'q': searchclean1})
print(sentence)
sentence.draw()

from polyglot.text import Text

# Print every sentence followed by the entities polyglot tagged in it.
text = Text(text)
for sent in text.sentences:
    print(sent, "\n")
    for entity in sent.entities:
        print(entity.tag, entity)

with open(r'text file', encoding='utf-8') as file:
    novels = file.read()
print(novels[:106])

import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
# Arabic stopwords from the NLTK corpus.
arb_stopwords = stopwords.words('arabic')

# Each punctuation mark must be its own entry. Calling .split() on a string
# that contains no whitespace produced a single multi-character element, so
# no individual mark was ever ignored.
arb_punctuations = set(',./»:،؛":.,’،\'')

rake = Rake(stopwords=arb_stopwords,
            punctuations=arb_punctuations,
            language='arabic',
            max_length=3)
rake.extract_keywords_from_text(novels)
for phrase in rake.get_ranked_phrases()[:24]:
    print(phrase)
def extract_keywords(text):
    """Return up to twenty top-ranked RAKE phrases extracted from *text*."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()[:20]
#!/usr/bin/python3
# coding: utf-8
# pip install rake-nltk
from rake_nltk import Rake
from nltk import tokenize

# By default Rake uses the English stopwords from NLTK and all punctuation
# characters.
r = Rake()

##################################################################
## Extraction from a single text.
mytext = '''Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types.'''

r.extract_keywords_from_text(mytext)

# Keyword phrases ranked highest to lowest. Recorded output:
# ['linear diophantine equations', 'minimal generating sets', 'minimal supporting set', 'minimal set', 'upper bounds', 'strict inequations', 'nonstrict inequations', 'mixed types', 'corresponding algorithms', 'considered types', 'types', 'considered', 'algorithms', 'used', 'systems', 'system', 'solving', 'solutions', 'given', 'criteria', 'construction', 'constructing', 'components', 'compatibility']
print(r.get_ranked_phrases())

# The same phrases as (score, phrase) pairs. Recorded output starts with:
# [(9.0, 'linear diophantine equations'), (8.666666666666666, 'minimal generating sets'), (8.166666666666666, 'minimal supporting set'), (5.166666666666666, 'minimal set'), (4.0, 'upper bounds'), ...]
# and ends with the single words scored 1.0, e.g. (1.0, 'compatibility').
print(r.get_ranked_phrases_with_scores())

##################################################################
## Extraction from a list of strings where each string is a sentence.
sentences = tokenize.sent_tokenize(mytext)
r.extract_keywords_from_sentences(sentences)

# The recorded output of both calls below is identical to the output of the
# text-based extraction above.
print(r.get_ranked_phrases())
print(r.get_ranked_phrases_with_scores())
if column == 'main_speaker': final_ted[column] = final_ted[column].apply(combine_string) if column in ['title', 'url', 'tags']: continue final_ted[column] = final_ted[column].apply(remove_punctuation) # Distilling TED Talk description down to key words for each talk final_ted['key_words'] = "" for index, row in final_ted.iterrows(): desc = row['description'] # Uses a NLTK Rake object. English stopwords and punctuation removed. rake = Rake() # Extracting key words from TED Talk description rake.extract_keywords_from_text(desc) # Key words and scores for key words dict_keys_scores = rake.get_word_degrees() # assigning the key words to the new column for the corresponding movie row['key_words'] = [row['main_speaker']] + list(dict_keys_scores.keys()) # Removing description column final_ted.drop(columns=['description'], inplace=True) # New data frame with keywords, indexed by title. Converts key_words lists to comma-delimited string keyword_df = final_ted.filter(['key_words']) keyword_df = keyword_df.set_index(final_ted['title']) for i in range(len(keyword_df['key_words'])):
for ss in s1: if ss not in stop_words: c = c + 1 sentence1 = sentence1.lower().split() s2 = word_tokenize(sentence2.lower()) for ss in s2: if ss not in stop_words: c = c + 1 sentence2 = sentence2.lower().split() return model.wmdistance(sentence1, sentence2) / (c * 1.0) def rogue2_bleu(gt, pred): tokens = nltk.word_tokenize(gt) bigramgt = set(nltk.bigrams(tokens)) tokens = nltk.word_tokenize(pred) bigrampred = set(nltk.bigrams(tokens)) return (len(bigramgt.intersection(bigrampred))) / (len(bigramgt) * 1.0), ( len(bigramgt.intersection(bigrampred))) / (len(bigrampred) * 1.0) query = raw_input("Enter query\n") tag = r.extract_keywords_from_text(query) ranked_tags = r.get_ranked_phrases() print 'Quote=' + str(get_quote(query, ranked_tags)) print 'QNA=' + str(get_qna(query, ranked_tags).encode('utf-8'))
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
import operator
import docx
import pandas as pd
from rake_nltk import Rake

# Fund names, tags and ISINs are the first three columns of the workbook.
xcel = pd.read_excel('C:\\Users\\kartikaya\\Python_Data\\Article_Search_MindTrust\\FUNDS\\Funds-sectors.xlsx')
funds = xcel.iloc[:, 0]
tags = xcel.iloc[:, 1]
isin = xcel.iloc[:, 2]

# Only the first directory entry is processed.
all_files = os.listdir("C:\\Users\\kartikaya\\Python_Data\\Article_Search_MindTrust\\Articles")
file = 'C:\\Users\\kartikaya\\Python_Data\\Article_Search_MindTrust\\Articles\\' + all_files[0]

doc = docx.Document(file)
full_text = [para.text for para in doc.paragraphs]

# Feed RAKE the joined paragraph text. The joined string used to be
# discarded and str(full_text) - the list repr, with brackets, quotes and
# escape sequences - was analysed instead.
article_text = '\n'.join(full_text)

r = Rake()
r.extract_keywords_from_text(article_text)
print(r.get_ranked_phrases_with_scores())
def load_events_for_date(request, date):
    """Load blogTO events for *date* into the database, with title keywords.

    Fetches the event list for the date, then the full record of every
    event, and creates (or reuses) the matching ``Location``, ``Event`` and
    one ``Keyword`` per single-word RAKE phrase of the event title.

    Args:
        request: The incoming HTTP request (not read here).
        date: Date value passed through to the blogTO API and stored on the
            event.

    Returns:
        An ``HttpResponse`` confirming the load for *date*.
    """
    # Get the JSON with events for the specific date. Passing the values as
    # params lets requests URL-encode them.
    event_list_json = requests.get(
        "https://www.blogto.com/api/v2/events/",
        params={
            'bundle_type': 'medium',
            'date': date,
            'limit': 9999,
            'offset': 0,
            'status': 'ongoing',
        })
    # Convert the JSON received into a Python dictionary.
    event_list = json.loads(event_list_json.content)
    print(event_list)

    # Single-word phrases only.
    r = Rake(min_length=1, max_length=1)

    for event_summary in event_list["results"]:
        event_full_json = requests.get(
            f"https://www.blogto.com/api/v2/events/{event_summary['id']}")
        event_full = json.loads(event_full_json.content)
        print(">>> Event Title:", event_full['title'], "<<<")

        r.extract_keywords_from_text(event_full["title"])
        word_list = r.get_ranked_phrases()

        # Locations are identified by coordinates when present, otherwise by
        # null coordinates plus city.
        if event_full["location"]:
            location_lookup = {
                'latitude': event_full["location"]["latitude"],
                'longitude': event_full["location"]["longitude"],
            }
            location_defaults = {
                'address': event_full['address'],
                'city': event_full['city'],
                'province': event_full['province'],
            }
        else:
            location_lookup = {
                'latitude': None,
                'longitude': None,
                'city': event_full['city'],
            }
            location_defaults = {
                'address': event_full['address'],
                'province': event_full['province'],
            }

        try:
            location, location_created = Location.objects.get_or_create(
                defaults=location_defaults, **location_lookup)
        except Location.MultipleObjectsReturned:
            # The lookup is on Location, so its own exception class must be
            # caught (Event.MultipleObjectsReturned never matched). Fall back
            # to the first duplicate so `location` is always bound to a row
            # for this event rather than unbound or left over from the
            # previous iteration.
            print("Duplicate location: " + str(event_full["location"]))
            location = Location.objects.filter(**location_lookup).first()

        try:
            event_object, event_created = Event.objects.get_or_create(
                blogto_id=event_full["id"],
                date=date,
                defaults={
                    'title': event_full["title"],
                    'description': event_full["description_stripped"],
                    'image_url': event_full["image_url"] + "?width=1920&height=1080",
                    'start_time': event_summary["start_time"],
                    'end_time': event_summary["end_time"],
                    'venue_name': event_full["venue_name"],
                    'location': location
                })

            # Attach one Keyword per title word, creating it only if it does
            # not exist yet.
            print("List of keywords:")
            for word in word_list:
                try:
                    kword, kword_created = Keyword.objects.get_or_create(
                        word=word)
                    event_object.keywords.add(kword)
                    print(word)
                except Keyword.MultipleObjectsReturned:
                    print("Duplicate keyword")
        except Event.MultipleObjectsReturned:
            print("Duplicate event Id: " + str(event_full["id"]))

    return HttpResponse(f"Loaded events into db for {date}.")
def senti(noOfSearchTerms, topicname):
    """Collect tweets about *topicname*, classify their sentiment and extract pro/con keywords.

    Steps, in order:
      1. Search tweets through tweepy and clean each tweet's text.
      2. Label tweets with TextBlob polarity (< 0 negative, 0..0.2 neutral,
         otherwise positive) and write them to ``analysis.csv``.
      3. Train SGD (log loss), linear SVC and Gaussian NB classifiers on the
         bag-of-words of the positive/negative tweets and re-label the
         neutral tweets by agreement between the three; write
         ``analysis2.csv``.
      4. Run RAKE over the joined negative and positive tweets and keep the
         phrases that contain *topicname* or line up with an 'nnp' POS tag.
      5. Write ``output.txt``, ``proslist.txt`` and ``conslist.txt``.

    Returns:
        ``(pp3, pp2, pp1, ff2, ff1, ff3, conlist, prolist)``: positive,
        negative and neutral counts before re-classification, positive,
        negative and neutral counts after it, and the con / pro phrase lists.

    NOTE(review): the indentation of this function was reconstructed from
    statement order; places where more than one nesting is syntactically
    possible are marked below - confirm against the original file.
    """
    # NOTE(review): API credentials are committed in source; they should be
    # rotated and loaded from configuration.
    consumer_key = 'g6hWMVoCGEWaYDWg3Km3YaehA'
    consumer_secret = 'KqMmBhdAsSRcBTO7w18hzBYm4G4BgbfWYHc7lfSmPDUvbCBh4U'
    access_token_key = '582955639-7fbAiHMKNz4Mizm26Jbcp0yX9mzMa9GhEyYhoXb3'
    access_token_secret = 'QoQT7WIZVuQ3HD7ipSuA7MxvOxcHA94suGiOeFvVBXE5x'
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token_key, access_token_secret)
    api = tweepy.API(auth)
    noOfSearchTerms = noOfSearchTerms
    topicname = topicname
    # h1 keeps the number of tweets the caller asked for.
    h1 = noOfSearchTerms
    # Request extra tweets so enough remain after filtering (the code adds
    # 1000, although the original comment said 500).
    noOfSearchTerms = noOfSearchTerms + 1000
    tweets = tweepy.Cursor(api.search, q=topicname,
                           lang="en").items(noOfSearchTerms)
    unwanted_words = ['@', 'RT', ':', 'https', 'http']
    symbols = ['@', '#']
    data = []
    url = {}
    n1 = h1
    pp1 = 0  # neutral count (TextBlob)
    pp2 = 0  # negative count (TextBlob)
    pp3 = 0  # positive count (TextBlob)
    s1 = 0  # number of tweets accepted so far
    f = 0  # becomes 1 once the first tweet has been accepted
    mmm = 0
    n = []  # accepted cleaned tweets, in order
    pos = []
    neg = []
    neu = []
    posneg = []  # every accepted tweet; input of the bag-of-words model
    r = 0
    total = []
    ttest = []  # label per accepted tweet: 2 positive, 3 negative, 4 neutral
    urls = []
    times = []
    for tweet in tweets:
        z1 = 0
        if (s1 == h1 + 1):  # stop once the requested number is exceeded
            break
        time = tweet.created_at
        url = 'https://twitter.com/statuses/' + tweet.id_str
        text = tweet.text
        textWords = text.split()
        u = 0  # set to 1 when the cleaned tweet was already accepted
        # Replace mentions, non-alphanumerics, URLs and "RT" by spaces and
        # collapse whitespace.
        cleanedTweet = ' '.join(
            re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)", " ",
                   text).split())
        if (f == 1):
            while (z1 < s1):  # duplicate (retweet) check against accepted tweets
                if (n[z1] == cleanedTweet):
                    z1 = z1 + 1
                    u = 1
                else:
                    z1 = z1 + 1
        # NOTE(review): the nesting level of this assignment is ambiguous.
        r = 0
        # Only tweets with more than five words that were not seen before.
        if (len(cleanedTweet.split()) > 5 and u == 0):
            analysis = TextBlob(cleanedTweet)
            polarity = 'Positive'
            total.append(cleanedTweet)
            if (analysis.sentiment.polarity < 0):
                polarity = 'Negative'
                n1 = n1 - 1
                pp2 = pp2 + 1
                neg.append(cleanedTweet)
                posneg.append(cleanedTweet)
                ttest.append(3)
                mmm = mmm + 1
            elif (0 <= analysis.sentiment.polarity <= 0.2):
                polarity = 'Neutral'
                n1 = n1 - 1
                pp1 = pp1 + 1
                neu.append(cleanedTweet)
                posneg.append(cleanedTweet)
                ttest.append(4)
                mmm = mmm + 1
            else:
                pos.append(cleanedTweet)
                posneg.append(cleanedTweet)
                ttest.append(2)
                pp3 = pp3 + 1
                mmm = mmm + 1
            s2 = cleanedTweet
            s1 = s1 + 1
            dic = {}
            dic['Sentiment'] = polarity
            dic['Tweet'] = cleanedTweet
            dic['URL'] = url
            dic['Time'] = time
            dic['Sentiment Scores'] = analysis.sentiment
            data.append(dic)
            urls.append(url)
            times.append(time)
            # The CSV is rewritten after every accepted tweet.
            df = pd.DataFrame(data)
            df.to_csv('analysis.csv')
            f = 1
            n.append(cleanedTweet)
        else:
            salu = 500  # placeholder; the value is never read
    print("number of positive tweets are ", pp3)
    print("number of neutral tweets are ", pp1)
    print("number of negative tweets are ", pp2)
    ### bag of words
    vectorizer = CountVectorizer()
    salman = vectorizer.fit_transform(posneg)
    zzz = salman.toarray()
    ### Split the vectors: positive/negative rows become training data,
    ### neutral rows are the ones to re-classify.
    count = 0
    xtest = []
    xneu = []
    xtrain = []
    ytrain = []
    ytest = []  # SGD (log loss) predictions for the neutral rows
    ytest1 = []  # SVC predictions for the neutral rows
    ytest2 = []  # Gaussian NB predictions for the neutral rows
    while (count < len(ttest)):
        if (ttest[count] == 2):
            xtrain.append(zzz[count])
            ytrain.append(2)
            count = count + 1
        elif (ttest[count] == 3):
            xtrain.append(zzz[count])
            ytrain.append(3)
            count = count + 1
        elif (ttest[count] == 4):
            xneu.append(zzz[count])
            count = count + 1
    ### logistic regression via SGD
    xtrain1 = np.array(xtrain)
    clf = linear_model.SGDClassifier(max_iter=1000, shuffle=False, loss='log')
    clf.fit(xtrain1, ytrain)
    value = 0
    while (value < len(xneu)):
        um = xneu[value]
        a = clf.predict([um])
        aa = clf.predict_log_proba([um])
        hh = aa[0]
        hhh = hh[1]
        hhhh = hh[0]
        q11 = math.exp(hhh)
        q12 = math.exp(hhhh)
        # mm is the absolute difference between the two class probabilities.
        if (q11 < q12):
            mm = q12 - q11
        else:
            mm = q11 - q12
        # Nearly equal probabilities: keep the tweet neutral (label [4]).
        if (mm < 0.1):
            ytest.append([4])
        else:
            ytest.append(a)
        value = value + 1
    #print("log")
    #print(len(ytest))
    #print(ytest)
    ### svm
    xtrain2 = np.array(xtrain)
    clf1 = SVC(kernel='linear', probability=True)
    clf1.fit(xtrain2, ytrain)
    value = 0
    while (value < len(xneu)):
        um = xneu[value]
        a = clf1.predict([um])
        # NOTE(review): the probabilities come from `clf` (the SGD model),
        # not from `clf1` - looks like a copy/paste slip; confirm.
        aa = clf.predict_log_proba([um])
        hh = aa[0]
        hhh = hh[1]
        hhhh = hh[0]
        q11 = math.exp(hhh)
        q12 = math.exp(hhhh)
        if (q11 < q12):
            mm = q12 - q11
        else:
            mm = q11 - q12
        if (mm < 0.1):
            ytest1.append([4])
        else:
            ytest1.append(a)
        value = value + 1
    #print("svm")
    #print(len(ytest1))
    #print(ytest1)
    ### naive bayes
    xtrain3 = np.array(xtrain)
    clf2 = GaussianNB()
    clf2.fit(xtrain3, ytrain)
    value = 0
    while (value < len(xneu)):
        um = xneu[value]
        a = clf2.predict([um])
        ytest2.append(a)
        value = value + 1
    #print(" nb values")
    #print(ytest2)
    #print(len(ytest2))
    ## Vote: keep a label when at least two of the three classifiers agree.
    ## When all three differ nothing is appended, so finaltest can end up
    ## shorter than ytest.
    finaltest = []
    i = 0
    length = len(ytest)
    while (i < length):
        if (ytest[i] == ytest1[i] and ytest[i] == ytest2[i]
                and ytest1[i] == ytest2[i]):
            finaltest.append(ytest[i])
        elif (ytest[i] == ytest1[i] and ytest[i] != ytest2[i]
              and ytest1[i] != ytest2[i]):
            finaltest.append(ytest[i])
        elif (ytest[i] != ytest1[i] and ytest[i] == ytest2[i]
              and ytest1[i] != ytest2[i]):
            finaltest.append(ytest[i])
        elif (ytest[i] != ytest1[i] and ytest[i] != ytest2[i]
              and ytest1[i] == ytest2[i]):
            finaltest.append(ytest1[i])
        else:
            yyyyy = 787878  # placeholder; the value is never read
        i = i + 1
    # print("finaltest")
    # print(finaltest)
    # print(len(finaltest))
    ### Results after re-classification of the neutral tweets.
    ff1 = 0  # negative count after re-classification
    ff2 = 0  # positive count after re-classification
    ff3 = 0  # neutral count after re-classification
    qq = 0  # index of the next entry of finaltest
    numb = 0
    dic = {}
    data = []
    posafter = []
    negafter = []
    while (numb < len(n)):
        cleanedTweet = n[numb]
        url = urls[numb]
        time = times[numb]
        analysis = TextBlob(cleanedTweet)
        polarity = 'Positive'
        if (analysis.sentiment.polarity < 0):
            polarity = 'Negitive'
            ff1 = ff1 + 1
            negafter.append(cleanedTweet)
        elif (0 <= analysis.sentiment.polarity <= 0.2):
            # NOTE(review): finaltest holds lists / prediction arrays; the
            # comparisons with the plain ints 3 and 2 rely on array
            # semantics - confirm they behave as intended.
            if (finaltest[qq] == [4]):
                polarity = 'neutral'
                ff3 = ff3 + 1
            elif (finaltest[qq] == 3):
                polarity = 'Neg'
                ff1 = ff1 + 1
                negafter.append(cleanedTweet)
            elif (finaltest[qq] == 2):
                polarity = 'pos'
                ff2 = ff2 + 1
                posafter.append(cleanedTweet)
            qq = qq + 1
        else:
            ff2 = ff2 + 1
            posafter.append(cleanedTweet)
        numb = numb + 1
        dic = {}
        dic['Sentiment'] = polarity
        dic['Tweet'] = cleanedTweet
        dic['URL'] = url
        dic['Time'] = time
        dic['Sentiment Scores'] = analysis.sentiment
        data.append(dic)
        # NOTE(review): whether the CSV write sits inside or after this loop
        # is ambiguous.
        df = pd.DataFrame(data)
        df.to_csv('analysis2.csv')
    print("after claffication")
    print("postive tweets are", ff2)
    print("negative tweets are", ff1)
    print("neutral tweets are", ff3)
    # text1: all positive tweets joined with ". "
    i = 0
    text1 = ''
    while (i < len(posafter)):
        text1 = text1 + posafter[i]
        text1 = text1 + ". "
        i = i + 1
    # text: all negative tweets joined with ". "
    i = 0
    text = ''
    while (i < len(negafter)):
        text = text + negafter[i]
        text = text + ". "
        i = i + 1
    #print("positive tweets text")
    #print(text1)
    #print("negtive tweets text")
    #print(text)
    # Used when tokenizing words.
    # NOTE(review): line breaks inside this verbose pattern and the grammar
    # below were restored; both are line-sensitive because of their '#'
    # comments - confirm against the original.
    sentence_re = r'''(?x)      # set flag to allow verbose regexps
          (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*        # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.              # ellipsis
        | [][.,;"'?():_`-]    # these are separate tokens; includes ], [
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    #Taken from Su Nam Kim Paper...
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns

        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
    """
    # Grammar pass over the negative tweets.
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    # print(postoks)
    tree = chunker.parse(postoks)
    from nltk.corpus import stopwords
    # Rebinds the local name from the corpus reader to the English word list.
    stopwords = stopwords.words('english')

    def leaves(tree):
        """Finds NP (nounphrase) leaf nodes of a chunk tree."""
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
            yield subtree.leaves()

    def normalise(word):
        """Normalises words to lowercase and stems and lemmatizes it."""
        word = word.lower()
        word = stemmer.stem(word)
        word = lemmatizer.lemmatize(word)
        return word

    def acceptable_word(word):
        """Checks conditions for acceptable word: length, stopword."""
        accepted = bool(2 <= len(word) <= 40
                        and word.lower() not in stopwords)
        return accepted

    def get_terms(tree):
        # Yields, per noun phrase, the normalised acceptable words.
        for leaf in leaves(tree):
            term = [normalise(w) for w, t in leaf if acceptable_word(w)]
            yield term

    terms = get_terms(tree)
    '''
    for term in terms:
        for word in term:
            print(word+' ')
        print('')
    '''
    # Grammar pass over the positive tweets.
    # NOTE(review): chunker1 is created but tree1 is parsed with `chunker`;
    # both use the same grammar.
    chunker1 = nltk.RegexpParser(grammar)
    toks1 = nltk.regexp_tokenize(text1, sentence_re)
    postoks1 = nltk.tag.pos_tag(toks1)
    #print("positive grammer")
    #print(postoks1)
    tree1 = chunker.parse(postoks1)
    terms1 = get_terms(tree1)
    '''
    for term in terms1:
        for word in term:
            print(word+' ')
        print('')
    '''
    # RAKE keywords of the negative tweets.
    r = Rake()
    # print("cons scores")
    r.extract_keywords_from_text(text)
    consscores = r.get_ranked_phrases_with_scores()
    # print(consscores)
    p1 = len(consscores)
    p2 = len(consscores[0])  # raises IndexError when no phrase was found
    freshcons = []
    i = 0
    # Keep phrases of at most three words whose score is not exactly 1.0.
    while (i < p1):
        p3 = consscores[i][1]
        p4 = consscores[i][0]
        h = p3.split()
        if (len(h) <= 3 and p4 != 1.0):
            freshcons.append(p3)
        i = i + 1
    # print(freshcons)
    # RAKE keywords of the positive tweets.
    # print("pros scores")
    r.extract_keywords_from_text(text1)
    proscores = r.get_ranked_phrases_with_scores()
    #print(proscores)
    p1 = len(proscores)
    p2 = len(proscores[0])  # raises IndexError when no phrase was found
    freshpros = []
    i = 0
    while (i < p1):
        p3 = proscores[i][1]
        p4 = proscores[i][0]
        h = p3.split()
        if (len(h) <= 3 and p4 != 1.0):
            freshpros.append(p3)
        i = i + 1
    #print(freshpros)
    ## Lower-case both tagged token lists: entries become [word, tag].
    yyy = [[x.lower() for x in line] for line in postoks]
    yyy1 = [[x.lower() for x in line] for line in postoks1]
    # Negative side, rule 1: keep phrases that contain the topic name.
    i = 0
    y = 0
    conlist = []
    p = 0
    while (i < len(freshcons)):
        h = freshcons[i].split()
        y = 0
        while (y < len(h)):
            if (h[y] == topicname):
                conlist.append(freshcons[i])
                y = y + 1
                break
            else:
                y = y + 1
        i = i + 1
    # Negative side, rule 2: keep phrases whose first word occurs at a token
    # position where it, or a following token within the phrase length, is
    # tagged 'nnp'.
    # NOTE(review): the nesting of `break` and `p = p + 1` is ambiguous in
    # the source; this is the arrangement under which the loop terminates.
    # yyy[p + 1] / yyy[p + 2] can index past the end of the list.
    k = 0
    while (k < len(freshcons)):
        h = freshcons[k].split()
        p = 0
        while (p < len(yyy)):
            if h[0] in yyy[p]:
                if (len(h) == 3):
                    if (yyy[p][1] == 'nnp' or yyy[p + 1][1] == 'nnp'
                            or yyy[p + 2][1] == 'nnp'):
                        conlist.append(freshcons[k])
                        break
                elif (len(h) == 2):
                    if (yyy[p][1] == 'nnp' or yyy[p + 1][1] == 'nnp'):
                        conlist.append(freshcons[k])
                        break
                elif (len(h) == 1):
                    if (yyy[p][1] == 'nnp'):
                        conlist.append(freshcons[k])
                        break
                else:
                    pppp = 555555  # placeholder; the value is never read
                p = p + 1
            else:
                p = p + 1
        k = k + 1
    #print("negative tweets key words")
    #print(freshcons)
    print("conlist")
    print(conlist)
    # Positive side: the same two rules applied to freshpros / yyy1.
    i = 0
    y = 0
    prolist = []
    p = 0
    while (i < len(freshpros)):
        h = freshpros[i].split()
        y = 0
        while (y < len(h)):
            if (h[y] == topicname):
                prolist.append(freshpros[i])
                y = y + 1
                break
            else:
                y = y + 1
        i = i + 1
    k = 0
    while (k < len(freshpros)):
        h = freshpros[k].split()
        p = 0
        while (p < len(yyy1)):
            if h[0] in yyy1[p]:
                if (len(h) == 3):
                    if (yyy1[p][1] == 'nnp' or yyy1[p + 1][1] == 'nnp'
                            or yyy1[p + 2][1] == 'nnp'):
                        prolist.append(freshpros[k])
                        break
                elif (len(h) == 2):
                    if (yyy1[p][1] == 'nnp' or yyy1[p + 1][1] == 'nnp'):
                        prolist.append(freshpros[k])
                        break
                elif (len(h) == 1):
                    if (yyy1[p][1] == 'nnp'):
                        prolist.append(freshpros[k])
                        break
                else:
                    pppp = 555555  # placeholder; the value is never read
                p = p + 1
            else:
                p = p + 1
        k = k + 1
    #print("positive tweets key words")
    #print(freshpros)
    print("prolist")
    print(prolist)
    # Persist the six counters (one per line) and the two phrase lists.
    fh = open("output.txt", "w+")
    pros = open("proslist.txt", "w+")
    cons = open("conslist.txt", "w+")
    fh.write(str(pp3))
    fh.write("\n" + str(pp2))
    fh.write("\n" + str(pp1))
    fh.write("\n" + str(ff2))
    fh.write("\n" + str(ff1))
    fh.write("\n" + str(ff3))
    cons.write("\n" + str(conlist))
    pros.write("\n" + str(prolist))
    fh.close()
    pros.close()
    cons.close()
    # myAPI = "http://localhost/Twitter/set_return.php?PP3="+str(pp3)+"&PP2="+str(pp2)+"&PP1="+str(pp1)+"&ff2="+str(ff2)+"&ff1="+str(ff1)+"&ff3="+str(ff3)+"&conlist="+str(conlist)+"&prolist="+str(prolist)
    # print(myAPI)
    return pp3, pp2, pp1, ff2, ff1, ff3, conlist, prolist
def summarize_doc(content, length):
    """Return the first *length* words of the ranked RAKE phrases of *content*.

    The ranked phrases are joined with single spaces and split on single
    spaces again, so the result is a list of words in ranking order.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(content)
    joined_phrases = ' '.join(extractor.get_ranked_phrases())
    words = joined_phrases.split(' ')
    return words[:length]
def this_filter(self, text, top=100):
    """Return the *top* highest-ranked RAKE phrases of *text* (100 by default)."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()[:top]
def _count_content_words(text):
    """Count the lower-cased, non-stopword tokens of ``text``."""
    # A set gives O(1) membership tests; the NLTK stopword list is a plain list.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    counts = {}
    # Lower-case before tokenising so the keys match the lower-cased sentence
    # tokens used when scoring; previously capitalised words never matched.
    for word in nltk.word_tokenize(text.lower()):
        if word not in stopwords:
            counts[word] = counts.get(word, 0) + 1
    return counts


def _score_sentences(sentences, word_frequencies, max_words=30):
    """Sum the known-word frequencies of every sentence under ``max_words`` words."""
    scores = {}
    for sent in sentences:
        # Long sentences are never scored, so skip them before tokenising.
        if len(sent.split(' ')) >= max_words:
            continue
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies:
                scores[sent] = scores.get(sent, 0) + word_frequencies[word]
    return scores


def extract_summary_and_keywords_from_pdf(articles_dict):
    """Summarise each PDF, extract its RAKE key phrases and store both.

    For every ``article_key -> path`` entry the PDF text is read with
    ``pdftotext``, a frequency-based extractive summary (up to 7 sentences)
    is built, key phrases are extracted from the raw text with RAKE, and
    ``update_database_from_pdf(article_key, path, keywords, summary)`` is
    called. Articles with no scorable words are skipped.

    Args:
        articles_dict: Mapping of article key to the path of its PDF file.

    Returns:
        None. Results are printed and passed to ``update_database_from_pdf``.
    """
    import heapq
    from rake_nltk import Rake

    separator = "----------------------------------------"

    for article_key, link in articles_dict.items():
        with open(link, "rb") as f:
            pdf = pdftotext.PDF(f)
            # Join once instead of growing a string page by page.
            raw_data = "".join(pdf)

        # Removing Square Brackets and Extra Spaces
        article_text = re.sub(r'\[[0-9]*\]', ' ', raw_data)
        article_text = re.sub(r'\s+', ' ', article_text)
        # Removing special characters and digits
        formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text)
        formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

        sentence_list = nltk.sent_tokenize(article_text)

        counts = _count_content_words(formatted_article_text)
        if not counts:
            # Nothing to score (e.g. an image-only PDF): skip this article.
            continue
        maximum_frequency = max(counts.values())
        word_frequencies = {
            word: count / maximum_frequency for word, count in counts.items()
        }

        sentence_scores = _score_sentences(sentence_list, word_frequencies)
        summary_sentences = heapq.nlargest(
            7, sentence_scores, key=sentence_scores.get)
        summary = ' '.join(summary_sentences)

        # Removing Square Brackets and Extra Spaces
        summary = re.sub(r'\[[0-9]*\]', ' ', summary)
        summary = re.sub(r'\s+', ' ', summary)
        # Removing special characters and digits
        summary = re.sub('[^a-zA-Z]', ' ', summary)
        summary = re.sub(r'\s+', ' ', summary)

        print(summary)
        for _ in range(4):
            print(separator)

        # Key phrases come from the untouched PDF text, not the cleaned copy.
        r = Rake()
        r.extract_keywords_from_text(raw_data)
        keywords_yay = r.get_ranked_phrases()
        print(keywords_yay)
        print(separator)

        update_database_from_pdf(article_key, link, keywords_yay, summary)
# CSV of keywords and their "RowId & Score" payload, loaded as an RDD.
inputfolderpath2 = "hdfs://richmond:53001/SampleInputs/keyword_input.csv"
schema2 = StructType([
    StructField("Keyword", StringType(), True),
    StructField("RowId & Score", StringType(), True),
])
inputfileRDD = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true', inferschema='true', sep=",", multiLine=True, quote='"', escape='"') \
    .load(inputfolderpath2, schema=schema2).rdd.repartition(30)

# Local text whose key phrases drive the matching below.
textinputfile = "/s/chopin/k/grad/deotales/Source-Recommendation-System/ExampleRun/diff_input.txt"
with open(textinputfile, "r") as file1:
    text = file1.read()

rake = Rake()
rake.extract_keywords_from_text(text)
keyphrases_w_scores = rake.get_ranked_phrases_with_scores()
# Keep the first half of the ranked list. Floor division keeps the slice
# bound an int; `/` yields a float on Python 3 and slicing raises TypeError.
keyphrases_w_scores = keyphrases_w_scores[:len(keyphrases_w_scores) // 2]
keyphrases = rake.get_ranked_phrases()

# Expand rows through match_phrases / map_scored_ids, sum the scores per key
# and keep the 15 entries with the largest summed score.
inputfileRDD = inputfileRDD \
    .flatMap(lambda row: match_phrases(row[0], row[1])) \
    .flatMap(lambda row: map_scored_ids(row[0], row[1])) \
    .reduceByKey(lambda a, b: (float(a)) + (float(b))) \
    .top(15, key=lambda x: x[1])

id_list_w_scores = inputfileRDD
print(id_list_w_scores)
id_list = [x[0] for x in id_list_w_scores]
print(id_list)
# Skip header row
next(rows)

# Uses stopwords for English from NLTK, and all punctuation characters.
# Built once and reused: each extract_keywords_from_text() call recomputes
# the ranking for the text it is given.
r = Rake(min_length=2, max_length=3)

for row in rows:
    # Extract value from spreadsheet and save to variable
    db_id = row[0].value
    rs_num = row[1].value
    description = row[2].value

    # NOTE(review): an empty cell presumably yields None here, which
    # BeautifulSoup cannot parse; treat it as empty markup instead of crashing.
    soup = BeautifulSoup(description or '', 'html.parser')
    r.extract_keywords_from_text(soup.get_text())
    # Keyword phrases ranked highest to lowest.
    keywords = r.get_ranked_phrases()

    worksheet.write(row1, col, db_id)
    worksheet.write(row1, col + 1, rs_num)
    worksheet.write(row1, col + 2, str(keywords))
    row1 += 1

workbook.close()
print('Spreadsheet Generated')
director = director.replace(" ","") director = director.replace("-","") director = director.replace(".","") director = director.lower() director = director.split(',') attributes.extend(director) actor = row['Actors'] actor = actor.replace(" ","") actor = actor.replace("-","") actor = actor.replace(".","") actor = actor.lower() actor = actor.split(',') attributes.extend(actor) plot = row['Description'] r = Rake() r.extract_keywords_from_text(plot) key_words_dict_scores = r.get_word_degrees() attributes.extend(key_words_dict_scores.keys()) finalatt = list(attributes) attr = ' '.join(finalatt) row['Attributes'] = attr attributes.clear() #Vectorization count = CountVectorizer() count_matrix = count.fit_transform(df['Attributes']) cosine_sim = cosine_similarity(count_matrix, count_matrix) indi = pd.Series(df.Title) indices = indi
# In[3]:

# Create document matrix for manually classified data: one space-separated
# string of unique stems per class, built from the RAKE phrases of its titles.
classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
processed_documents = []
r = Rake(language='english')
for _class in classes:
    query = "SELECT `title` FROM articles WHERE class = \"" + str(
        _class) + "\" AND sr_no < 901"
    cursor.execute(query)
    article_fetch = cursor.fetchall()

    # Track stems in a set: the previous `stem not in processed_string` was a
    # substring test, so a stem contained in a longer earlier stem was dropped.
    seen_stems = set()
    stems = []
    for article in article_fetch:
        r.extract_keywords_from_text(article[0])
        for tag in r.get_ranked_phrases():
            # Iterate tokens in phrase order (not via a set) so the output
            # order is deterministic; seen_stems already removes duplicates.
            for token in word_tokenize(tag):
                curr_tag = stemmer.stem(token)
                if curr_tag not in seen_stems:
                    seen_stems.add(curr_tag)
                    stems.append(curr_tag)

    # Same layout as before: every stem followed by a single space.
    processed_string = "".join(stem + " " for stem in stems)
    processed_documents.append(processed_string)

# In[3]:

processed_documents = []
processed_string = ""
deaths = "killed beaten death burned mortal"
r = Rake(language='english')
'director', 'country', 'date_added', 'release_year', 'duration', 'Unnamed: 12', 'show_id', 'rating' ], inplace=True, axis=1) #netflix_data.isna().sum() netflix_data.dropna(inplace=True) netflix_data['Key_words'] = "" netflix_data['Type'] = "" for index, row in netflix_data.iterrows(): description = row['description'] r = Rake() r.extract_keywords_from_text(description) key_words_dict_scores = r.get_word_degrees() row['Key_words'] = list(key_words_dict_scores.keys()) netflix_data.drop(columns=['description'], inplace=True) netflix_data['listed_in'] = netflix_data['listed_in'].map( lambda x: x.lower().split(',')) netflix_data['cast'] = netflix_data['cast'].map(lambda x: x.split(',')[:3]) # netflix_data['director'] = netflix_data['director'].map(lambda x: x.split(',')) #netflix_data netflix_data.set_index('title', inplace=True) #netflix_data.head() netflix_data['bag_of_words'] = ''
def Main(request):
    """Search the web for the submitted query and pair hits with images.

    For a valid form, each Google hit is downloaded, converted to plain text
    and reduced to its top RAKE phrase. The first DuckDuckGo image for that
    phrase is then looked up with Chrome via Selenium, and ``results.html`` is
    rendered with a ``display`` mapping of hit URL -> image ``src``. An
    invalid form re-renders ``home.html`` with the bound form.

    Args:
        request: The Django request; the query is read from
            ``request.POST['search_form']``.

    Returns:
        The rendered ``results.html`` or ``home.html`` response.

    Raises:
        ImportError: If ``googlesearch`` is not installed.
    """
    search_form = Search(request.POST)
    if not search_form.is_valid():
        return render(request, 'home.html', {'form': search_form})

    from urllib.parse import quote_plus
    from selenium.common.exceptions import TimeoutException

    query = request.POST['search_form']
    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")
        # Re-raise: continuing would only fail later on the unbound `search`.
        raise

    request.session['search'] = random.randint(0, 100000)

    results = []
    all_key_words = []
    # URLs of the hits that produced a keyword, parallel to all_key_words, so
    # an image is never attributed to a different hit when a page has none.
    keyword_urls = []
    for result in search(query, tld="COM", num=10, stop=10, pause=2):
        print(result)
        results.append(result)
        response = Request(result, headers={'User-Agent': 'Mozilla/5.0'})
        webContent = urlopen(response).read()

        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        text = h.handle(unidecode(str(webContent, errors='ignore')))
        print('hello1')

        # nlp: removes non-alphanumeric characters
        text = re.sub("[^0-9a-zA-Z]+", " ", text)
        r = Rake(max_length=2, ranking_metric=Metric.WORD_DEGREE)
        r.extract_keywords_from_text(text)
        key_words = r.get_ranked_phrases()
        if key_words:
            # one keyword per hit
            all_key_words.append(key_words[0])
            keyword_urls.append(result)
        print('hello2')
    print("all_kws", all_key_words)

    # image search
    image_xpath = '/html/body/div[2]/div[3]/div/div/div[2]/div/div[1]/div[1]/span/img'
    images = []
    display = {}
    d = webdriver.Chrome(executable_path='/home/alisher/Desktop/Projects/IB/Drivers/chromedriver')
    try:
        for url, word in zip(keyword_urls, all_key_words):
            # Keywords contain spaces; encode them for the query string.
            d.get('https://duckduckgo.com/?q=' + quote_plus(word) + '&t=h_&iax=images&ia=images')
            try:
                img = WebDriverWait(d, 3).until(
                    EC.presence_of_element_located((By.XPATH, image_xpath))
                )
            except TimeoutException:
                # Best effort: no image appeared in time, leave this hit out.
                print("error occured")
                continue
            src = img.get_attribute('src')
            images.append(src)
            display[url] = src
    finally:
        # Always shut the browser down, even when a lookup raises.
        d.quit()

    print(display)
    print(results)
    print(images)
    return render(request, 'results.html', {'display': display})
# Collect every .docx file in the data directory.
for filename in glob.glob(os.path.join(dataDirectory, '*.docx')):
    print(filename)
    filenames.append(filename)

# Concatenate the text of all collected files into one document.
for filename in filenames:
    print("Reading file " + str(fileIndex) + " of " + str(len(filenames)))
    fileIndex += 1
    totalDescription += getText(filename)

r = Rake()
print("Extracting keywords...")
r.extract_keywords_from_text(totalDescription)
print("Getting ranked phrases")
keywords = r.get_ranked_phrases_with_scores()

# Build the frame in one step instead of enlarging it row by row with
# df.loc, which is quadratic for long lists. The index stays 1-based as
# before, and the first tuple element still lands in the 'rank' column.
df = pd.DataFrame(
    keywords,
    columns=['rank', 'keyword_set'],
    index=range(1, len(keywords) + 1),
)
dirtitle = 'KeywordExtraction.csv'
df.to_csv(dirtitle, encoding='utf-8')
'arabic.tagger', 'stanford-postagger-full-2018-10-16/stanford-postagger.jar') for tag in tagger.tag(text.split()): print(tag[1]) parser = SParse.StanfordParser( model_path='edu/stanford/nlp/models/lexparser/arabicFactored.ser.gz') sentences = parser.raw_parse_sents(text.split('.')) for line in sentences: for sentence in line: print(sentence) sentence.draw() ner = Text(text) for sent in ner.sentences: print(sent) for entity in sent.entities: print(entity.tag, entity) print('') with open('ar_london.txt', encoding='utf-8') as f: london = f.read() print(london[:100]) rake = Rake(stopwords=stopwords.words('arabic'), punctuations=',./:،؛":.,’\''.split(), language='arabic', max_length=15) rake.extract_keywords_from_text(london) for phrase in rake.get_ranked_phrases()[:5]: print(phrase)