def _get_question_subject(question, doc_name):
    """Pick a subject label for *question*.

    Defaults to the top extracted keyword, but normalises to the document
    name (or "complications of <doc>") when the question appears to be
    about the document itself.
    """
    subject = keywords.extract_keywords(NLP, question, 1)
    if _matches_main_doc_subject(subject, doc_name):
        return doc_name
    if "complication" in subject or "complication" in question:
        return "complications of " + doc_name
    return subject
def _enrich_qna(qnadoc):
    """Annotate a QnA document in place and return it.

    Each Q&A pair gets "keywords" (top 5 from its answer) and "subject"
    metadata; the document itself gets "keywords" (top 10 over all
    questions and answers combined).
    """
    # Collect text parts and join once at the end instead of the original
    # quadratic `allwords += ...` string accumulation.
    parts = []
    for qna_pair in qnadoc.qnaList:
        qna_pair.add_metadata(
            "keywords", keywords.extract_keywords(NLP, qna_pair.answer, 5))
        # Subject is derived from the first question of the pair only.
        subject = _get_question_subject(qna_pair.questions[0],
                                        qnadoc.name.lower())
        qna_pair.add_metadata("subject", subject)
        parts.append(qna_pair.questions[0])
        parts.append(qna_pair.answer)
    qnadoc.add_metadata("keywords",
                        keywords.extract_keywords(NLP, "".join(parts), 10))
    return qnadoc
def retrieve_keywords():
    """Flask endpoint: keywords for the session's first scraped *.txt file.

    Best-effort by design: any failure (missing session id, no scraped
    file yet, extractor error) yields a plain error string rather than a
    500 response.
    """
    try:
        pattern = os.path.join(app.config['SCRAPE_OUTPUT_FOLDER'],
                               flask.session['sid'], '*', '*.txt')
        txtfile = glob.glob(pattern)[0]
        results = extract_keywords(txtfile, dict(flask.session))
    except Exception:
        return "PDFMAGIC_ERROR: This item is not currently available"
    return flask.jsonify(results)
def analyse_sentence(sentence):
    """Score one tweet: each extracted keyword weighted by the tweet's
    overall sentiment.

    Returns a list of (keyword, sentiment * weight) tuples.

    TODO: Is this function neccesary? HALF-DEPRECATED
    """
    score = analyse_sentiment(sentence)
    return [(kw, score * w) for kw, w in extract_keywords(sentence)]
def get_results():
    """Render a summary or a keyword list for the submitted form text.

    The 'summary' checkbox in the form selects which template is used;
    'sentences_count' bounds the output size either way.
    """
    text = request.form['text']
    count = int(request.form['sentences_count'])
    if 'summary' in request.form:
        return render_template('summary.html',
                               summary=summarize(text, count))
    return render_template('keywords.html',
                           keywords=extract_keywords(text, count))
def analyse_sentences_var_1(sentences):
    """Aggregate sentiment-weighted keywords across all sentences.

    Keywords from positively-scored sentences accumulate in the "love"
    dict, all others in the "hate" dict; each keyword's weight grows by
    weight * |sentiment| so only positive magnitudes appear in the
    output.  Returns (lovekeywords.items(), hatekeywords.items()).
    """
    love = {}
    hate = {}
    for sentence in sentences:
        mood = analyse_sentiment(sentence)
        # Bucket choice depends only on the sentiment's sign.
        bucket = love if mood > 0.0 else hate
        for keyword, weight in extract_keywords(sentence):
            bucket[keyword] = bucket.get(keyword, 0.0) + weight * abs(mood)
    return (love.items(), hate.items())
def extract_keywords_test(filename, keywords_count):
    """Smoke-test helper: print the keywords extracted from *filename*
    and how many were returned."""
    extracted = extract_keywords(get_text_from_file(filename),
                                 keywords_count)
    print(extracted)
    print(len(extracted))
def main(dom_choice, domain_list): if (dom_choice > len(domain_list)): print "Wrong choice" return "Wrong choice" domain = domain_list[dom_choice - 1] f = open("../datasets/Brands/" + domain.lower() + ".pickle", 'rb') object_file = pickle.load(f) prodslist = {} c = 0 brandslist = {} prodslist = {} for brand in object_file.keys(): #brand.append(line.split('|')[0]) brandslist[c + 1] = brand print str(c + 1) + ". " + brand + "\n" c += 1 print "Enter your choice" ch = int(raw_input()) #ch=ch-1 selectedBrand = brandslist[ch] print selectedBrand c = 0 for prods in range(len(object_file[selectedBrand])): for prod in object_file[selectedBrand][prods].keys(): prodslist[c + 1] = object_file[selectedBrand][prods][prod] print str(c + 1) + ". " + prod + "\n" c += 1 print "Enter your choice" ch = int(raw_input()) #ch=ch-1 print "1.Summary using Text Rank" print "2.Summary using TF-IDF" print "Enter your choice" choice = int(raw_input()) summary = "" if choice == 1: print "Do you want to enable debugging (Y/N)?" ch_debug = raw_input().lower() if ch_debug == "y" or ch_debug == "yes": rankedText = TextRank.summaryGen(prodslist[ch], domain, debugging=True) else: rankedText = TextRank.summaryGen(prodslist[ch], domain) f.close() sleep(3) #rankedText=rankedText[:len(rankedText)/3] if choice == 2: print "Do you want to enable debugging (Y/N)?" ch_debug = raw_input().lower() print "Do you want to enter the token size (Y/N)?" 
ch_token = raw_input().lower() if ch_debug == "y" or ch_debug == "yes": if ch_token == "y" or ch_token == "yes": print "Enter token size" token = int(raw_input()) rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, gram=token, debug=True) else: rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, debug=True) else: if ch_token == "y" or ch_token == "yes": print "Enter token size" token = int(raw_input()) rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain, gram=token) else: rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain) keys = keywords.extract_keywords(domain, prodslist[ch]) rankedSummary = "" for i in range(len(rankedText)): rankedSummary += rankedText[i] stopwords = load_stop_words("../stoplist.txt") tokenizer = RegexpTokenizer("[\w']+", flags=re.UNICODE) tokens = tokenizer.tokenize(rankedSummary) tokens = [token for token in tokens if token.lower() not in stopwords] precision = float(len(set(tokens).intersection(set(keys)))) / float( len(tokens)) recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys)) fmeasure = 2 * (precision * recall) / (precision + recall) print "\n\n" print "Precision =", precision print "Recall =", recall print "F-Measure =", fmeasure
def main(dom_choice, domain_list):
    """Interactive review summariser for a chosen domain (Python 3 port).

    Prompts for a brand and product, generates a summary with TextRank
    or TF-IDF, then prints precision/recall/F-measure of the summary
    tokens against the extracted keywords.  Returns "Wrong choice" on a
    bad domain index, otherwise None.
    """
    if dom_choice > len(domain_list):
        print("Wrong choice")
        return "Wrong choice"
    domain = domain_list[dom_choice - 1]
    # numpy .npz archive holding the brand -> products mapping in arr_0.
    data = load(BRANDS_PARSED_PATH + '/' + domain.lower() + ".npz",
                allow_pickle=True)
    object_file = data['arr_0'].tolist()
    print(object_file)
    brandslist = {}
    prodslist = {}
    c = 0
    for brand in object_file.keys():
        brandslist[c + 1] = brand
        print(str(c + 1) + ". " + brand + "\n")
        c += 1
    print("Enter your choice")
    ch = int(input())
    selectedBrand = brandslist[ch]
    print(selectedBrand)
    c = 0
    for prods in range(len(object_file[selectedBrand])):
        for prod in object_file[selectedBrand][prods].keys():
            prodslist[c + 1] = object_file[selectedBrand][prods][prod]
            print(str(c + 1) + ". " + prod + "\n")
            c += 1
    print("Enter your choice")
    ch = int(input())
    print("1.Summary using Text Rank")
    print("2.Summary using TF-IDF")
    print("Enter your choice")
    choice = int(input())
    if choice == 1:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            rankedText = TextRank.summaryGen(prodslist[ch], domain,
                                             debugging=True)
        else:
            rankedText = TextRank.summaryGen(prodslist[ch], domain)
        sleep(3)
    if choice == 2:
        print("Do you want to enable debugging (Y/N)?")
        ch_debug = input().lower()
        print("Do you want to enter the token size (Y/N)?")
        ch_token = input().lower()
        if ch_debug == "y" or ch_debug == "yes":
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain,
                                                     gram=token, debug=True)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain,
                                                     debug=True)
        else:
            if ch_token == "y" or ch_token == "yes":
                print("Enter token size")
                token = int(input())
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain,
                                                     gram=token)
            else:
                rankedText = TFIDFSummary.summaryGen(prodslist[ch], domain)
    # NOTE(review): any choice other than 1 or 2 leaves rankedText
    # undefined and raises NameError below, as in the original.
    keys = keywords.extract_keywords(domain, prodslist[ch])
    # Single join instead of the original quadratic += loop.
    rankedSummary = "".join(rankedText)
    stopwords = load_stop_words("../stoplist.txt")
    tokenizer = RegexpTokenizer(r"[\w']+", flags=re.UNICODE)
    tokens = tokenizer.tokenize(rankedSummary)
    tokens = [token for token in tokens if token.lower() not in stopwords]
    # Precision/recall of summary tokens vs. the keyword set; F-measure
    # is their harmonic mean (ZeroDivisionError if both come out zero).
    precision = float(len(set(tokens).intersection(set(keys)))) / float(
        len(tokens))
    recall = float(len(set(tokens).intersection(set(keys)))) / float(len(keys))
    fmeasure = 2 * (precision * recall) / (precision + recall)
    print("\n\n")
    print("Precision =", precision)
    print("Recall =", recall)
    print("F-Measure =", fmeasure)
def parse_custom(domain="cellphones"):
    """Interactively score a user-entered review for deceptiveness.

    Prompts for a brand, a product, and a multi-line review (terminated
    by EOF), computes seven stylometric features, sets the module-level
    `fakeness` percentage, and returns the feature vector `review_data`.

    Fixes for Python 2 leftovers that crash on Python 3:
    `j.items()[0][0]` (dict views are not subscriptable),
    `len(filter(...))` (filter is lazy), and `string.uppercase`
    (removed; `string.ascii_uppercase` is the Py3 name).
    """
    global fakeness
    brands = load(BRANDS_PARSED_PATH + '/' + domain.lower() + '.npz',
                  allow_pickle=True)
    brands = brands['arr_0'].tolist()
    brandslist = {}
    prodslist = {}
    c = 0
    for brand in brands.keys():
        brandslist[c + 1] = brand
        print(str(c + 1) + ". " + brand + "\n")
        c += 1
    print("Enter your choice")
    ch = int(input())
    selectedBrand = brandslist[ch]
    c = 0
    for prods in range(len(brands[selectedBrand])):
        for prod in brands[selectedBrand][prods].keys():
            prodslist[c + 1] = brands[selectedBrand][prods][prod]
            print(str(c + 1) + ". " + prod + "\n")
            c += 1
    print("Enter your choice")
    ch = int(input())
    review = ""
    print("Enter your review")
    while True:
        try:
            line = input("")
        except EOFError:
            break
        review += line
    print("\nPlease wait a moment. Processing the result...\n")
    keywords_list = keywords.extract_keywords(domain, prodslist[ch])
    stopwords = nltk.corpus.stopwords.words()
    # Build the vocabulary of brand and product-title words.
    brand_words = []
    for i in brands.keys():
        brand_words += i.split()
        for j in brands[i]:
            # j looks like a single-entry dict {title: ...}; take the
            # title's words.  (Was j.items()[0][0] — Python 2 only.)
            brand_words += next(iter(j)).split()
    brand_words = [token for token in brand_words if token not in stopwords]
    brandset = set(brand_words)
    text = nltk.word_tokenize(review)
    cnt = 0
    keyword_cnt = 0
    for i in text:
        if i in brandset:
            cnt += 1
        if i in keywords_list:
            keyword_cnt += 1
    pos_tagged = nltk.pos_tag(text)
    analyze_text = TextBlob(review)
    counts = Counter(tag for word, tag in pos_tagged)
    # Count capital letters.  (Was len(filter(...)) on string.uppercase —
    # both Python 2 only.)
    caps = sum(1 for x in review if x in string.ascii_uppercase)
    review_status = [0 for i in range(7)]
    review_data = [0 for i in range(7)]
    if len(review) != 0:
        c = Counter(c for c in review if c in ["?", "!"])
        # Feature vector: possessive-pronoun ratio, subjectivity,
        # capitalisation ratio, punctuation-emphasis ratio, length
        # (in thousands of words), brand-word ratio, keyword ratio.
        review_data[0] = float(counts['PRP$']) / float(len(analyze_text.words))
        review_data[1] = analyze_text.subjectivity
        review_data[2] = float(caps) / len(review)
        review_data[3] = float(c["?"] + c["!"]) / len(review)
        review_data[4] = float(len(analyze_text.words)) / 1000
        review_data[5] = float(cnt) / float(len(analyze_text.words))
        review_data[6] = float(keyword_cnt) / float(len(analyze_text.words))
    # Threshold each feature into a binary "suspicious" flag.
    if review_data[0] > 0:
        review_status[0] = 1
    if review_data[1] < 0.5:
        review_status[1] = 1
    if review_data[2] >= 0.5:
        review_status[2] = 1
    if review_data[3] >= 0.1:
        review_status[3] = 1
    if review_data[4] <= 0.135:
        review_status[4] = 1
    if review_data[5] >= 0.5 or review_data[5] <= 0.1:
        review_status[5] = 1
    if review_data[6] < 0.5:
        review_status[6] = 1
    detection_counter = collections.Counter(review_status)
    deceptive_level = (float(detection_counter[1]) / 7) * 100
    fakeness = deceptive_level
    return review_data
# NOTE(review): orphaned top-level fragment (Python 2 syntax) — it
# duplicates the TF-IDF tail of main() above.  ch_debug, ch_token,
# prodslist, ch and domain are not defined at this scope, so this
# cannot run as-is; presumably a copy/paste remnant — confirm and
# remove or fold back into its function.
if ch_debug=="y" or ch_debug=="yes":
    if ch_token=="y" or ch_token=="yes":
        print "Enter token size"
        token=int(raw_input())
        rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain,gram=token,debug=True)
    else:
        rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain,debug=True)
else:
    if ch_token=="y" or ch_token=="yes":
        print "Enter token size"
        token=int(raw_input())
        rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain,gram=token)
    else:
        rankedText=TFIDFSummary.summaryGen(prodslist[ch],domain)
# Evaluate the generated summary against the extracted keywords.
keys=keywords.extract_keywords(domain,prodslist[ch])
rankedSummary=""
for i in range(len(rankedText)):
    rankedSummary+=rankedText[i]
stopwords=load_stop_words("../stoplist.txt")
tokenizer = RegexpTokenizer("[\w']+", flags=re.UNICODE)
tokens = tokenizer.tokenize(rankedSummary)
tokens = [token for token in tokens if token.lower() not in stopwords]
# Precision/recall of summary tokens vs. the keyword set; F-measure is
# their harmonic mean (ZeroDivisionError if both come out zero).
precision = float(len(set(tokens).intersection(set(keys))))/float(len(tokens))
recall = float(len(set(tokens).intersection(set(keys))))/float(len(keys))
fmeasure = 2*(precision*recall)/(precision+recall)
print "\n\n"
print "Precision =",precision
print "Recall =",recall
print "F-Measure =",fmeasure