def search_web(self):
    """Search the web for ``self.topic`` and collect the result URLs.

    @Author: Adarsh Koppa Manjunath
    @Parameter: topic(str)- News topic user input (read from self.topic)
    @Return: web_result_list(list)- list of URLs"""
    collected_urls = []
    try:
        hits = search(self.topic, lang="en", num=self.num,
                      stop=self.stop, pause=1)
        for hit in hits:
            # Document links (PDF/Office files) cannot be parsed as HTML,
            # so they are logged and skipped.
            if hit.endswith((".pdf", ".docx", ".ppt")):
                log.debug("URL with extensions ignored-%s" % (hit))
                continue
            collected_urls.append(hit)
        return collected_urls
    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
        return collected_urls
def thread_func(self):
    """Search the web for ``self.topic`` and extract content concurrently.

    First step is getting the list of URLs (``search_web``); second is
    spawning one thread per URL to run ``extract_content``, which fills
    ``self.final_output`` (and ``self.sentiment_dict`` when requested).

    @Author: Adarsh Koppa Manjunath
    @Parameter: topic(str)- to be searched on web (read from self.topic)
    @Return: final_output(Dict): URL as key and description as value,
             or a str error message when the topic is not English.
    """
    try:
        langobj = validation()
        lang = langobj.isEnglish(self.topic)
        # Idiomatic truthiness check instead of `== False`.
        if not lang:
            return "Oops! entered topic is not in English! we support only English language at the moment"
        web_result_list = self.search_web()
        log.debug("List of URLs-%s" % (web_result_list))
        # One worker thread per URL; extract_content writes into
        # self.final_output keyed by URL, so results need no ordering.
        threads = [
            threading.Thread(target=self.extract_content, args=(url, ))
            for url in web_result_list
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        log.debug("final output from search file-%s" % (self.final_output))
        # "na" means the user did not ask for sentiment-based ordering.
        if self.sentiment != "na":
            self.order_output()
        return self.final_output
    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
        return self.final_output
def sentiment_analysis(self, text):
    """Score the sentiment of a collection of words.

    @Author: Adarsh Koppa Manjunath
    @Parameter: text(list): list of words
    @Return: (positive, negative) percentages as ints, or the string
             "exception failed" on error (legacy error contract kept
             for existing callers).
    """
    try:
        # Single pass: each word's polarity is computed once instead of
        # twice (the original ran TextBlob over the input in two loops,
        # with both denominators equal to len(text)).
        total = 0
        pos_correct = 0
        neg_correct = 0
        for word in text:
            polarity = TextBlob(word).sentiment.polarity
            if polarity > 0:
                pos_correct += 1
            elif polarity < 0:
                neg_correct += 1
            total += 1
        # Guard against ZeroDivisionError on empty input.
        if total == 0:
            return (0, 0)
        return (int(pos_correct / total * 100.0),
                int(neg_correct / total * 100.0))
    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
        return "exception failed"
def clean_text(self, text):
    """Clean text for sentiment analysis: strip bracketed spans and
    punctuation, stem, tokenize, and drop English stop words.

    @Author: Adarsh Koppa Manjunath
    @Parameters: text(str): text to be cleaned
    @Return: filtered_tokens(list): cleaned, stemmed tokens, or the
             string "exception: failed" on error (legacy contract).
    """
    try:
        # Remove square-bracketed spans, e.g. citation markers like [1].
        text = re.sub(r'\[[^]]*\]', '', text)
        # Remove everything except letters, digits and whitespace.
        # BUG FIX: the original class was [^a-zA-z0-9\s]; the `A-z` range
        # also spans the characters [ \ ] ^ _ ` so those were never removed.
        pattern = r'[^a-zA-Z0-9\s]'
        text = re.sub(pattern, '', text)
        # Stem each word to its root form.
        ps = nltk.porter.PorterStemmer()
        text = ' '.join([ps.stem(word) for word in text.split()])
        # Tokenization and stop-word removal.
        tokenizer = ToktokTokenizer()
        stopword_list = set(stopwords.words('english'))
        tokens = tokenizer.tokenize(text)
        tokens = [token.strip() for token in tokens]
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopword_list
        ]
        return filtered_tokens
    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
        return "exception: failed"
def search(request):
    """This function communicates between template and search_web files"""
    try:
        if request.method == "POST":
            form = GeneralForms(request.POST)
            if form.is_valid():
                topic = form.cleaned_data['name']
                sentiment = form.cleaned_data['options']
                searcher = SearchWeb(topic, final_output={},
                                     sentiment=sentiment, sentiment_dict={})
                result = searcher.thread_func()
                # A plain string signals a user-facing error message.
                if isinstance(result, str):
                    return render(request, "alert.html", {'result': result})
                return render(request, "results.html",
                              {'result': result, 'sentiment': sentiment})
        # GET request, or an invalid form: show a fresh search form.
        return render(request, "home.html", {'form': GeneralForms()})
    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
def extract_content(self, url):
    """Fetch a URL, extract its visible text with BeautifulSoup, store a
    cleaned excerpt in ``self.final_output`` and, when requested, a
    sentiment score in ``self.sentiment_dict``.

    @Source: https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
    @Parameter: URL(str)- needs to be extracted
    @Return: ""(str)- results are delivered via self.final_output /
             self.sentiment_dict, not the return value.
    """
    try:
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(self.tag_visible, texts)
        text = u" ".join(t.strip() for t in visible_texts)
        # Skip pages with too little visible content to summarize.
        if len(text) < 1000:
            return ""
        # Take a mid-page slice (skips nav/header boilerplate) and drop
        # the partial first/last sentences.
        text = text[3000:9000].split(". ")
        self.final_output[url] = " ".join(sen for sen in text[1:-1])
        text = self.clean_text(str(text))
        if self.sentiment != "na":
            positive, negative = self.sentiment_analysis(str(text))
            log.debug("\n URL:%s \n Positive-%s \n Negative-%s" %
                      (url, positive, negative))
            # BUG FIX: the original compared a local `sentiment` variable
            # that was initialized to "" and never reassigned, so the
            # negative score could never be stored. Compare the
            # user-selected option on self instead.
            if self.sentiment == "negative":
                self.sentiment_dict[url] = negative
            else:
                self.sentiment_dict[url] = positive
            log.debug(self.sentiment_dict)
        return ""
    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
        # Preserve the original error contract: return an empty string
        # (the original returned the local `sentiment`, always "").
        return ""