Example #1
0
    def search_web(self):
        """Search the web for the configured news topic and collect result URLs.

        Links to document files (.pdf, .docx, .ppt) are skipped because they
        cannot be scraped as regular HTML pages downstream.

        @Author: Adarsh Koppa Manjunath
        @Parameter:
            topic(str)- News topic user input (read from self.topic)
        @Return:
            web_result_list(list)- list of URLs (empty on failure)"""
        web_result_list = []
        try:
            web_results = search(self.topic,
                                 lang="en",
                                 num=self.num,
                                 stop=self.stop,
                                 pause=1)
            for url in web_results:
                # Document downloads cannot be parsed as HTML; drop them.
                if url.endswith((".pdf", ".docx", ".ppt")):
                    log.debug("URL with extensions ignored-%s" % (url))
                else:
                    web_result_list.append(url)
            return web_result_list

        except Exception as e:
            log.error('An exception occurred: {}'.format(e))
            log.error(traceback.format_exc())
            return web_result_list
Example #2
0
    def thread_func(self):
        """Search the web for self.topic and extract content from each result URL.

        First obtains the list of result URLs via search_web(), then spawns one
        thread per URL to extract page content in parallel and joins them all
        before reading the shared result dict.

        @Author: Adarsh Koppa Manjunath
        @Parameter:
            topic(str)- topic to be searched on web (read from self.topic)
        @Return:
            final_output(Dict): URL as key and description as value, or a str
            error message when the topic is not in English
        """
        try:
            langobj = validation()
            lang = langobj.isEnglish(self.topic)
            # Only English topics are supported; bail out early otherwise.
            if lang is False:
                return "Oops! entered topic is not in English! we support only English language at the moment"

            web_result_list = self.search_web()
            log.debug("List of URLs-%s" % (web_result_list))
            threads = [
                threading.Thread(target=self.extract_content, args=(url, ))
                for url in web_result_list
            ]
            for thread in threads:
                thread.start()
            # Join every thread before touching self.final_output so all
            # extraction results are in place.
            for thread in threads:
                thread.join()
            log.debug("final output from search file-%s" % (self.final_output))
            if self.sentiment != "na":
                self.order_output()

            return self.final_output

        except Exception as e:
            log.error('An exception occurred: {}'.format(e))
            log.error(traceback.format_exc())
            return self.final_output
Example #3
0
    def sentiment_analysis(self, text):
        """Compute the percentage of positive and negative words in text.

        @Author: Adarsh Koppa Manjunath
        @Parameter:
            text(list): list of words
        @return:
            (positive, negative): tuple of int percentages (0-100), or the
            string "exception failed" if analysis raises."""
        try:
            # Guard empty input: the original divided by the word count and
            # raised ZeroDivisionError, falling into the broad except below.
            if not text:
                return (0, 0)

            total = 0
            pos_correct = 0
            neg_correct = 0
            # Single pass: score each word's polarity once instead of
            # re-analysing the whole list separately per sentiment.
            for word in text:
                polarity = TextBlob(word).sentiment.polarity
                if polarity > 0:
                    pos_correct += 1
                elif polarity < 0:
                    neg_correct += 1
                total += 1

            return (int(pos_correct / total * 100.0),
                    int(neg_correct / total * 100.0))

        except Exception as e:
            log.error('An exception occurred: {}'.format(e))
            log.error(traceback.format_exc())
            return "exception failed"
Example #4
0
    def clean_text(self, text):
        """Clean text for sentiment analysis: strip bracketed spans and special
        characters, stem each word, then tokenize and drop English stopwords.

        @Author: Adarsh Koppa Manjunath
        @Parameters:
            text(str): text to be cleaned
        @return
            filtered_tokens(list): cleaned tokens, or the string
            "exception: failed" if cleaning raises."""
        try:
            # Remove square-bracketed spans (e.g. citation markers).
            text = re.sub(r'\[[^]]*\]', '', text)
            # Remove special characters, keeping letters, digits, whitespace.
            # BUG FIX: the original class [^a-zA-z0-9\s] used the range A-z,
            # which accidentally also kept the punctuation [ \ ] ^ _ `
            pattern = r'[^a-zA-Z0-9\s]'
            text = re.sub(pattern, '', text)
            # Stem each word to its root form.
            ps = nltk.porter.PorterStemmer()
            text = ' '.join(ps.stem(word) for word in text.split())
            # Tokenize and drop English stopwords (set gives O(1) membership).
            tokenizer = ToktokTokenizer()
            stopword_list = set(stopwords.words('english'))
            tokens = [token.strip() for token in tokenizer.tokenize(text)]
            filtered_tokens = [
                token for token in tokens if token.lower() not in stopword_list
            ]

            return filtered_tokens

        except Exception as e:
            log.error('An exception occurred: {}'.format(e))
            log.error(traceback.format_exc())
            return "exception: failed"
Example #5
0
def search(request):
    """Handle the search page: dispatch form submissions to SearchWeb.

    GET (or an invalid POST) renders the empty search form; a valid POST runs
    the web search and renders either an alert page (string result means an
    error/notice message) or the results page.
    """
    try:
        if request.method == "POST":
            form = GeneralForms(request.POST)

            if form.is_valid():
                topic = form.cleaned_data['name']
                sentiment = form.cleaned_data['options']
                search_web_obj = SearchWeb(topic,
                                           final_output={},
                                           sentiment=sentiment,
                                           sentiment_dict={})
                result = search_web_obj.thread_func()

                # A string result is an error/notice message, not search data.
                if isinstance(result, str):
                    return render(request, "alert.html", {'result': result})
                return render(request, "results.html",
                              {'result': result, 'sentiment': sentiment})

        form = GeneralForms()
        return render(request, "home.html", {'form': form})

    except Exception as e:
        log.error('An exception occurred: {}'.format(e))
        log.error(traceback.format_exc())
        # BUG FIX: a Django view must return an HttpResponse; the original
        # returned None here, which makes Django raise a 500 ValueError.
        return render(request, "alert.html",
                      {'result': "Something went wrong, please try again."})
Example #6
0
    def extract_content(self, url):
        """Download url, extract its visible text, store a description in
        self.final_output and (when sentiment is requested) a score in
        self.sentiment_dict.

        @Source: https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
        @Parameter:
            URL(str)- needs to be extracted
        @Return:
            text(str)- always "" (results are delivered via self attributes)"""
        try:
            html = urllib.request.urlopen(url).read()
            soup = BeautifulSoup(html, 'html.parser')
            texts = soup.findAll(text=True)
            visible_texts = filter(self.tag_visible, texts)
            text = u" ".join(t.strip() for t in visible_texts)
            # Pages with too little visible text carry no useful description.
            if len(text) < 1000:
                return ""

            # Keep a mid-page slice and drop the partial first/last sentences.
            text = text[3000:9000].split(". ")
            self.final_output[url] = " ".join(sen for sen in text[1:-1])

            text = self.clean_text(str(text))

            if self.sentiment != "na":
                positive, negative = self.sentiment_analysis(str(text))
                log.debug("\n URL:%s \n Positive-%s \n Negative-%s" %
                          (url, positive, negative))
                # BUG FIX: the original compared a local variable `sentiment`
                # (always ""), making the negative branch unreachable; compare
                # the user's requested sentiment on self instead.
                if self.sentiment == "negative":
                    self.sentiment_dict[url] = negative
                else:
                    self.sentiment_dict[url] = positive
                log.debug(self.sentiment_dict)
            return ""

        except Exception as e:
            log.error('An exception occurred: {}'.format(e))
            log.error(traceback.format_exc())
            return ""