Example #1
    def save_to_analysis(self):
        print("calling save_to_analysis with {}".format(self.fromCountry))
        today = timezone.now()
        dates = today.date()  # truncate the timestamp to a plain date

        data = newsData.objects.filter(publisher=self.publisher, date=dates)
        analyzerObj = analyzer()
        to_country = ('USA', 'CHN', 'KOR', 'PRK', 'JPN')
        dataExport = dict()
        average = 0

        for to_c in to_country:
            for datum in data:
                dataExport = analyzerObj.toRelationship(datum, to_c)
                try:  # a matching record already exists: append the new IDs to it
                    check = collectedData.objects.get(
                        fromCountry=self.fromCountry,
                        toCountry=to_c,
                        date=dates)
                    # the record exists but this dataExport has not been folded in yet
                    if dataExport['toCheck'] and dataExport['toID'] not in check.toID:

                        check.toCheck = dataExport['toCheck']
                        check.to_num += dataExport['to_num']

                        if check.toID == "":
                            check.toID = dataExport['toID']
                        else:
                            check.toID += " " + dataExport['toID']
                        check.sumcompound += dataExport['compoundSum']
                        check.avgcompound = check.sumcompound / check.to_num

                    check.total_num = analyzerObj.total_articles(
                        self.publisher)  #regardless, always update total_num.
                    check.save()

                except ObjectDoesNotExist:  # no matching collectedData record yet: create one from this dataExport

                    if dataExport['to_num'] != 0:
                        average = dataExport['compoundSum'] / dataExport['to_num']
                    else:
                        average = 0
                    collectedData.objects.create(
                        fromCountry=self.fromCountry,
                        total_num=analyzerObj.total_articles(self.publisher),
                        to_num=dataExport['to_num'],
                        toCountry=to_c,
                        toCheck=dataExport['toCheck'],
                        toID=dataExport['toID'],
                        sumcompound=dataExport['compoundSum'],
                        avgcompound=average)
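
For context, save_to_analysis reads and writes two Django models that are not shown in this example. Below is a minimal sketch of the fields the method touches; the field types, lengths, and defaults are assumptions inferred from the calls above, not the project's actual definitions.

from django.db import models

class newsData(models.Model):
    # fields inferred from the filter() call above; types are assumptions
    publisher = models.CharField(max_length=50)
    date = models.DateField()

class collectedData(models.Model):
    # fields inferred from the get()/create() calls above; types are assumptions
    fromCountry = models.CharField(max_length=3)
    toCountry = models.CharField(max_length=3)
    date = models.DateField(auto_now_add=True)  # create() omits it, so a default is assumed
    toCheck = models.BooleanField(default=False)
    toID = models.TextField(default="")         # space-separated article IDs
    to_num = models.IntegerField(default=0)     # articles mentioning toCountry
    total_num = models.IntegerField(default=0)  # all articles for the publisher
    sumcompound = models.FloatField(default=0)  # running sum of sentiment scores
    avgcompound = models.FloatField(default=0)  # sumcompound / to_num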
Example #2
    def get_Contents(self):
        # getURL = 'https://newsapi.org/v2/top-headlines?sources=the-new-york-times&apiKey={0}'.format(settings.NYTIMES_API_KEY)
        getURL = 'https://api.nytimes.com/svc/topstories/v2/home.json?api-key={0}'.format(
            settings.NYTIMES_API_KEY)
        r = requests.get(getURL)
        json_data = r.json()
        dataExport = dict()  # crawled articles, exported for storage
        compoundValue = 0  # sentiment score

        #parse json
        # for article in json_data['articles']: #old api
        for article in json_data['results']:
            title = article['title']
            url = article['url']
            # time = article['publishedAt'][:10] #old api
            time = article['published_date'][:10]

            #parse html
            r2 = requests.get(url)
            if not r2.ok:
                continue  #if the url is not obtainable, skip to the next one
            soup = BSoup(r2.text, 'html.parser')

            pDict = dict()  #p tag dictionary
            nonReg = False  # True when the article body uses a non-standard 'g-...' class name instead of the expected one

            #parse p tag in html

            # first article type: the main body has an expected class name (e.g. css-1i0edl6 e2kc3sl0)
            for p in soup.find_all('p'):
                if p.get('class') is not None:
                    pName = " ".join(p.get('class'))
                    freq = pDict.get(pName)
                    if freq is not None:
                        freq += 1
                    else:
                        freq = 1
                    pDict.update({pName: freq})

            # second article type: the main body uses a 'g-...' class name,
            # so count div classes instead of p classes
            for p in pDict:
                if 'g-' in p:
                    nonReg = True
                    pDict = dict()  # reset and recount over div tags
                    for div in soup.find_all('div'):
                        if div.get('class') is not None:
                            divName = " ".join(div.get('class'))
                            pDict[divName] = pDict.get(divName, 0) + 1
                    break  # one pass is enough; avoid resetting the counts again

            # Find the most frequent class name, assuming the most frequent
            # class marks the elements that hold the article contents.
            count = 0
            pMost = None
            for p in pDict:
                if pDict[p] >= count:
                    count = pDict[p]
                    pMost = p

            # If a 'g-...' class name exists, the article body is in div tags; otherwise it is in p tags.
            if not nonReg:
                newsContent = soup.find_all("p", class_=pMost)
            else:
                newsContent = soup.find_all("div", class_=pMost)

            # if unable to retrieve the contents, skip the article [safeguard]
            if not newsContent:  # find_all returns a list, never None
                continue

            strContainer = ""  #contents for analysis
            strContainer.encode(encoding='UTF-8', errors='strict')
            for content in newsContent[:]:
                if nonReg:
                    if content.p is None:
                        continue
                    strContainer = strContainer + " " + content.p.text.replace(
                        '\n', '')
                else:
                    strContainer = strContainer + " " + content.text

            #run analysis class methods to obtain sentiment score (compoundValue) and most frequent words (wordFreq)
            analyzerObj = analyzer()
            compoundValue = analyzerObj.senti_Analysis(strContainer)
            wordFreq = analyzerObj.word_freq(strContainer)

            isArticle = len(strContainer) >= 20  # very short text is unlikely to be a real article

            #get one article
            dataExport.update({
                title: {
                    'url': url,
                    'time': time,
                    'compound': compoundValue,
                    'word_freq': wordFreq,
                    'isArticle': isArticle
                }
            })
        return dataExport
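
The heart of get_Contents is the heuristic that the most frequent class attribute among a page's p tags marks the article body. Isolated as a standalone function it is easier to test; the helper name most_common_class below is mine, not the project's.

from collections import Counter
from bs4 import BeautifulSoup

def most_common_class(soup, tag='p'):
    """Return the class string that appears most often on the given tag."""
    counts = Counter(
        " ".join(el.get('class'))
        for el in soup.find_all(tag)
        if el.get('class') is not None)
    return counts.most_common(1)[0][0] if counts else None

html = """
<p class="body">First paragraph.</p>
<p class="body">Second paragraph.</p>
<p class="caption">A photo caption.</p>
"""
soup = BeautifulSoup(html, 'html.parser')
print(most_common_class(soup))  # -> body

One small difference from the loop above: Counter.most_common breaks ties by insertion order rather than by the last key seen, which does not matter for this heuristic.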
Example #3
    def xmlParse(self, xmlurl, publisher, keyword):
        r = requests.get(xmlurl)
        soup = BSoup(r.text, 'xml')

        dataExport = dict()  # crawled articles, exported for storage
        compoundValue = 0  # sentiment score

        #parse xml
        for item in soup.find_all('item'):

            if publisher == 'yonhap':
                if '(Copyright)' in item.title.string:
                    continue
            title = item.title.string
            url = item.link.string

            if not newsData.objects.filter(
                    title=title).exists():  # skip articles that are already stored
                # publish date, normalized to YYYY-MM-DD for each publisher's format
                if publisher == 'yonhap':
                    time = item.pubDate.string[0:4] + '-' + item.pubDate.string[
                        4:6] + '-' + item.pubDate.string[6:8]
                elif publisher == 'ecns':
                    time = item.pubDate.string[:10]
                elif publisher == 'japantimes':
                    time = datetime.datetime.strptime(
                        item.pubDate.string[5:16],
                        "%d %b %Y").strftime("%Y-%m-%d")

                #parse html
                r2 = requests.get(url)
                if not r2.ok:
                    continue  #if the url is not obtainable, skip to the next one
                soup2 = BSoup(r2.text, 'html.parser')

                strContainer = ""  #contents for analysis
                strContainer.encode(encoding='UTF-8', errors='strict')

                if publisher in ('yonhap', 'ecns'):
                    newsContent = soup2.find_all("div", class_=keyword)
                elif publisher == 'japantimes':
                    newsContentfind = soup2.find('div', id=keyword)
                    if newsContentfind is not None:
                        newsContent = newsContentfind.find_all('p')
                    else:
                        continue
                else:
                    continue  # unknown publisher: newsContent would be undefined

                # if the article body is not obtainable, skip to the next article
                if not newsContent:  # find_all returns a list, never None
                    continue

                for content in newsContent:
                    strContainer = strContainer + " " + content.text

                isArticle = len(strContainer) >= 20  # very short text is unlikely to be a real article

                analyzerObj = analyzer()
                compoundValue = analyzerObj.senti_Analysis(strContainer)
                wordFreq = analyzerObj.word_freq(strContainer)

                dataExport.update({
                    title: {
                        'url': url,
                        'time': time,
                        'compound': compoundValue,
                        'word_freq': wordFreq,
                        'isArticle': isArticle
                    }
                })

        return dataExport
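
A usage sketch, assuming xmlParse lives on the same crawler class as the earlier methods. The class name Crawler, the feed URL, and the keyword below are illustrative placeholders, not values from the project.

crawler = Crawler()  # hypothetical name for the class these methods belong to
articles = crawler.xmlParse(
    xmlurl='https://example.com/english/rss.xml',  # placeholder RSS feed
    publisher='ecns',
    keyword='content')  # placeholder: class name of the div wrapping the article body
for title, info in articles.items():
    print(title, info['time'], info['compound'], info['isArticle'])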