Пример #1
0
    def saveDB(self, data, source):
        """POST a crawled article to the remote storage endpoint (self.post_url).

        Args:
            data: dict with keys 'sourceUrl', 'title', 'authorName',
                'releaseTime', 'content', 'img', 'jobId'.
            source: integer id of the crawl source
                (1=TopBuzz, 2=NewsBreak, 3=BuzzFeed, 4=GoogleNews, 5=SmartNews).

        Articles whose content is 500 characters or shorter are skipped.
        Errors are logged via NewsFeeds().point_log and never propagated,
        so one bad article cannot stop a crawl.
        """
        try:
            # Skip very short articles (likely boilerplate or parse failures).
            if len(data['content']) > 500:
                payload = urllib.parse.urlencode({
                    'source': source,
                    'sourceUrl': data['sourceUrl'],
                    'title': data['title'],
                    'authorName': data['authorName'],
                    'releaseTime': data['releaseTime'],
                    'content': data['content'],
                    'img': data['img'],
                    'jobId': data['jobId'],
                }).encode('utf-8')
                # BUGFIX: the response object was never closed and its local
                # name shadowed the `re` module; use a context manager.
                with urllib.request.urlopen(url=self.post_url,
                                            data=payload) as response:
                    # Drain the response body; the result is intentionally
                    # discarded (only the POST side effect matters).
                    response.read().decode('utf-8')

        except BaseException as e:
            NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())),
                                  'SavePostSql\t', str(e))
Пример #2
0
    def saveMySql(self,
                  data,
                  tableName=None,
                  host='localhost',
                  port=3306,
                  user='******',
                  password='******',
                  db='crawler',
                  charset='utf8',
                  *args):
        """Insert a crawled article into the MySQL `news` table.

        Args:
            data: dict with keys 'sourceUrl', 'title', 'authorName',
                'releaseTime', 'content', 'img'.
            tableName: unused; kept for interface compatibility.
            host/port/user/password/db/charset: MySQL connection settings.

        Errors are logged via NewsFeeds().point_log and never propagated.
        """
        conn = pymysql.Connect(host=host,
                               user=user,
                               password=password,
                               port=port,
                               database=db,
                               charset=charset)
        cursor = conn.cursor()
        try:
            # Parameterized statement — values are bound, not interpolated.
            sql = 'insert into news(sourceUrl,title,authorName,releaseTime,content,img) VALUES (%s,%s,%s,%s,%s,%s)'

            cursor.execute(sql, [
                data['sourceUrl'], data['title'], data['authorName'],
                data['releaseTime'], data['content'], data['img']
            ])
            conn.commit()

        except BaseException as e:
            NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())),
                                  'SaveMySql\t', str(e))
        finally:
            # BUGFIX: the cursor and connection were never closed, leaking a
            # MySQL connection per call; always release them.
            cursor.close()
            conn.close()
Пример #3
0
 def mainNewsBreak(self):
     """Crawl the NewsBreak best-channel feed and persist each new article."""
     feeds = NewsFeeds()
     saver = SaveSqlDb()
     breaker = NewsBreak()
     url_nb = 'http://api.particlenews.com/Website/channel/news-list-for-best-channel?cstart=0&infinite=true&refresh=1&epoch=5&distribution=newsbreak&platform=1&cv=4.7.3&cend=10&appid=newsbreak&weather=true&fields=docid&fields=date&fields=image&fields=image_urls&fields=like&fields=source&fields=title&fields=url&fields=comment_count&fields=fb_share_total&fields=coach_mark_text&fields=up&fields=down&fields=summary&fields=favicon_id&fields=dominant_image&fields=contextMeta&fields=video_urls&fields=viewType&push_refresh=0&modularize=true&ts=2019-04-07+18%3A14%3A01+%2B0800&version=020025&net=wifi'
     docId = breaker.parsingPost(url=url_nb)
     get_url = ('http://api.particlenews.com/Website/contents/content?related_docs=false&cv=4.7.3'
                '&docid=' + docId +
                '&appid=newsbreak&bottom_channels=false&distribution=newsbreak&platform=1&version=020025&net=wifi')
     news_list = breaker.parsingGet(url=get_url)
     path = '/data/crawler'
     pic_path = '/newsbreak/picture/'
     number = 1
     for url in news_list:
         # Guard clauses: skip already-seen URLs and failed parses.
         if url in self.have_met:
             continue
         self.have_met.add(url)
         data = feeds.parsingUrl(url=url,
                                 downloadPath=path,
                                 picPath=pic_path)
         if data is None:
             continue
         print('NB_detail_url\t', url)
         print('NB_number\t', number)
         number += 1
         # Fall back to the crawl's reference time when no release time parsed.
         if data['releaseTime'] is None or data['releaseTime'] == '':
             data['releaseTime'] = str(self.point_time)
         if self.post_DB:
             saver.saveDB(data=data, source=2)
         else:
             saver.saveMySql(data=data)
Пример #4
0
 def mainSmartNews(self):
     """Crawl the SmartNews feed and persist each new article."""
     feeds = NewsFeeds()
     saver = SaveSqlDb()
     smart = SmartNews()
     news_list = smart.smartNews()
     path = '/data/crawler'
     pic_path = '/smartNews/picture/'
     number = 1
     for new in news_list:
         # Guard clauses: skip already-seen URLs and failed parses.
         if new in self.have_met:
             continue
         self.have_met.add(new)
         data = feeds.parsingUrl(url=new,
                                 downloadPath=path,
                                 picPath=pic_path)
         if data is None:
             continue
         print('SM_detail_url\t', new)
         print('SM_number\t', number)
         number += 1
         # Fall back to the crawl's reference time when no release time parsed.
         if data['releaseTime'] is None or data['releaseTime'] == '':
             data['releaseTime'] = str(self.point_time)
         if self.post_DB:
             saver.saveDB(data=data, source=5)
         else:
             saver.saveMySql(data=data)
Пример #5
0
 def mainBuzzFeed(self):
     """Crawl the BuzzFeed top + news feeds and persist each new article."""
     feeds = NewsFeeds()
     saver = SaveSqlDb()
     buzz = BuzzFeed()
     # Combine both listing endpoints into one work queue.
     urls_list = buzz.parsingTopUrl() + buzz.parsingNewsUrl()
     path = '/data/crawler'
     pic_path = '/buzzfeed/picture/'
     number = 1
     for url in urls_list:
         # Guard clauses: skip already-seen URLs and failed parses.
         if url in self.have_met:
             continue
         self.have_met.add(url)
         data = feeds.parsingUrl(url=url,
                                 downloadPath=path,
                                 picPath=pic_path)
         if data is None:
             continue
         print('BF_detail_url\t', url)
         print('BF_number\t', number)
         number += 1
         # Fall back to the crawl's reference time when no release time parsed.
         if data['releaseTime'] is None or data['releaseTime'] == '':
             data['releaseTime'] = str(self.point_time)
         if self.post_DB:
             saver.saveDB(data=data, source=3)
         else:
             saver.saveMySql(data=data)
Пример #6
0
 def mainGoogleNews(self):
     """Crawl the Google News feed and persist each new article.

     Iterates gn.googleNews() entries (each exposing `.link.text` as the
     article URL), parses each unseen URL with NewsFeeds().parsingUrl and
     stores the result via saveDB (source=4) or saveMySql.
     """
     n = NewsFeeds()
     s = SaveSqlDb()
     gn = GoogleNews()
     news_list = gn.googleNews()
     path = '/data/crawler'
     pic_path = '/googleNews/picture/'
     number = 1
     for new in news_list:
         url = new.link.text
         if url in self.have_met:
             continue
         # BUGFIX: unlike every sibling main* crawler, this method never
         # recorded visited URLs, so duplicates were re-fetched on each run.
         self.have_met.add(url)
         data = n.parsingUrl(url=url,
                             downloadPath=path,
                             picPath=pic_path)
         if data is None:
             continue
         print('GN_detail_url\t', url)
         print('GN_number\t', number)
         number += 1
         # Fall back to the crawl's reference time when no release time parsed.
         if data['releaseTime'] is None or data['releaseTime'] == '':
             data['releaseTime'] = str(self.point_time)
         if self.post_DB:
             s.saveDB(data=data, source=4)
         else:
             s.saveMySql(data=data)
Пример #7
0
 def parsingShopping(self):
     """Collect article URLs from BuzzFeed shopping-feed pages 1-5.

     Returns a list of hrefs; on any failure the error is logged via
     NewsFeeds().point_log and None is returned implicitly.
     """
     try:
         collected = []
         for page in range(1, 6):
             page_url = 'https://www.buzzfeed.com/us/feedpage/feed/shopping?page='+str(page)+'&page_name=shopping'
             markup = requests.get(url=page_url, headers=self.headers, cookies=self.cookies).text
             tree = etree.HTML(markup)
             collected.extend(tree.xpath('//a[@class="js-card__link link-gray"]/@href'))
         return collected
     except BaseException as e:
         NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'BuzzFeedParsingShoppingUrl]\t', str(e))
Пример #8
0
 def parsingNewsUrl(self):
     """Collect article URLs from BuzzFeed News home-feed pages 1-5.

     Returns a list of hrefs; on any failure the error is logged via
     NewsFeeds().point_log and None is returned implicitly.
     """
     try:
         collected = []
         for page in range(1, 6):
             page_url = 'https://www.buzzfeednews.com/us/feed/home?page=' + str(page) + '&flexpro_enabled=1'
             markup = requests.get(url=page_url, headers=self.headers, cookies=self.cookies).text
             tree = etree.HTML(markup)
             collected.extend(tree.xpath('//div[@class="news-feed grid-layout-main"]//article/a/@href'))
         return collected
     except BaseException as e:
         NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'BuzzFeedParsingNewsUrl]\t', str(e))
Пример #9
0
 def parsingGet(self, url):
     """Fetch a NewsBreak content endpoint and return its article URLs.

     Parses the JSON response's "documents" list and returns each entry's
     'url'. On failure the error is logged and None is returned implicitly.
     """
     try:
         raw = requests.get(url, headers=self.headers,
                            cookies=self.cookies).text
         documents = json.loads(raw)["documents"]
         return [doc['url'] for doc in documents]
     except BaseException as e:
         NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())),
                               'NewsBreakGet\t', str(e))
Пример #10
0
    def sendRequest(self, url):
        """POST to the TopBuzz stream endpoint and return its article URLs.

        Parses the fetched JSON ('data' -> 'items') and returns each item's
        'article_url'. On failure the error is logged and None is returned
        implicitly.
        """
        try:
            raw = requests.post(url=url,
                                headers=self.headers,
                                cookies=self.cookies).text
            items = json.loads(raw)['data']['items']
            # Each item carries the source URL of the original article.
            return [entry['article_url'] for entry in items]

        except BaseException as e:
            NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())),
                                  'TopBuzzSendRequest\t', str(e))
Пример #11
0
 def parsingTopUrl(self):
     """Collect article URLs from BuzzFeed "more buzz" pages 1-5.

     Returns a list of result URLs with empty/None entries skipped.
     On any failure the error is logged via NewsFeeds().point_log and
     None is returned implicitly.
     """
     try:
         urls_tops = []
         for pg in range(1, 6):
             top_url = 'https://www.buzzfeed.com/site-component/v1/en-us/morebuzz?page='+str(pg)+'&page_size=15&image_crop=wide'
             res = requests.get(url=top_url, headers=self.headers, cookies=self.cookies).text
             data = json.loads(res)
             for entry in data['results']:
                 res_url = entry['url']
                 # BUGFIX: the original tested for empty URLs but only
                 # `pass`ed, then appended them anyway; skip them instead.
                 if res_url is None or res_url == '':
                     continue
                 urls_tops.append(res_url)
         return urls_tops
     except BaseException as e:
         NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())), 'BuzzFeedParsingTopUrl]\t', str(e))
Пример #12
0
    def parsingPost(self, url):
        """POST to the NewsBreak listing endpoint and return doc ids.

        Parses the JSON response's 'result' list and returns a
        comma-joined string of each entry's 'docid'; entries without a
        usable 'docid' are skipped. On request/parse failure the error is
        logged via NewsFeeds().point_log and None is returned implicitly.
        """
        try:

            res = requests.post(url=url,
                                headers=self.headers,
                                data=self.data,
                                cookies=self.cookies).text
            data = json.loads(res)
            result = data['result']
            docList = []
            for entry in result:
                try:
                    docList.append(entry['docid'])
                # BUGFIX: was a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit; only skip entries that
                # lack a 'docid' key or are not mappings.
                except (KeyError, TypeError):
                    pass
            return ','.join(docList)

        except BaseException as e:
            NewsFeeds().point_log(str(NewsFeeds().localTime(time.time())),
                                  'NewsBreakPost', str(e))
Пример #13
0
    def mainTopBuzz(self):
        """Crawl the TopBuzz stream and persist each new article."""
        feeds = NewsFeeds()
        saver = SaveSqlDb()
        topbuzz = TopBuzz()

        # Access time: split the timestamp string into its integer part and
        # its fractional part ('.\d*' matches "1554174097" then ".123").
        now = time.time()
        parts = re.findall('.\d*', str(now))
        sign = topbuzz.hash_code(parts[1][1:])  # fractional digits, dot stripped
        timestamp = parts[0]                    # whole seconds
        url_tb = ('https://i16-tb.isnssdk.com/api/844/stream?session_impr_id=0&tab=General&count=20&min_behot_time=1.554174097999E9&loc_mode=7&lac=4314&cid=6439033'
                  '&sign=' + sign +
                  '&timestamp=' + timestamp +
                  '&logo=topbuzz&gender=0&bv_is_auto_play=0&youtube=0&manifest_version_code=844&app_version=8.4.4&iid=6672646082571388678&gaid=54b268f4-52c2-470c-a815-abd1d00acce9&original_channel=gp&channel=gp&fp=TlTrJzK1FYsqFYs5PlU1LMGSL2Xr&device_type=MIX+2&language=en&app_version_minor=8.4.4.01&resolution=2030*1080&openudid=ab50caa43e995042&update_version_code=8440&sys_language=zh&sys_region=cn&os_api=26&tz_name=Asia%2FShanghai&tz_offset=28800&dpi=440&brand=Xiaomi&ac=WIFI&device_id=6672637176796333574&os=android&os_version=8.0.0&version_code=844&hevc_supported=1&device_brand=Xiaomi&device_platform=android&sim_region=cn&region=us&aid=1106&ui_language=en')
        news_list = topbuzz.sendRequest(url=url_tb)
        path = '/data/crawler'
        pic_path = '/topbuzz/picture/'
        number = 1
        for url in news_list:
            # Guard clauses: skip already-seen URLs and failed parses.
            if url in self.have_met:
                continue
            self.have_met.add(url)
            data = feeds.parsingUrl(url=url,
                                    downloadPath=path,
                                    picPath=pic_path)
            if data is None:
                continue
            print('TB_detail_url\t', url)
            print('TB_number\t', number)
            number += 1
            # Fall back to the crawl's reference time when no release time parsed.
            if data['releaseTime'] is None or data['releaseTime'] == '':
                data['releaseTime'] = str(self.point_time)
            if self.post_DB:
                saver.saveDB(data=data, source=1)
            else:
                saver.saveMySql(data=data)