示例#1
0
 def get_scrap_dict(self, scrap_id):
     """Fetch one document from ``crawler.scrap_data`` by its ``_id``.

     Returns whatever ``find_one({'_id': scrap_id})`` returns. If the
     lookup raises, the error is reported via ``print_exception_info()``
     and the method falls through, returning ``None``.
     """
     query = {'_id': scrap_id}
     try:
         return self.mongo.crawler.scrap_data.find_one(query)
     except Exception:
         print_exception_info()
示例#2
0
def get_res(url, headers=None, proxy=None, timeout=None):
    """Send an HTTP GET to *url* and return the response only on HTTP 200.

    Args:
        url: Address to request.
        headers: Optional header mapping forwarded to ``requests.get``.
        proxy: Optional proxies mapping forwarded as ``proxies=``.
        timeout: Optional timeout forwarded to ``requests.get``. The default
            ``None`` keeps the previous behaviour (no timeout); pass a value
            so that a stalled server cannot block the caller indefinitely.

    Returns:
        The response object when its status code is 200, otherwise ``None``.
        ``None`` is also returned when the request raises; the error is
        reported through ``print_exception_info()``.
    """
    try:
        res = requests.get(
            url, headers=headers, proxies=proxy, timeout=timeout)
        if res.status_code == 200:
            return res
    except Exception:
        print_exception_info()
    # Non-200 status or a failed request: make the "no result" case explicit.
    return None
示例#3
0
 def list_parser(list_data):
     """Yield ``item['id']['videoId']`` for every entry of ``list_data['items']``.

     Nothing is yielded when ``'items'`` is missing or empty. An entry whose
     ``'id'`` mapping has no ``'videoId'`` key yields ``None``. Any exception
     raised while iterating is reported via ``print_exception_info()`` and
     ends the generator.
     """
     try:
         # A missing or empty 'items' value becomes an empty iteration.
         for entry in list_data.get('items') or ():
             yield entry.get('id').get('videoId')
     except Exception:
         print_exception_info()
示例#4
0
def get_data(url, headers=None, proxy=None):
    """Request *url* through ``get_res`` and decode the reply with ``get_json``.

    Returns the value produced by ``get_json`` when ``get_res`` yields a
    truthy response, otherwise ``None``. Exceptions are reported via
    ``print_exception_info()`` and also result in ``None``.
    """
    try:
        response = get_res(url, headers, proxy)
        if not response:
            return None
        return get_json(response)
    except Exception:
        print_exception_info()
示例#5
0
 def get_ptoken(list_data):
     """Return ``list_data['nextPageToken']``, or ``''`` when absent or falsy.

     If reading the token raises, the error is reported via
     ``print_exception_info()`` and ``None`` is returned.
     """
     try:
         return list_data.get('nextPageToken') or ''
     except Exception:
         print_exception_info()
示例#6
0
 def get_search(self):
     """Return the search document for this object's keyword id.

     Resolves the keyword id with ``self.get_keyword_id()``, looks it up
     through a temporary ``SearchMongo`` and closes that helper again.
     A falsy lookup result is normalised to ``None``. Exceptions are
     reported via ``print_exception_info()`` and yield ``None`` as well.
     """
     try:
         keyword_id = self.get_keyword_id()
         store = SearchMongo()
         found = store.get_search_dict(keyword_id)
         store.close()
         if found:
             return found
         return None
     except Exception:
         print_exception_info()
示例#7
0
 def get_last_id(self):
     """Return the ``_id`` of the last document in ``crawler.scrap_data``.

     The ``_id`` of the final document returned by ``find()`` is converted
     with ``int``. An empty collection gives ``0``. Exceptions are reported
     via ``print_exception_info()`` and the method then returns ``None``.
     """
     try:
         # NOTE(review): this materialises every document just to read the
         # last one; confirm the collection stays small enough for that.
         docs = list(self.mongo.crawler.scrap_data.find())
         if not docs:
             return 0
         return int(docs[-1]['_id'])
     except Exception:
         print_exception_info()
示例#8
0
 def insert_scrap_dict(self):
     """Store ``self.scrap`` in ``crawler.scrap_data``, replacing a same-URL entry.

     If a document with ``url == self.url`` already exists it is removed
     with ``delete_scrap_dict()`` before ``self.scrap`` is inserted.
     Prints ``[ok]`` on success; on failure the error is reported via
     ``print_exception_info()``.
     """
     try:
         collection = self.mongo.crawler.scrap_data
         # Look the existing document up by URL. The earlier code passed
         # ``{'url': ...}`` to get_scrap_dict(), which filters on ``_id``;
         # the resulting filter ``{'_id': {'url': ...}}`` does not match on
         # the url field that delete_scrap_dict() uses, so stale documents
         # were left in place and duplicates accumulated.
         if collection.find_one({'url': self.url}):
             self.delete_scrap_dict()
         collection.insert_one(self.scrap)
     except Exception:
         print_exception_info()
     else:
         print('[ok]')
示例#9
0
 def get_keyword_id(self):
     """Return the ``key_id`` stored for ``self.crw_id``, or ``0`` if none.

     The crawler document is read through a temporary ``CrawlerMongo``
     which is closed right after the lookup. Exceptions are reported via
     ``print_exception_info()`` and the method then returns ``None``.
     """
     try:
         store = CrawlerMongo()
         crawler_doc = store.get_crawler_dict(self.crw_id)
         store.close()
         return crawler_doc['key_id'] if crawler_doc else 0
     except Exception:
         print_exception_info()
示例#10
0
 def reply_parser(reply_data):
     """Yield one ``reply.reply`` value per entry of ``reply_data['items']``.

     Each entry is turned into a ``Reply`` whose ``id`` is its 1-based
     position and whose ``username`` / ``text`` come from the top-level
     comment snippet. When the entry has a truthy ``'replies'`` value, the
     results of ``YoutubeParser.rereply_parser`` are stored in ``rreply``.
     Any exception is reported via ``print_exception_info()`` and ends the
     generator.
     """
     try:
         entries = reply_data.get('items')
         if not entries:
             return
         for position, entry in enumerate(entries, start=1):
             reply = Reply()
             top_snippet = entry['snippet']['topLevelComment']['snippet']
             reply.id = position
             reply.username = top_snippet['authorDisplayName']
             reply.text = top_snippet['textDisplay']
             if entry.get('replies'):
                 reply.rreply = list(
                     YoutubeParser.rereply_parser(entry['replies']))
             yield reply.reply
     except Exception:
         print_exception_info()
示例#11
0
 def tag_crawl(self, scrap_data):
     """Crawl videos day by day and store every parsed one via ``ScrapMongo``.

     Walks backwards from ``self.end_date`` to ``self.start_date`` one day
     at a time. For each day the video ids come from ``scrap_data['list']``;
     each id is parsed with ``scrap_data['body']`` and, when that yields a
     truthy result, enriched with ``scrap_data['reply']`` and inserted.

     Args:
         scrap_data: Mapping holding three callables under the keys
             ``'list'`` (called with a date), ``'body'`` and ``'reply'``
             (both called with a video id).

     Any exception is reported via ``print_exception_info()``; the
     ``ScrapMongo`` helper is closed in every case.
     """
     print('tag crawl')
     try:
         smongo = ScrapMongo()
         try:
             scrap_id = smongo.get_last_id()
             std_date = self.end_date
             while std_date >= self.start_date:
                 for vid in scrap_data['list'](std_date):
                     print(base_urls[self.platform] + vid)
                     # The id advances even for skipped videos, as before.
                     scrap_id += 1
                     scrap = scrap_data['body'](vid)
                     if not scrap:
                         # Nothing was parsed for this video. Previously the
                         # insert still ran, using either an unbound name
                         # (aborting the whole crawl) or the previous
                         # video's data (a duplicate insert).
                         continue
                     scrap.id = scrap_id
                     scrap.reply = scrap_data['reply'](vid)
                     smongo.scrap = scrap.scrap
                     smongo.insert_scrap_dict()
                 std_date = date_timedelta(std_date, days=-1)
         finally:
             # Release the helper even when a step above raised.
             smongo.close()
     except Exception:
         print_exception_info()
示例#12
0
 def body_parser(body_data):
     """Build a ``ScrapData`` from the first entry of ``body_data['items']``.

     Copies title, channel id/title, description, URL, view and like
     counts, the default thumbnail URL and the publication time out of the
     entry's ``'snippet'`` and ``'statistics'`` mappings. A missing or
     falsy like count becomes ``0``. When ``'items'`` is missing or empty
     the untouched ``ScrapData`` is returned. Exceptions are reported via
     ``print_exception_info()`` and the function then returns ``None``.
     """
     scrap = ScrapData()
     video_url = 'https://www.youtube.com/watch?v='
     try:
         items = body_data.get('items')
         if not items:
             return scrap
         first = items[0]
         snippet = first['snippet']
         scrap.title = snippet['title']
         scrap.userid = snippet['channelId']
         scrap.username = snippet['channelTitle']
         scrap.text = snippet['description']
         scrap.url = video_url + first['id']
         statistics = first['statistics']
         scrap.view = int(statistics['viewCount'])
         like_count = statistics.get('likeCount')
         scrap.like = int(like_count) if like_count else 0
         scrap.profile = snippet['thumbnails']['default']['url']
         # ``p`` is a pattern defined outside this block; groups 1 and 2
         # are joined into the string parsed by strptime below.
         m_date = p.search(snippet['publishedAt'])
         date_str = ' '.join(m_date.group(1, 2))
         scrap.publishedat = datetime.strptime(date_str,
                                               '%Y-%m-%d %H:%M:%S')
         return scrap
     except Exception:
         print_exception_info()
示例#13
0
 def insert_search_dict(self):
     """Insert ``self.search`` into the ``crawler.search`` collection.

     Returns ``None``. A failing insert is reported through
     ``print_exception_info`` instead of being raised.
     """
     collection = self.mongo.crawler.search if False else None
     try:
         collection = self.mongo.crawler.search
         collection.insert_one(self.search)
     except Exception as exc:
         # NOTE(review): most call sites invoke print_exception_info()
         # without an argument; confirm it accepts the exception here.
         print_exception_info(exc)
示例#14
0
 def delete_scrap_dict(self):
     """Delete one ``crawler.scrap_data`` document whose ``url`` is ``self.url``.

     Returns ``None``. Errors are reported via ``print_exception_info()``
     rather than raised.
     """
     try:
         url_filter = {'url': self.url}
         self.mongo.crawler.scrap_data.delete_one(url_filter)
     except Exception:
         print_exception_info()
示例#15
0
 def connect(self, host='localhost', port=27017):
     """Create and return a ``MongoClient`` for *host* and *port*.

     Args:
         host: Server host name; defaults to ``'localhost'`` as before.
         port: Server port; defaults to ``27017`` as before.

     Returns:
         The new ``MongoClient``, or ``None`` when constructing it raised
         (the error is reported via ``print_exception_info()``).
     """
     try:
         return MongoClient(host, port)
     except Exception:
         print_exception_info()
         return None
示例#16
0
 def get_search_dict(self, key_id):
     """Fetch one document from ``crawler.search`` by its ``_id``.

     Returns whatever ``find_one({'_id': key_id})`` returns. If the lookup
     raises, the error is passed to ``print_exception_info`` and the
     method returns ``None``.
     """
     query = {'_id': key_id}
     try:
         return self.mongo.crawler.search.find_one(query)
     except Exception as exc:
         # NOTE(review): most call sites invoke print_exception_info()
         # without an argument; confirm it accepts the exception here.
         print_exception_info(exc)
示例#17
0
def get_json(res):
    """Return the result of ``res.json()``.

    If decoding raises, the error is reported via
    ``print_exception_info()`` and ``None`` is returned.
    """
    try:
        return res.json()
    except Exception:
        print_exception_info()
示例#18
0
 def get_scrap_id(self, crw_id):
     """Placeholder with no implementation yet; always returns ``None``.

     ``crw_id`` is accepted but not used.
     """
     # TODO: implement the lookup; the body is currently empty.
     return None