def get_scrap_dict(self, scrap_id):
    """Fetch one scrap document by its Mongo ``_id``.

    Returns the document dict, None when no document matches, or
    None after logging if the query raises.
    """
    try:
        return self.mongo.crawler.scrap_data.find_one({'_id': scrap_id})
    except Exception as e:
        print_exception_info()
def get_res(url, headers=None, proxy=None):
    """GET *url* and return the Response only when the status is 200.

    Any request failure is logged and yields None; a non-200 status
    also yields None.
    """
    try:
        response = requests.get(url, headers=headers, proxies=proxy)
    except Exception as e:
        print_exception_info()
        return None
    if response.status_code == 200:
        return response
def list_parser(list_data):
    """Yield the videoId of every item in a search-list API response.

    Stops silently (after logging) on any malformed item.
    """
    try:
        for item in list_data.get('items') or []:
            yield item.get('id').get('videoId')
    except Exception as e:
        print_exception_info()
def get_data(url, headers=None, proxy=None):
    """Fetch *url* and return its decoded JSON payload, or None on failure."""
    try:
        response = get_res(url, headers, proxy)
        if response:
            return get_json(response)
    except Exception as e:
        print_exception_info()
def get_ptoken(list_data):
    """Return the ``nextPageToken`` from *list_data*, or '' when absent."""
    try:
        return list_data.get('nextPageToken') or ''
    except Exception as e:
        print_exception_info()
def get_search(self):
    """Look up the search document for this crawler's keyword id.

    Returns the document dict, or None when the keyword has no
    search entry (or on error).
    """
    try:
        key = self.get_keyword_id()
        smongo = SearchMongo()
        result = smongo.get_search_dict(key)
        smongo.close()
        return result or None
    except Exception as e:
        print_exception_info()
def get_last_id(self):
    """Return the integer ``_id`` of the last scrap document, or 0.

    Iterates the cursor in natural order (same order the original
    list comprehension produced) and keeps only the final document,
    instead of materializing the whole collection in memory.
    Returns 0 for an empty collection; None after logging on error.
    """
    try:
        last_doc = None
        # Stream the cursor: only the last document is ever needed,
        # so building a full list of every document is wasted memory.
        for doc in self.mongo.crawler.scrap_data.find():
            last_doc = doc
        return int(last_doc['_id']) if last_doc is not None else 0
    except Exception as e:
        print_exception_info()
def insert_scrap_dict(self):
    """Insert ``self.scrap``, replacing any existing document with the same url.

    Prints '[ok]' on success; logs the exception otherwise.
    """
    try:
        # BUG FIX: the old code called get_scrap_dict({'url': self.url}),
        # but get_scrap_dict queries {'_id': scrap_id}, so the lookup became
        # {'_id': {'url': ...}} and never matched — duplicates were never
        # deleted. Query the url field directly, mirroring delete_scrap_dict.
        existing = self.mongo.crawler.scrap_data.find_one({'url': self.url})
        if existing:
            self.delete_scrap_dict()
        self.mongo.crawler.scrap_data.insert_one(self.scrap)
    except Exception as e:
        print_exception_info()
    else:
        print('[ok]')
def get_keyword_id(self):
    """Return the ``key_id`` of this crawler's document, or 0 when missing."""
    try:
        cmongo = CrawlerMongo()
        crawl = cmongo.get_crawler_dict(self.crw_id)
        cmongo.close()
        if not crawl:
            return 0
        return crawl['key_id']
    except Exception as e:
        print_exception_info()
def reply_parser(reply_data):
    """Yield ``Reply.reply`` payloads for each top-level comment.

    Numbers replies sequentially from 1; nested replies (when present)
    are parsed via YoutubeParser.rereply_parser. Stops after logging
    on any malformed item.
    """
    try:
        for seq, item in enumerate(reply_data.get('items') or [], start=1):
            reply = Reply()
            snippet = item['snippet']['topLevelComment']['snippet']
            reply.id = seq
            reply.username = snippet['authorDisplayName']
            reply.text = snippet['textDisplay']
            nested = item.get('replies')
            if nested:
                reply.rreply = list(YoutubeParser.rereply_parser(nested))
            yield reply.reply
    except Exception as e:
        print_exception_info()
def tag_crawl(self, scrap_data):
    """Crawl videos day-by-day from ``end_date`` back to ``start_date``.

    *scrap_data* is a dict of callables: ``'list'(date)`` -> video ids,
    ``'body'(vid)`` -> scrap body, ``'reply'(vid)`` -> replies.
    Each crawled body is stored via ScrapMongo with a fresh sequential id.
    Exceptions are logged; nothing is returned.
    """
    print('tag crawl')
    try:
        smongo = ScrapMongo()
        try:
            scrap_id = smongo.get_last_id()
            std_date = self.end_date
            while std_date >= self.start_date:
                for vid in scrap_data['list'](std_date):
                    print(base_urls[self.platform] + vid)
                    # id advances for every video, even ones with no body,
                    # matching the original numbering behavior
                    scrap_id += 1
                    body = scrap_data['body'](vid)
                    if body:
                        scrap = body
                        scrap.id = scrap_id
                        scrap.reply = scrap_data['reply'](vid)
                        smongo.scrap = scrap.scrap
                        smongo.insert_scrap_dict()
                # step one day back toward start_date
                std_date = date_timedelta(std_date, days=-1)
        finally:
            # FIX: the original skipped close() whenever the loop raised,
            # leaking the Mongo client; always close it.
            smongo.close()
    except Exception as e:
        print_exception_info()
def body_parser(body_data):
    """Build a ScrapData from a YouTube videos-API response.

    Returns the populated ScrapData, or None when the response has no
    items or any expected field is missing (logged).
    """
    scrap = ScrapData()
    video_url = 'https://www.youtube.com/watch?v='
    try:
        items = body_data.get('items')
        if not items:
            return None
        first = items[0]
        snippet = first['snippet']
        stats = first['statistics']
        scrap.title = snippet['title']
        scrap.userid = snippet['channelId']
        scrap.username = snippet['channelTitle']
        scrap.text = snippet['description']
        scrap.url = video_url + first['id']
        scrap.view = int(stats['viewCount'])
        like_count = stats.get('likeCount')
        scrap.like = int(like_count) if like_count else 0
        scrap.profile = snippet['thumbnails']['default']['url']
        # split the ISO timestamp via the module-level regex `p`
        matched = p.search(snippet['publishedAt'])
        date_str = matched.group(1) + ' ' + matched.group(2)
        scrap.publishedat = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
        return scrap
    except Exception as e:
        print_exception_info()
def insert_search_dict(self):
    """Insert ``self.search`` into the crawler.search collection."""
    try:
        self.mongo.crawler.search.insert_one(self.search)
    except Exception as e:
        # CONSISTENCY FIX: every other handler in this file calls
        # print_exception_info() with no argument; passing `e` here would
        # raise TypeError inside the handler if the helper takes no params.
        print_exception_info()
def delete_scrap_dict(self):
    """Remove the scrap document whose url matches ``self.url``."""
    try:
        query = {'url': self.url}
        self.mongo.crawler.scrap_data.delete_one(query)
    except Exception as e:
        print_exception_info()
def connect(self):
    """Open and return a MongoClient to the local MongoDB instance."""
    try:
        client = MongoClient('localhost', 27017)
    except Exception as e:
        print_exception_info()
    else:
        return client
def get_search_dict(self, key_id):
    """Fetch the search document whose ``_id`` equals *key_id*, or None."""
    try:
        return self.mongo.crawler.search.find_one({'_id': key_id})
    except Exception as e:
        # CONSISTENCY FIX: align with the file-wide convention of calling
        # print_exception_info() without arguments (the old `(e)` form
        # would TypeError if the helper takes no params).
        print_exception_info()
def get_json(res):
    """Decode and return the JSON body of *res*; None (logged) on failure."""
    try:
        return res.json()
    except Exception as e:
        print_exception_info()
def get_scrap_id(self, crw_id):
    # TODO: unimplemented stub — the body is `pass`, so this always
    # returns None. Presumably meant to look up scrap id(s) for the
    # given crawler id (cf. get_keyword_id / get_last_id) — confirm
    # intent before implementing.
    try:
        pass
    except Exception as e:
        print_exception_info()