def get_text_by_base_url(self):
    # Respect robots.txt (reppy's RobotsCache) before crawling.
    robots = RobotsCache(capacity=100)
    if not robots.allowed(self.base_url, "python-requests"):
        return ["Crawling this site is not allowed by robots.txt"]
    text_list = []
    for slug in self.__get_links_by_url_depth():
        sleep(0.5)  # throttle requests between pages
        text_list.append(
            remove_emoji(
                remove_url(self.__get_text_by_url(self.base_url + slug))).strip())
    return text_list

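# get_text_by_base_url() (and the other snippets below) depend on text-cleaning
# helpers defined elsewhere. A minimal sketch of remove_emoji() and remove_url();
# the regex ranges and constant names are assumptions for illustration, not the
# project's actual implementation (which, in the scrapers further down, also
# appears to accept byte strings).
import re

EMOJI_PATTERN = re.compile(
    "["
    "\U0001F300-\U0001FAFF"  # symbols, pictographs, emoticons, transport
    "\U00002700-\U000027BF"  # dingbats
    "\U0001F1E6-\U0001F1FF"  # regional indicator (flag) pairs
    "]+",
    flags=re.UNICODE)
URL_PATTERN = re.compile(r"https?://\S+")


def remove_emoji(text):
    """Strip emoji code points from a string."""
    return EMOJI_PATTERN.sub("", text)


def remove_url(text):
    """Strip http/https URLs from a string."""
    return URL_PATTERN.sub("", text)
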
def get_tweets(self):
    all_tweets = []
    try:
        # Page through the user's timeline, skipping retweets and replies.
        for new_tweet in self.__limit_handled(
                tweepy.Cursor(self.api.user_timeline,
                              screen_name=SCREEN_NAME,
                              include_rts=False,
                              exclude_replies=True).items()):
            all_tweets.append(new_tweet)
    except RuntimeError as e:
        # RuntimeError: generator raised StopIteration (PEP 479)
        print(e)
    print("Tweet Num {}".format(len(all_tweets)))
    return [remove_emoji(x.text) for x in all_tweets]

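# get_tweets() relies on a private __limit_handled() generator defined elsewhere
# in the class. A sketch along the lines of the classic tweepy 3.x rate-limit
# wrapper (the 15-minute sleep is an assumption). Once the cursor is exhausted,
# its StopIteration propagates out of the generator and, under PEP 479, surfaces
# as the RuntimeError that get_tweets() catches above.
import time

import tweepy


def __limit_handled(self, cursor):
    while True:
        try:
            yield cursor.next()
        except tweepy.RateLimitError:
            # Wait out Twitter's 15-minute rate-limit window before resuming.
            time.sleep(15 * 60)
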
def parse_company_page(content=None, proxy=None):
    logger = utils.get_logger()
    document = etree.HTML(content.decode('utf-8'))
    company_page_url = document.xpath('//p[@class="company-name-t"]/a/@href')
    if company_page_url:
        company_page_url = company_page_url[0]
        company_page = download_page(url=company_page_url, method='get', proxy=proxy)
        logger.info('parse_company_page fetched company page %s ' % company_page_url)
        if company_page['code'] == 0:
            try:
                company_page_ = utils.remove_emoji(
                    company_page['data']).decode('utf-8', 'ignore')
                return company_page_
            except Exception as e:
                logger.error('failed to decode company page %s ' % company_page_url)
    else:
        logger.error('parse_company_page found no company detail page url')
    return ''

def group_tiplines_by_language(
        tip_line_requests,
        languages=['en', 'pt', 'hi', 'mr', 'bn', 'ta', 'te', 'ml']):
    for tip in tip_line_requests:
        # Prefer the (longer) media text over the media title when both exist.
        tip['text'] = remove_emoji(
            tip['media_text'] if tip['media_text'] != 'NA'
            and len(tip['media_text']) >= len(tip['media_title'])
            else tip['media_title'])
        lang_data = cld3.get_language(tip['text'])
        if lang_data is not None and lang_data.probability >= 0.95:
            tip['language'] = lang_data.language
    # Keep only confidently detected, medium-length texts without URLs or phone numbers.
    tip_line_requests = [
        tip for tip in tip_line_requests
        if tip['text'] != 'NA' and not tip['text'].isspace()
        and 'language' in tip and (60 <= len(tip['text']) <= 1200)
        and not contains_url(tip['text'])
        and not contains_phone_number(tip['text'])
    ]
    return [{
        'text': item['text'],
        'language': item['language'],
        'source': SourceName.TIPLINE.value
    } for item in tip_line_requests if item['language'] in languages]

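# A usage sketch for group_tiplines_by_language(). The input field names
# (media_text, media_title) follow what the function reads; the sample values
# are made up. Only texts of 60 to 1200 characters with a confident language
# detection, and without URLs or phone numbers, survive the filtering.
sample_tips = [
    {'media_text': 'NA',
     'media_title': 'A forwarded claim that is long enough to pass the sixty character minimum filter.'},
]
grouped = group_tiplines_by_language(sample_tips, languages=['en', 'hi'])
# Each surviving item looks like:
# {'text': ..., 'language': ..., 'source': SourceName.TIPLINE.value}
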
def get_trends_tokyo(self):
    return [
        remove_emoji(x["name"])
        for x in self.api.trends_place(1118285)[0]["trends"]
    ]

def save_db(page_index=None, page_num=None, page_content=None, page_url=None,
            city_name=None, jd_layout_time=None, data=None, company_page=None,
            proxy=None, pay_data=None):
    """Save the raw page to the database."""
    logger = utils.get_logger()
    flg = False
    try:
        # url = data['url']
        city_url = data['cityUrl']
        # data['url'] = ''
        # cityUrl is not stored in the raw table; it is restored below.
        data['cityUrl'] = ''
        track_id = str(uuid.uuid4())
        if proxy:
            proxy = proxy['http'].replace('http://', '')
        page_content = utils.remove_emoji(page_content).decode('utf8', 'ignore')
        if not pay_data:
            pay_data = {}
        pay_data['content'] = page_content
        sql = ('INSERT INTO jd_raw(trackId,source,content,createTime,createBy,ip,'
               'pageNum,pageIndex,pageUrl,jobCity,searchConditions,jdLayoutTime)'
               'VALUES(%s,"ZHI_LIAN",%s,now(),"python",%s,%s,%s,%s,%s,%s,%s)')
        sql_val = [
            track_id,  # use the same trackId for the SQL row and the Kafka message
            json.dumps(pay_data, ensure_ascii=False),
            proxy,
            page_num,
            page_index,
            page_url,
            city_name.decode('utf-8', 'ignore'),
            json.dumps(data, ensure_ascii=False),
            jd_layout_time
        ]
        # data['url'] = url
        data['cityUrl'] = city_url
        kafka_data = {
            "channelType": "WEB",
            "content": {
                "trackId": track_id,
                "content": json.dumps(pay_data, ensure_ascii=False),
                "id": '',
                "createTime": int(time.time() * 1000),
                "createBy": "python",
                "ip": proxy,
                "phoneUrl": '',
                "ocrImg": '',
                "jdLayoutTime": jd_layout_time,
                "pageUrl": page_url,
                "pageNum": page_num,
                "pageIndex": page_index,
                "jobCity": city_name.decode('utf-8', 'ignore'),
                "companyImgs": '',
                "source": "ZHI_LIAN",
                "searchConditions": json.dumps(data, ensure_ascii=False),
                "contactInfo": company_page.decode('utf-8', 'ignore'),
            },
            "interfaceType": "PARSE",
            "resourceDataType": "RAW",
            "resourceType": "JD_SEARCH",
            'protocolType': 'HTTP',
            "source": "ZHI_LIAN",
            "trackId": track_id,
        }
        utils.save_data(sql, sql_val, kafka_data)
        logger.info('saved to database successfully')
        flg = True
    except Exception:
        logger.error('failed to save to database %s %s' % (page_url, traceback.format_exc()))
        flg = False
    return flg

def get_post_info(self):
    keyword = ''
    for i, word in enumerate(self.args.keyword):
        if i == 0:
            keyword += word
        else:
            keyword += "+" + word

    # When using the DB:
    # if self.args.is_db:
    #     row_id = self.db_model.set_daily_log(keyword, 1)

    start_url = "https://www.youtube.com/results?search_query="

    # search term setting
    term_dict = {
        'lasthour': 'AQ',
        'today': 'Ag',
        'thisweek': 'Aw',
        'thismonth': 'BA',
        'thisyear': 'BQ'
    }

    # Build the search URL from the keyword and the period filter.
    try:
        start_url = start_url + keyword + f'&sp=EgII{term_dict[self.args.choose_period]}%253D%253D'
    except KeyError:  # an unknown period raises KeyError
        print('choose lasthour, today, thisweek, thismonth, thisyear')

    chrome_options = wd.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = wd.Chrome(
        executable_path="/Users/oldman/Documents/crawler/chromedriver",
        chrome_options=chrome_options)
    driver.get(start_url)

    # Initial page height for the infinite-scroll loop.
    last_page_height = driver.execute_script(
        "return document.documentElement.scrollHeight")

    # Scroll to the bottom; stop when the page height stops changing
    # or the configured scroll count (from argparse) runs out.
    while True:
        self.args.post_scroll_num -= 1
        driver.execute_script(
            "window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(3)  # scroll down to bottom
        new_page_height = driver.execute_script(
            "return document.documentElement.scrollHeight")
        if new_page_height == last_page_height:
            break
        elif self.args.post_scroll_num == 0:
            break
        last_page_height = new_page_height
    print('Post url crawling complete.')

    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')

    for title in soup.select("a#video-title"):
        unique_id = title.get('href')[9:]
        driver.get("https://www.youtube.com/" + title.get('href'))
        print("https://www.youtube.com/" + title.get('href'))
        last_page_height = driver.execute_script(
            "return document.documentElement.scrollHeight")

        # RDBMS log stack
        body_is_new = {
            'is_new': False,
            'last_time_update': '1970-01-01 00:00:00'
        }

        # Comment scroll, bounded by the configured scroll count.
        while True:
            self.args.comment_scroll_num -= 1
            driver.execute_script(
                "window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(3.5)
            new_page_height = driver.execute_script(
                "return document.documentElement.scrollHeight")
            if new_page_height == last_page_height:
                break
            elif self.args.comment_scroll_num == 0:
                break
            last_page_height = new_page_height
        print('Comment scroll complete.')

        html_source = driver.page_source
        soup = BeautifulSoup(html_source, 'lxml')

        # Check whether the clip is a live stream.
        find_live = soup.select('div#date > yt-formatted-string')
        # Check whether the clip is a waiting Premiere.
        find_wait = soup.select('div#count > yt-view-count-renderer > span')
        try:
            live_tmp = str(find_live[0].text).replace(
                '\n', '').replace('\t', '').replace(' ', '')
        except IndexError:
            live_tmp = 'No streaming'
        try:
            wait_tmp = str(find_wait[0].text).replace(
                '\n', '').replace('\t', '').replace(' ', '')
        except IndexError:
            wait_tmp = 'No Premieres'

        live_comp = re.compile('Started streaming')
        wait_comp = re.compile('waiting')
        live_match = live_comp.match(live_tmp)
        wait_match = wait_comp.match(wait_tmp)

        if live_match:
            print('This is streaming')  # live clip
        elif wait_match:
            print('This is Premieres')  # waiting (Premiere) clip
        else:
            # A regular clip: collect its description text.
            soup = BeautifulSoup(html_source, 'lxml')
            clip_content = soup.select('#description > yt-formatted-string')
            contents = ''
            for content in clip_content:
                str_tmp = utils.remove_emoji(
                    str(content.text).replace('\n', '').replace(
                        '\t', '').replace(' ', ''))
                contents += ' ' + str_tmp

            # Number of comments found on the page.
            print('comment_num is: ', len(soup.select('#author-text > span')))

            try:
                # The two toggle buttons carry the like / dislike counts in aria-label.
                like_count = utils.remove_str(
                    soup.find_all(
                        "yt-formatted-string",
                        {"class": "style-scope ytd-toggle-button-renderer style-text"}
                    )[0]["aria-label"])
            except (IndexError, KeyError):
                like_count = 0
            try:
                dislike_count = utils.remove_str(
                    soup.find_all(
                        "yt-formatted-string",
                        {"class": "style-scope ytd-toggle-button-renderer style-text"}
                    )[1]["aria-label"])
            except (IndexError, KeyError):
                dislike_count = 0

            try:
                posting_date = datetime.strptime(
                    soup.select('#date > yt-formatted-string')[0].text,
                    "%Y-%m-%d").strftime("%Y-%m-%d %H:%M:%S")
            except ValueError:
                # Relative dates fall back to the project's converter.
                posting_date = utils.conv_date2(
                    soup.select('#date > yt-formatted-string')[0].text)

            # '조회수' / '회' are the Korean "views" label and counter suffix.
            view_count = utils.remove_str(
                soup.select(
                    '#count > ytd-video-view-count-renderer > span.view-count.style-scope.ytd-video-view-count-renderer'
                )[0].text.replace('\n', '').replace('\t', '').replace(
                    '조회수 ', '').replace('회', ''))
            view_comp = re.compile('대기 중')  # "waiting" (scheduled Premiere)
            view_match = view_comp.match(str(view_count))
            if view_count == '조회수 없음':  # "no views"
                view_count = 0
            elif view_match:
                continue

            # Post information.
            post_dict = {
                "unique_id": unique_id,
                "keyword": keyword,
                "title": utils.addslashes(
                    soup.select('#container > h1 > yt-formatted-string')[0]
                    .text.replace('\n', '').replace('\t', '').replace(' ', '')),
                "user_id": 0,
                "user_name": utils.addslashes(
                    soup.select('#text > a')[0].text.replace(
                        '\n', '').replace('\t', '').replace(' ', '')),
                "posting_date": posting_date,
                "view_count": view_count,
                "like_count": utils.conv_digit(like_count),
                "dislike_count": utils.conv_digit(dislike_count),
                "contents": utils.addslashes(contents),
                "user_follow": 0,
                "user_follower": utils.conv_digit(
                    soup.select('yt-formatted-string#owner-sub-count')[0].text),
                "user_medias": 0,
                "comment_count": len(soup.select('#author-text > span'))
            }
            # if self.args.is_db:
            #     body_is_new = self.db_model.set_data_body(1, post_dict)
            self.post_list.append(post_dict)

            # Comment information.
            for i in range(len(soup.select('#author-text > span'))):
                try:
                    comment_like = utils.remove_str(
                        soup.find_all(
                            "span",
                            {"class": "style-scope ytd-comment-action-buttons-renderer"}
                        )[i]["aria-label"])
                except (IndexError, KeyError):
                    comment_like = 0
                try:
                    comment_dict = {
                        "unique_id": unique_id,
                        "keyword": keyword,
                        "user_name": utils.addslashes(
                            soup.select('#author-text > span')[i].text.replace(
                                '\n', '').replace('\t', '').replace(' ', '')),
                        "comment_date": utils.conv_date(
                            soup.select('#header-author > yt-formatted-string > a')
                            [i].text).strftime("%Y-%m-%d %H:%M:%S"),
                        "comment": utils.addslashes(
                            soup.select('#content-text')[i].text.replace(
                                '\n', '').replace('\t', '').replace(' ', '')),
                        "comment_like": utils.conv_digit(comment_like)
                    }
                except IndexError:
                    comment_dict = {
                        "unique_id": unique_id,
                        "keyword": keyword,
                        "user_name": utils.addslashes(
                            soup.select('#author-text > span')[i].text.replace(
                                '\n ', '').replace('\t', '').replace(' ', '')),
                        "comment_date": utils.conv_date(
                            soup.select('#header-author > yt-formatted-string > a')
                            [i].text).strftime("%Y-%m-%d %H:%M:%S"),
                        "comment": utils.addslashes(
                            soup.select('#content-text')[i].text.replace(
                                '\n ', '').replace('\t', '').replace(' ', '')),
                        "comment_like": utils.conv_digit(comment_like)
                    }
                # Enable when using an RDBMS:
                # if self.args.is_db:
                #     self.db_model.set_data_comment(1, comment_dict, body_is_new['is_new'],
                #                                    body_is_new['last_time_update'])
                self.comment_list.append(comment_dict)

    driver.quit()
    # if self.args.is_db:
    #     self.db_model.set_daily_log('', '', row_id)
    print("Done")
    print(f'Crawled post num: {len(self.post_list)}\n'
          f'Crawled comment num: {len(self.comment_list)}')
    return self.post_list, self.comment_list

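# get_post_info() reads several attributes from self.args. A sketch of the
# argparse setup it appears to assume; the flag names are inferred from the
# attribute accesses above and the defaults are illustrative only.
import argparse

parser = argparse.ArgumentParser(description='YouTube search crawler')
parser.add_argument('--keyword', nargs='+', required=True,
                    help='search keywords, joined with "+" in the query URL')
parser.add_argument('--choose_period', default='thisweek',
                    choices=['lasthour', 'today', 'thisweek', 'thismonth', 'thisyear'])
parser.add_argument('--post_scroll_num', type=int, default=5,
                    help='max scrolls on the search-result page')
parser.add_argument('--comment_scroll_num', type=int, default=5,
                    help='max scrolls on each video page')
parser.add_argument('--is_db', action='store_true',
                    help='enable the commented-out RDBMS logging')
args = parser.parse_args()
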
def save_db(page_content=None, proxy=None, city_name=None, phone_url=None,
            jd_layout_time=None, page_url=None, company_img=None, page_num=None,
            page_index=None, post_data=None, pay_data=None):
    """Save the raw page to the database and forward it to Kafka."""
    logger = utils.get_logger()
    try:
        # print post_data
        city_url = post_data['cityUrl']
        func_url = post_data['funcUrl']
        # cityUrl and funcUrl are not stored in the raw table; they are restored below.
        post_data['cityUrl'] = ''
        post_data['funcUrl'] = ''
        page_content = utils.remove_emoji(page_content).decode('utf8', 'ignore')
        pay_data['content'] = page_content
        track_id = str(uuid.uuid4())
        sql = ('insert into jd_raw(trackId,source,content,createTime,createBy,ip,'
               'pageNum,pageIndex,pageUrl,jobCity,jdLayoutTime,ocrImg,searchConditions)'
               'values(%s,"GJ_HR",%s,now(),"python",%s,%s,%s,%s,%s,%s,%s,%s)')
        # search_conditions = {'post_count': post_data['post_count'],
        #                      'resume_count': post_data['resume_count'], '': post_data}
        sql_val = [
            track_id,
            json.dumps(pay_data, ensure_ascii=False),
            proxy["http"].replace("http://", ""),
            page_num,
            page_index,
            page_url,
            city_name,
            jd_layout_time,
            phone_url,
            json.dumps(post_data, ensure_ascii=False)
        ]
        post_data['cityUrl'] = city_url
        post_data['funcUrl'] = func_url
        kafka_data = {
            "channelType": "WEB",
            "content": {
                "trackId": track_id,
                "content": json.dumps(pay_data, ensure_ascii=False),
                "id": '',
                "createTime": int(time.time() * 1000),
                "createBy": "python",
                "ip": proxy["http"].replace("http://", ""),
                "ocrImg": phone_url,
                "jdLayoutTime": jd_layout_time,
                "pageUrl": page_url,
                "pageNum": page_num,
                "pageIndex": page_index,
                "companyImgs": company_img,
                "source": "GJ_HR",
                "jobCity": city_name,
                "searchConditions": json.dumps(post_data, ensure_ascii=False)
            },
            "interfaceType": "PARSE",
            "resourceDataType": "RAW",
            "resourceType": "JD_SEARCH",
            'protocolType': 'HTTP',
            "source": "GJ_HR",
            "trackId": track_id,
        }
        flg = utils.save_data(sql, sql_val, kafka_data)
        logger.info('saved to database successfully %s ', str(flg))
    except Exception as e:
        logger.error(traceback.format_exc())

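# Both save_db() variants delegate persistence to utils.save_data(sql, sql_val,
# kafka_data). A sketch of what such a helper could look like with pymysql and
# kafka-python; the connection settings, topic name and return value are
# assumptions, not the project's actual implementation.
import json

import pymysql
from kafka import KafkaProducer


def save_data(sql, sql_val, kafka_data,
              mysql_conf=None, kafka_servers='localhost:9092', topic='jd_raw'):
    """Insert the raw row into MySQL and publish the same record to Kafka."""
    mysql_conf = mysql_conf or {'host': 'localhost', 'user': 'root',
                                'password': '', 'db': 'spider', 'charset': 'utf8mb4'}
    conn = pymysql.connect(**mysql_conf)
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, sql_val)
        conn.commit()
    finally:
        conn.close()
    producer = KafkaProducer(
        bootstrap_servers=kafka_servers,
        value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'))
    producer.send(topic, kafka_data)
    producer.flush()
    return True
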