class PttIpAsnCrawler(object):
    """Resolves IP addresses to their ASN/WHOIS records and stores them.

    The IPs come either from the existing ``IpAsn`` table (when the
    ``database`` argument is truthy) or from a comma-separated ``ip_list``
    command-line argument.
    """

    def __init__(self, arguments: Dict):
        # When reading IPs from the DB, the explicit ip_list is ignored.
        self.db_input = arguments['database'] or False
        self.ip_list = ('' if self.db_input else arguments['ip_list'])
        config_path = (arguments['config_path'] or 'config.ini')
        self.config = load_config(config_path)
        self.database_config = self.config['Database']
        self._init_database()
        if arguments['verbose']:
            logging.getLogger().setLevel(logging.DEBUG)

    def _init_database(self):
        # Open a session against the DB described by the [Database] section.
        self.db = PttDatabase(dbtype=self.database_config['Type'],
                              dbname=self.database_config['Name'])
        self.db_session = self.db.get_session()

    def _get_ip_list(self) -> List[str]:
        """Return the IPs to resolve: every IpAsn row, or the CLI list."""
        if self.db_input:
            return [str(ipasn.ip)
                    for ipasn in self.db_session.query(IpAsn)
                                                .order_by(IpAsn.asn)
                                                .all()]
        return self.ip_list.split(',')

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, str]]):
        """Upsert one batch of IP/ASN records into the IpAsn table."""
        self.db.bulk_update(self.db_session, IpAsn, result)

    @log()
    def crawling(self):
        """Look up the ASN record of every IP, flushing to DB in batches of 100."""
        BATCH_SIZE = 100
        ip_result = []
        for ip in self._get_ip_list():
            if ip:
                net = Net(ip)
                obj = IPASN(net)
                result = {'ip': ip}
                result.update(obj.lookup())
                # asn_date may be empty/None for unannounced prefixes —
                # only convert it when present instead of crashing the run.
                if result.get('asn_date'):
                    result['asn_date'] = datetime.strptime(
                        result['asn_date'], '%Y-%m-%d')
                ip_result.append(result)
            # BUGFIX: the original checked `len(ip_result) % 100 == 0`,
            # which is also true for an EMPTY batch (at the start, or right
            # after a blank ip entry) and flushed empty lists to the DB.
            if len(ip_result) >= BATCH_SIZE:
                self._output_database(ip_result)
                ip_result = []
        # Flush the final partial batch, if any.
        if ip_result:
            self._output_database(ip_result)
class QueryHelper(object):
    """Exports per-board statistics of Taiwanese vs non-Taiwanese post IPs.

    Counts articles and pushes whose IP's ASN country code is 'TW', and
    writes the result to the console, an .ods file or a .csv file
    depending on ``arguments['format']``.
    """

    def __init__(self, arguments: Dict[str, str]):
        config_path = (arguments['config_path']
                       if arguments['config_path'] else 'config.ini')
        self.start_date, self.end_date = arguments['date_range']
        self.board_name = arguments['board_name']
        self.file_format = arguments['format']
        self.config = load_config(config_path)
        self.output_folder = arguments['output_folder']
        self.output_prefix = arguments['output_prefix']
        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()

    @log()
    def _get_export_rows(self):
        """Build the export table: header row + one Article row + one Push row."""
        rows = [['Type', 'Board', 'Start date', 'End date', 'TW Ip', 'Not TW Ip']]
        # Label each joined row True/False on whether the ASN country is TW.
        tw_ip_label = case(value=IpAsn.asn_country_code,
                           whens={'TW': True},
                           else_=False).label("TW_IP")
        article_res = self.db_session.query(Article, ArticleHistory, tw_ip_label) \
            .join(ArticleHistory, ArticleHistory.article_id == Article.id) \
            .join(Board, Board.id == Article.board_id) \
            .order_by(ArticleHistory.id) \
            .group_by(Article.id) \
            .join(IpAsn, IpAsn.ip == Article.post_ip) \
            .filter(Board.name == self.board_name).all()
        # Keep explicit ==True/==False: the DB may return 1/0 for the label.
        article_tw_ip = sum(1 for _, _, tw_ip in article_res if tw_ip == True)
        article_not_tw_ip = sum(1 for _, _, tw_ip in article_res if tw_ip == False)
        rows.append(['Article', self.board_name,
                     str(self.start_date or ''), str(self.end_date or ''),
                     article_tw_ip or '0', article_not_tw_ip or '0'])
        # Pushes are attached to the article histories found above.
        article_history_id_list = [history.id for _, history, _ in article_res]
        push_res = self.db_session.query(Push, tw_ip_label) \
            .join(IpAsn, IpAsn.ip == Push.push_ip) \
            .filter(Push.article_history_id.in_(article_history_id_list)).all()
        push_tw_ip = sum(1 for _, tw_ip in push_res if tw_ip == True)
        push_not_tw_ip = sum(1 for _, tw_ip in push_res if tw_ip == False)
        rows.append(['Push', self.board_name,
                     str(self.start_date or ''), str(self.end_date or ''),
                     push_tw_ip or '0', push_not_tw_ip or '0'])
        return rows

    def _print_rows(self):
        """Print the export table to stdout with a separator under the header."""
        data = self._get_export_rows()
        for idx, row in enumerate(data):
            print('{:8} | {:16} | {:20} | {:20} | {:5} | {:8}'.format(
                *map(str, row)))
            if idx == 0:
                print(
                    '---------+------------------+----------------------+----------------------+-------+----------')

    def _export_ods(self):
        """Write the export table to Ptt_query_<date>.ods in the output folder."""
        data = {'Query': self._get_export_rows()}
        output_filename = 'Ptt_query_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))
        # BUGFIX: the format string had lost its '{filename}' placeholder,
        # so output_filename was never used and a literal name was written.
        output_path = os.path.join(
            self.output_folder,
            '{filename}.ods'.format(filename=output_filename))
        save_data(output_path, data)

    def _export_csv(self):
        """Write the export table to Ptt_query_<date>.csv in the output folder."""
        data = self._get_export_rows()
        output_filename = 'Ptt_query_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))
        # BUGFIX: restored the '{filename}' placeholder (see _export_ods).
        csv_path = os.path.join(
            self.output_folder,
            '{filename}.csv'.format(filename=output_filename))
        # newline='' is required by the csv module to avoid blank lines on
        # Windows (see csv docs).
        with open(csv_path, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',')
            for row in data:
                csvwriter.writerow(row)

    def go(self):
        """Dispatch to the output method selected by the 'format' argument."""
        if self.file_format == 'console':
            self._print_rows()
        elif self.file_format == 'ods':
            self._export_ods()
        elif self.file_format == 'csv':
            self._export_csv()
class PttUserCrawler(object):
    """Crawls PTT user profiles (login times, valid articles, last login
    time/IP) through the term.ptt.cc web terminal driven by Selenium."""

    PTT_WEB_URL = 'http://term.ptt.cc/'

    def __init__(self, arguments: Dict):
        # When reading usernames from the DB, the explicit id list is ignored.
        self.db_input = arguments['database'] or False
        self.id_list = ('' if self.db_input else arguments['id'])
        config_path = (arguments['config_path'] or 'config.ini')
        self._init_config(config_path)
        self._init_database()
        self._init_browser()
        # Dedicated logger so raw terminal buffers can be filtered separately.
        self.ptt_browser_buffer_logger = logging.getLogger(__name__ + '.log')
        self.json_prefix = arguments['json_prefix']
        self.debug_mode = arguments['debug_mode']
        if arguments['verbose']:
            logging.getLogger().setLevel(logging.DEBUG)

    def _init_config(self, config_path: str):
        """Load the config and derive output targets from [PttUser] Output
        ('both' / 'database' / 'json'; anything else disables both)."""
        self.config = load_config(config_path)
        output = self.config['PttUser']['Output']
        self.json_output = output in ('both', 'json')
        self.database_output = output in ('both', 'database')

    def _init_database(self):
        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()

    def _init_browser(self):
        """Pick the bundled chromedriver for the current OS; run headless."""
        if sys.platform.startswith('linux'):
            platform = 'linux'
            exe_filename = 'chromedriver'
        elif sys.platform.startswith('win'):
            platform = 'windows'
            exe_filename = 'chromedriver.exe'
        else:
            platform = 'mac'
            exe_filename = 'chromedriver'
        self.webdriver_path = os.path.join(
            self.config['PttUser']['WebdriverFolder'], platform, exe_filename)
        self.chrome_options = ChromeOptions()
        self.chrome_options.add_argument('--headless')

    def _get_id_list(self) -> List[str]:
        """Usernames to crawl: every DB user (least login_times first) or the CLI list."""
        if self.db_input:
            return [user.username
                    for user in self.db_session.query(User)
                                               .order_by(User.login_times,
                                                         User.id).all()]
        return self.id_list.split(',')

    def _output_json(self, result: Dict[str, object], count):
        """Dump one batch of crawled users to <prefix>user_<count>.json."""
        json_path = '{prefix}user_{count}.json'.format(prefix=self.json_prefix,
                                                       count=count)
        with open(json_path, 'w') as jsonfile:
            json.dump(result, jsonfile, sort_keys=True, indent=4)

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, object]]):
        """Upsert each user, append a UserLastRecord and a placeholder IpAsn
        row (to be resolved later by PttIpAsnCrawler)."""
        for record in result:
            user, is_new_user = self.db.get_or_create(
                self.db_session, User, {'username': record['username']}, {
                    'username': record['username'],
                    'login_times': int(record['login_times']),
                    'valid_article_count': int(record['valid_article_count'])
                })
            if not is_new_user:
                user.login_times = int(record['login_times'])
                user.valid_article_count = int(record['valid_article_count'])
                self.db_session.commit()
            # BUGFIX: was `datetime.datetime.strptime`, which raises
            # AttributeError under this module's `from datetime import
            # datetime` style (every other class here calls
            # datetime.strptime / datetime.now directly).
            last_login_datetime = datetime.strptime(
                record['last_login_datetime'], '%m/%d/%Y %H:%M:%S %a')
            last_record = UserLastRecord(
                last_login_datetime=last_login_datetime,
                last_login_ip=record['last_login_ip'])
            last_record.user_id = user.id
            if record['last_login_ip']:
                _, _ = self.db.get_or_create(self.db_session, IpAsn,
                                             {'ip': record['last_login_ip']}, {
                                                 'ip': record['last_login_ip'],
                                                 'asn': None,
                                                 'asn_cidr': None,
                                                 'asn_country_code': None,
                                                 'asn_date': None,
                                                 'asn_description': None,
                                                 'asn_raw': None,
                                                 'asn_registry': None
                                             })
            self.db_session.add(last_record)
            self.db_session.commit()

    def _output(self, result: Dict[str, object], count):
        """Send the batch to every enabled output target."""
        if self.json_output:
            self._output_json(result, count)
        if self.database_output:
            self._output_database(result)

    def _login_ptt(self, browser, userid, userpwd):
        """Log into term.ptt.cc and press through the duplicate-login /
        failed-attempt prompts until the main menu appears."""
        browser.connect(self.PTT_WEB_URL)
        # Ptt login
        browser.send_keys(userid)
        browser.send_keys(userpwd)
        # 踢掉重複登入 或 刪除密碼嘗試錯誤記錄
        buffer = browser.get_buffer()
        while u"主功能表" not in buffer:
            browser.send_keys('')
            buffer = browser.get_buffer()

    @log()
    def crawling(self):
        """Query each user via the terminal's Talk -> Query screen, flushing
        results every 100 users; retries users left in the queue after a
        disconnect (up to 3 disconnects)."""
        delaytime = float(self.config['PttUser']['Delaytime'])
        userid = self.config['PttUser']['UserId']
        userpwd = self.config['PttUser']['UserPwd']
        id_list = self._get_id_list()
        crawler_result = []
        with PttBrowser(self.webdriver_path,
                        self.chrome_options,
                        debug_mode=self.debug_mode) as browser:
            browser.ACT_DELAY_TIME = delaytime
            self._login_ptt(browser, userid, userpwd)
            # Go to Talk -> Query
            browser.send_keys('T')
            id_queue = id_list.copy()
            count = 1
            err_count = 0
            while len(id_queue) > 0:
                # BUGFIX: iterate a snapshot of the remaining QUEUE, not the
                # original id_list — after a reconnect the old code re-queried
                # users that were already done and then raised ValueError on
                # id_queue.remove() for the already-removed ids.
                for user_id in list(id_queue):
                    try:
                        browser.send_keys('Q').send_keys(user_id)
                        buffer = browser.get_buffer()
                        self.ptt_browser_buffer_logger.debug(
                            'Buffer:\n%s', buffer)
                        pattern = r"[\w\W]*《登入次數》(\d*)\D*次\D*《有效文章》\D*(\d*)[\w\W]*《上次上站》\D*([\d]{1,2}\/[\d]{1,2}\/[\d]{4}\W*[\d]{1,2}:\W*[\d]{1,2}:\W*[\d]{1,2}\W*\w*)\D*《上次故鄉》([\d.]*)"
                        pat = re.compile(pattern)
                        search_result = pat.match(buffer)
                        if search_result:
                            crawler_result.append({
                                'username': user_id,
                                'login_times': search_result.group(1),
                                'valid_article_count': search_result.group(2),
                                'last_login_datetime': search_result.group(3),
                                'last_login_ip': search_result.group(4)
                            })
                            # Flush a full batch of 100 records.
                            if len(crawler_result) % 100 == 0:
                                self._output(crawler_result, count)
                                count += 1
                                crawler_result = []
                        else:
                            logging.error('User "%s" has error', user_id)
                            self.ptt_browser_buffer_logger.error(
                                'Buffer:\n%s', buffer)
                        browser.send_keys('')
                        id_queue.remove(user_id)
                    except KeyboardInterrupt:
                        id_queue = []
                        break
                    except PttDisconnectException as e:
                        err_count += 1
                        if err_count == 3:
                            raise e
                        # Re-login and go back to Talk, then resume the loop.
                        browser.send_keys('')
                        self._login_ptt(browser, userid, userpwd)
                        browser.send_keys('T')
                        continue
        # Flush whatever is left in the final partial batch.
        self._output(crawler_result, count)
class PttArticleIndexCrawler(object):
    """Walks a board's index pages (…/index{n}.html) and stores the mapping
    article web_id -> page index in the ArticleIndex table.

    'before' mode fills pages older than what the DB already holds
    (1 .. min stored index); otherwise it crawls from the max stored index
    up to the live last page.
    """

    PTT_URL = 'https://www.ptt.cc'
    PTT_Board_Format = '/bbs/{board}/index{index}.html'

    def __init__(self, arguments: Dict[str, str]):
        # NOTE(review): removed the dead nested helper get_default_start_url —
        # it was never called and was broken (positional args against the
        # named-field PTT_Board_Format, and it passed board_name as
        # _getLastPage's timeout parameter).
        config_path = (arguments['config_path'] or 'config.ini')
        self._init_config(config_path)
        self._init_database()
        self.board_name = arguments['board_name']
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
        }
        self.cookies = {'over18': '1'}
        self.before = arguments['before']
        # Lazy %-style args instead of eager '{}'.format(...).
        logging.info('%s', 'Before' if self.before else 'After')
        if arguments['before']:
            # Crawl from page 1 up to the oldest index already stored
            # (or the live last page when the DB is empty).
            if arguments['index']:
                self.end_index = arguments['index']
            else:
                self.end_index = self._getDBLastPage()
                if not self.end_index:
                    self.end_index = self._getLastPage()
            self.start_index = 1
        else:
            # Crawl from the newest stored index up to the live last page.
            if arguments['index']:
                self.start_index = arguments['index']
            else:
                self.start_index = self._getDBLastPage()
                if not self.start_index:
                    self.start_index = self._getLastPage()
            self.end_index = self._getLastPage()
        self.start_url = (self.PTT_URL + self.PTT_Board_Format.format(
            board=self.board_name, index=self.end_index))

    def _init_config(self, config_path: str):
        self.config = load_config(config_path)
        self.article_config = self.config['PttArticle']
        self.database_config = self.config['Database']
        self.NEXT_PAGE_DELAY_TIME = float(
            self.article_config['NextPageDelaytime'])

    def _init_database(self):
        self.db = PttDatabase(dbtype=self.database_config['Type'],
                              dbname=self.database_config['Name'])
        self.db_session = self.db.get_session()

    def _getDBLastPage(self):
        """Return the smallest ('before') or largest stored index for this
        board, or None when nothing is stored yet."""
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board_name},
                                         {'name': self.board_name})
        index_func = (func.min if self.before else func.max)
        article_index_res = self.db_session \
            .query(ArticleIndex.board_id, index_func(ArticleIndex.index)) \
            .group_by(ArticleIndex.board_id) \
            .filter(ArticleIndex.board_id == board.id) \
            .all()
        if article_index_res:
            # At most one row per board thanks to the group_by.
            return article_index_res[0][1]
        return None

    def _getLastPage(self, timeout=3):
        """Fetch the board's landing page and derive the last index number.

        Ref: https://github.com/jwlin/ptt-web-crawler/blob/f8c04076004941d3f7584240c86a95a883ae16de/PttWebCrawler/crawler.py#L189
        """
        resp = requests.get(
            url=self.PTT_URL +
            self.PTT_Board_Format.format(board=self.board_name, index=''),
            headers=self.headers,
            cookies=self.cookies,
            timeout=timeout)
        self.cookies = resp.cookies
        self.cookies['over18'] = '1'
        content = resp.content.decode('utf-8')
        # The '‹' (previous page) link points at last_index - 1.
        first_page = re.search(r'href="/bbs/\w+/index(\d+).html">‹', content)
        if first_page is None:
            return 1
        return int(first_page.group(1)) + 1

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, object]]):
        """Upsert the collected (web_id, board_id, index) rows."""
        self.db.bulk_update(self.db_session, ArticleIndex, result)

    def crawling(self):
        """Walk index pages from end_index down to start_index, saving every
        article link found on each page."""
        board = self.db.get(self.db_session, Board, {'name': self.board_name})
        logging.info('Index range: %d ~ %d', self.start_index, self.end_index)
        while self.end_index >= self.start_index:
            ptt_index_url = (self.PTT_URL + self.PTT_Board_Format).format(
                board=self.board_name, index=self.end_index)
            logging.info('Processing index: %d, Url = %s', self.end_index,
                         ptt_index_url)
            resp = requests.get(url=ptt_index_url,
                                headers=self.headers,
                                cookies=self.cookies,
                                timeout=None)
            self.cookies = resp.cookies
            self.cookies['over18'] = '1'
            if resp.status_code != 200:
                logging.error(
                    'Processing index error, status_code = %d, Url = %s',
                    resp.status_code, ptt_index_url)
                resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            divs = soup.find("div",
                             "r-list-container action-bar-margin bbs-screen")
            children = divs.findChildren("div", recursive=False)
            article_list = []
            for div in children:
                # ex. link would be <a href="/bbs/PublicServan/M.1127742013.A.240.html">Re: [問題] 職等</a>
                try:
                    if 'r-list-sep' in div['class']:
                        # Stop at the list separator; entries below it are
                        # the board's pinned posts.
                        break
                    elif 'r-ent' in div['class']:
                        try:
                            href = div.find('a')['href']
                            link = self.PTT_URL + href
                            article_id = re.sub(r'\.html', '',
                                                href.split('/')[-1])
                            article_list.append({
                                'web_id': article_id,
                                'board_id': board.id,
                                'index': self.end_index
                            })
                            logging.debug('Processing article: %s, Url = %s',
                                          article_id, link)
                        except (TypeError, KeyError):
                            # Deleted articles render without an <a href>.
                            pass
                except Exception:
                    # BUGFIX: log the page URL — the old code logged `link`,
                    # which may be unbound when the failure happens before
                    # the href is parsed.
                    logging.exception('Processing article error, Url = %s',
                                      ptt_index_url)
            self._output_database(article_list)
            self.end_index -= 1
            time.sleep(self.NEXT_PAGE_DELAY_TIME)
class PttArticleCrawler:
    """Crawls full PTT web articles (content, metadata, pushes) for a board.

    Sources are either a page-index range (``_crawling_from_arg``) or the
    ArticleIndex table (``_crawling_from_db``). Results go to JSON files
    and/or the database depending on the [PttArticle] Output setting.
    """

    PTT_URL = 'https://www.ptt.cc'
    PTT_Board_Format = '/bbs/{board}/index{index}.html'
    PTT_Article_Format = '/bbs/{board}/{web_id}.html'
    DELAY_TIME = 1.0
    NEXT_PAGE_DELAY_TIME = 5.0

    @log('Initialize')
    def __init__(self, arguments: Dict):
        config_path = (arguments['config_path'] or 'config.ini')
        self._init_config(config_path)
        self._init_database()
        self.board = arguments['board_name']
        self.timeout = None
        # self.timeout = float(self.article_config['Timeout'])
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0'
        }
        self.cookies = {'over18': '1'}
        self.start_date = arguments['start_date']
        self.from_database = arguments['database']
        if not self.from_database:
            # Explicit (start, end) index pair, or the whole board.
            self.start_index, self.end_index = (
                arguments['index'] if arguments['index'] else
                (1, self.getLastPage(self.board, self.timeout)))
        else:
            self.start_index, self.end_index = (0, 0)
        self.upgrade_action = arguments['upgrade']
        self.json_folder = arguments['json_folder']
        self.json_prefix = arguments['json_prefix']
        if arguments['verbose']:
            logging.getLogger().setLevel(logging.DEBUG)

    def _init_config(self, config_path: str):
        """Load [PttArticle]/[Database] settings and derive output targets."""
        self.config = load_config(config_path)
        self.article_config = self.config['PttArticle']
        self.database_config = self.config['Database']
        self.DELAY_TIME = float(self.article_config['Delaytime'])
        self.NEXT_PAGE_DELAY_TIME = float(
            self.article_config['NextPageDelaytime'])
        # Keep at most this many history versions per article.
        self.VERSION_ROTATE = int(self.article_config['VersionRotate']) or 30
        self.json_output = False
        self.database_output = False
        if 'Output' in self.article_config:
            output = self.article_config['Output']
            self.json_output = output in ('both', 'json')
            self.database_output = output in ('both', 'database')

    def _init_database(self):
        self.db = PttDatabase(dbtype=self.database_config['Type'],
                              dbname=self.database_config['Name'])
        self.db_session = self.db.get_session()

    def _output_json(self, result: Dict[str, object], index):
        """Dump one page's articles to <prefix><board>_<index>.json (UTF-8)."""
        json_name = '{prefix}{board}_{index}.json'.format(
            prefix=self.json_prefix, board=self.board, index=index)
        json_path = os.path.join(self.json_folder, json_name)
        with codecs.open(json_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(result, jsonfile, sort_keys=True, indent=4,
                      ensure_ascii=False)

    def _output_index_to_database(self, result: List[tuple]):
        """Upsert (web_id, link, index) tuples into ArticleIndex."""
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board},
                                         {'name': self.board})
        index_list = []
        for web_id, link, index in result:
            logging.debug('web_id = %s, link = %s, index = %d, board.id = %d',
                          web_id, link, index, board.id)
            index_list.append({
                'web_id': web_id,
                'board_id': board.id,
                'index': index
            })
        self.db.bulk_update(self.db_session, ArticleIndex, index_list)

    @log('Output_Database')
    def _output_database(self, result: List[Dict[str, object]]):
        """Persist parsed articles: user/board/article rows, a new
        ArticleHistory version, its pushes, and placeholder IpAsn rows."""

        def parser_push_ipdatetime(push_ipdatetime):
            # Splits an optional IP and a 'MM/DD HH:MM' timestamp.
            logging.debug('parser_push_ipdatetime(%s)', push_ipdatetime)
            if push_ipdatetime:
                match = re.search(r'([\d.]*)\W?(\d{2}\/\d{2}\ \d{2}:\d{2})',
                                  push_ipdatetime)
                if match:
                    push_ip = match.group(1)
                    # BUGFIX: the capture is 'MM/DD HH:MM' but the old format
                    # string was "%m/%d %M:%S", which dropped the hour and
                    # misread minutes as seconds.
                    push_datetime = datetime.strptime(match.group(2),
                                                      "%m/%d %H:%M")
                    return push_ip, push_datetime
                logging.warning('push_ipdatetime %s search failed',
                                push_ipdatetime)
            return None, None

        def parse_author(author):
            # 'username (nickname)' -> 'username'; anything else unchanged.
            logging.debug('parse_author(%s)', author)
            if author:
                match = re.search(r'([\S]*)\D\((.*)\)', author)
                if match:
                    return match.group(1)
            return author

        for record in result:
            try:
                author_username = parse_author(record['author'])
                if not author_username:
                    logging.warning('author is empty, record = %s', record)
                    author_username = ''
                author_conditon = {'username': author_username}
                author_values = {
                    'username': author_username,
                    'login_times': 0,
                    'valid_article_count': 0
                }
                if not self.upgrade_action:
                    # Without --upgrade, never re-write an existing article.
                    article = self.db.get(self.db_session, Article,
                                          {'web_id': record['article_id']})
                    if article:
                        continue
                user, _ = self.db.get_or_create(self.db_session, User,
                                                author_conditon,
                                                author_values,
                                                auto_commit=False)
                board, _ = self.db.get_or_create(self.db_session, Board,
                                                 {'name': record['board']},
                                                 {'name': record['board']},
                                                 auto_commit=False)
                try:
                    record['date'] = datetime.strptime(
                        record['date'], '%a %b %d %H:%M:%S %Y')
                except (ValueError, TypeError):
                    # Edited/garbled date headers are stored as NULL.
                    record['date'] = None
                article, is_new_article = self.db.get_or_create(
                    self.db_session, Article,
                    {'web_id': record['article_id']}, {
                        'web_id': record['article_id'],
                        'user_id': user.id,
                        'board_id': board.id,
                        'post_datetime': record['date'],
                        'post_ip': record['ip']
                    },
                    auto_commit=False)
                if record['ip']:
                    # Placeholder row; ASN fields get filled by PttIpAsnCrawler.
                    _, _ = self.db.get_or_create(self.db_session, IpAsn,
                                                 {'ip': record['ip']}, {
                                                     'ip': record['ip'],
                                                     'asn': None,
                                                     'asn_cidr': None,
                                                     'asn_country_code': None,
                                                     'asn_date': None,
                                                     'asn_description': None,
                                                     'asn_raw': None,
                                                     'asn_registry': None
                                                 },
                                                 auto_commit=False)
                if not is_new_article:
                    # Close the previous (most recent) history version.
                    article.history[0].end_at = datetime.now()
                    self.db_session.flush()
                history = self.db.create(self.db_session, ArticleHistory, {
                    'article_id': article.id,
                    'title': record['article_title'],
                    'content': record['content'],
                    'start_at': datetime.now(),
                    'end_at': datetime.now()
                },
                                         auto_commit=False)
                # 更新到最近的文章歷史記錄推文
                push_list = []
                for (floor, message) in enumerate(record['messages']):
                    push_userid = message['push_userid']
                    if not push_userid:
                        logging.warning('push_userid is empty, message = %s',
                                        message)
                        push_userid = ''
                    push_user_condition = {'username': push_userid}
                    push_user_values = {
                        'username': push_userid,
                        'login_times': 0,
                        'valid_article_count': 0
                    }
                    push_user, _ = self.db.get_or_create(self.db_session,
                                                         User,
                                                         push_user_condition,
                                                         push_user_values,
                                                         auto_commit=False)
                    push_ip, push_datetime = parser_push_ipdatetime(
                        message['push_ipdatetime'])
                    push_list.append(
                        Push(article_history_id=history.id,
                             floor=(floor + 1),
                             push_tag=message['push_tag'],
                             push_user_id=push_user.id,
                             push_content=message['push_content'],
                             push_ip=push_ip,
                             push_datetime=push_datetime))
                    if push_ip:
                        _, _ = self.db.get_or_create(
                            self.db_session, IpAsn, {'ip': push_ip}, {
                                'ip': push_ip,
                                'asn': None,
                                'asn_cidr': None,
                                'asn_country_code': None,
                                'asn_date': None,
                                'asn_description': None,
                                'asn_raw': None,
                                'asn_registry': None
                            },
                            auto_commit=False)
                self.db.bulk_insert(self.db_session, push_list,
                                    auto_commit=False)
                # Rotate out history versions beyond VERSION_ROTATE.
                article = self.db.get(self.db_session, Article,
                                      {'id': article.id})
                if len(article.history) >= self.VERSION_ROTATE:
                    for h in article.history[self.VERSION_ROTATE:]:
                        self.db_session.delete(h)
                    self.db_session.flush()
                self.db_session.commit()
            except Exception:
                # Narrowed from a bare except so Ctrl-C still aborts the run.
                logging.exception('record = %s', record)

    def parse(self, link, article_id, board, timeout=3):
        """Fetch one article page and return a dict with its metadata,
        cleaned content, IP and push messages.

        Ref: https://github.com/jwlin/ptt-web-crawler/blob/f8c04076004941d3f7584240c86a95a883ae16de/PttWebCrawler/crawler.py#L99
        """
        resp = requests.get(url=link,
                            headers=self.headers,
                            cookies=self.cookies,
                            verify=True,
                            timeout=timeout)
        self.cookies = resp.cookies
        self.cookies['over18'] = '1'
        if resp.status_code != 200:
            return {"error": "invalid url"}
        soup = BeautifulSoup(resp.text, 'html.parser')
        main_content = soup.find(id="main-content")
        metas = main_content.select('div.article-metaline')
        author = ''
        title = ''
        date = ''
        if metas:
            author = (metas[0].select('span.article-meta-value')[0].string
                      if metas[0].select('span.article-meta-value')[0]
                      else author)
            title = (metas[1].select('span.article-meta-value')[0].string
                     if metas[1].select('span.article-meta-value')[0]
                     else title)
            date = (metas[2].select('span.article-meta-value')[0].string
                    if metas[2].select('span.article-meta-value')[0]
                    else date)
            # remove meta nodes
            for meta in metas:
                meta.extract()
            for meta in main_content.select('div.article-metaline-right'):
                meta.extract()
        else:
            logging.info('metas is None in link %s', link)
        # 轉錄文章: author/date live in the '※ 轉錄者:' footer instead.
        transcription = main_content.find(text=re.compile(u'※ 轉錄者:'))
        if transcription:
            match = re.search(
                r'\W(\w+)\W\([0-9]*\.[0-9]*\.[0-9]*\.[0-9]*\),\W([0-9]+\/[0-9]+\/[0-9]+\W[0-9]+:[0-9]+:[0-9]+)',
                transcription)
            if match:
                author = match.group(1)
                date = datetime.strptime(match.group(2), "%m/%d/%Y %H:%M:%S")
                date = date.strftime('%a %b %d %H:%M:%S %Y')
            else:
                logging.info('Excuse me WTF!?')
                raise PostException('此文章被編輯過,解析出現問題。')
        # remove and keep push nodes
        pushes = main_content.find_all('div', class_='push')
        for push in pushes:
            push.extract()
        try:
            ip = main_content.find(text=re.compile(u'※ 發信站:'))
            ip = re.search(r'[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*', ip).group()
        except (TypeError, AttributeError):
            # Footer missing or no IP in it.
            ip = None
        # 移除 '※ 發信站:' (starts with u'\u203b'), '◆ From:' (starts with
        # u'\u25c6'), 空行及多餘空白
        # 保留英數字, 中文及中文標點, 網址, 部分特殊符號
        filtered = [
            v for v in main_content.stripped_strings
            if v[0] not in [u'※', u'◆'] and v[:2] not in [u'--']
        ]
        # NOTE(review): inside the class, '/-_' is a character RANGE
        # (0x2F-0x5F); possibly unintended, kept as-is to preserve behavior.
        expr = re.compile((
            r'[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\s\w:/-_.?~%()]'
        ))
        for i in range(len(filtered)):
            filtered[i] = re.sub(expr, '', filtered[i])
        filtered = [_f for _f in filtered if _f]  # remove empty strings
        # remove last line containing the url of the article
        filtered = [x for x in filtered if article_id not in x]
        content = ' '.join(filtered)
        content = re.sub(r'(\s)+', ' ', content)
        # push messages
        p, b, n = 0, 0, 0
        messages = []
        for push in pushes:
            if not push.find('span', 'push-tag'):
                continue
            push_tag = (push.find('span', 'push-tag').string
                        or '').strip(' \t\n\r')
            push_userid = (push.find('span', 'push-userid').string
                           or '').strip(' \t\n\r')
            # if find is None: find().strings -> list -> ' '.join;
            # else the current way
            push_content = push.find('span', 'push-content').strings
            push_content = (' '.join(push_content)[1:]).strip(
                ' \t\n\r')  # remove leading ':'
            push_ipdatetime = (push.find('span', 'push-ipdatetime').string
                               or '').strip(' \t\n\r')
            messages.append({
                'push_tag': push_tag,
                'push_userid': push_userid,
                'push_content': push_content,
                'push_ipdatetime': push_ipdatetime
            })
            if push_tag == u'推':
                p += 1
            elif push_tag == u'噓':
                b += 1
            else:
                n += 1
        # count: 推噓文相抵後的數量; all: 推文總數
        message_count = {
            'all': p + b + n,
            'count': p - b,
            'push': p,
            'boo': b,
            "neutral": n
        }
        data = {
            'url': link,
            'board': board,
            'article_id': article_id,
            'article_title': title,
            'author': author,
            'date': date,
            'content': content,
            'ip': ip,
            'message_count': message_count,
            'messages': messages
        }
        return data

    def getLastPage(self, board, timeout=3):
        """Return the board's last index number from its landing page.

        Ref: https://github.com/jwlin/ptt-web-crawler/blob/f8c04076004941d3f7584240c86a95a883ae16de/PttWebCrawler/crawler.py#L189
        """
        resp = requests.get(url='https://www.ptt.cc/bbs/' + board +
                            '/index.html',
                            headers=self.headers,
                            cookies=self.cookies,
                            timeout=timeout)
        self.cookies = resp.cookies
        self.cookies['over18'] = '1'
        content = resp.content.decode('utf-8')
        first_page = re.search(r'href="/bbs/\w+/index(\d+).html">‹', content)
        if first_page is None:
            return 1
        return int(first_page.group(1)) + 1

    @log()
    def crawling(self):
        """Entry point: crawl from the ArticleIndex table or from an index range."""
        logging.debug('Start date = %s', self.start_date)
        logging.debug('Start = %d, End = %d', self.start_index,
                      self.end_index)
        logging.debug('From database = %s', str(self.from_database))
        if self.from_database:
            self._crawling_from_db()
        else:
            self._crawling_from_arg()

    @log()
    def _crawling_from_arg(self):
        """Walk index pages end->start, parse their articles and output them."""
        last_page = self.end_index
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board},
                                         {'name': self.board})
        while last_page >= self.start_index:
            ptt_index_url = (self.PTT_URL + self.PTT_Board_Format).format(
                board=self.board, index=last_page)
            logging.debug('Processing index: %d, Url = %s', last_page,
                          ptt_index_url)
            resp = requests.get(url=ptt_index_url,
                                headers=self.headers,
                                cookies=self.cookies,
                                timeout=self.timeout)
            self.cookies = resp.cookies
            self.cookies['over18'] = '1'
            if resp.status_code != 200:
                logging.error(
                    'Processing index error, status_code = %d, Url = %s',
                    resp.status_code, ptt_index_url)
                resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            divs = soup.find("div",
                             "r-list-container action-bar-margin bbs-screen")
            children = divs.findChildren("div", recursive=False)
            article_link_list = []
            for div in children:
                # ex. link would be <a href="/bbs/PublicServan/M.1127742013.A.240.html">Re: [問題] 職等</a>
                if 'r-list-sep' in div['class']:
                    break
                elif 'r-ent' in div['class']:
                    try:
                        href = div.find('a')['href']
                        link = self.PTT_URL + href
                        article_id = re.sub(r'\.html', '',
                                            href.split('/')[-1])
                        article_link_list.append((article_id, link,
                                                  last_page))
                    except Exception:
                        logging.warning('%s href 404', div)
                else:
                    continue
            self._output_index_to_database(article_link_list)
            # Skip pages whose articles are all in the DB already
            # (unless --upgrade forces a re-crawl).
            page_article_count = self.db_session.query(ArticleIndex) \
                .join(Article, Article.web_id == ArticleIndex.web_id) \
                .filter(ArticleIndex.board_id == board.id,
                        ArticleIndex.index == last_page)\
                .count()
            if not self.upgrade_action and page_article_count == len(
                    article_link_list):
                pass
            else:
                article_list = []
                for article_id, link, _ in article_link_list:
                    try:
                        logging.info('Processing article: %s, Url = %s',
                                     article_id, link)
                        article_list.append(
                            self.parse(link, article_id, self.board,
                                       self.timeout))
                        time.sleep(self.DELAY_TIME)
                    except Exception:
                        logging.exception(
                            'Processing article error, Url = %s', link)
                len_article_list = len(article_list)
                if self.start_date:
                    # Drop articles older than start_date; if any were
                    # dropped, this is the last page worth crawling.
                    tmp_article_list = []
                    for article in article_list:
                        try:
                            article_date = datetime.strptime(
                                article['date'], '%a %b %d %H:%M:%S %Y')
                            if self.start_date <= article_date:
                                tmp_article_list.append(article)
                        except Exception as e:
                            # 避免因為原文的日期被砍,導致無法繼續處理
                            len_article_list -= 1
                            logging.error('%s', e)
                            logging.error('article: %s , date format: %s',
                                          article['article_id'],
                                          article['date'])
                    if len(tmp_article_list) < len_article_list:
                        self.start_index = last_page
                    article_list = tmp_article_list
                if self.database_output:
                    self._output_database(article_list)
                if self.json_output:
                    self._output_json(article_list, last_page)
            last_page -= 1
            time.sleep(self.NEXT_PAGE_DELAY_TIME)

    @log()
    def _crawling_from_db(self):
        """Parse every article listed in ArticleIndex for this board,
        flushing to the DB every 20 articles."""
        board, _ = self.db.get_or_create(self.db_session, Board,
                                         {'name': self.board},
                                         {'name': self.board})
        if self.upgrade_action:
            # BUGFIX: the old filter used Article.board_id on a
            # query(ArticleIndex), producing an implicit cross join;
            # the board filter belongs on ArticleIndex.
            article_index_list = self.db_session \
                .query(ArticleIndex)\
                .filter(ArticleIndex.board_id == board.id).all()
        else:
            # Only indexes that have no parsed Article yet.
            article_index_list = self.db_session \
                .query(ArticleIndex) \
                .outerjoin(Article, ArticleIndex.web_id == Article.web_id) \
                .filter(Article.id.is_(None),
                        ArticleIndex.board_id == board.id).all()
        article_list = []
        count = 0
        for article_index in article_index_list:
            link = self.PTT_URL + \
                self.PTT_Article_Format.format(board=article_index.board.name,
                                               web_id=article_index.web_id)
            logging.debug('Processing Url = %s', link)
            article_id = article_index.web_id
            try:
                article_list.append(
                    self.parse(link, article_id, self.board, self.timeout))
                count += 1
                if count == 20:
                    self._output_database(article_list)
                    article_list = []
                    count = 0
            except Exception:
                pass
            finally:
                time.sleep(self.DELAY_TIME)
        if article_list:
            self._output_database(article_list)
class PttExportHelper(object):
    """Export crawled PTT data (articles, pushes and users) to ods/csv/json.

    Several exported column/key names keep their historical misspellings
    ('Atricle', 'cotent', 'datatime') on purpose: they are part of the
    export format that downstream consumers may already depend on.
    """

    def __init__(self):
        # Deferred: real initialization happens in _init_helper() from go().
        pass

    def _init_helper(self, arguments: Dict[str, str]):
        """Resolve output options from CLI arguments and open a DB session.

        Keys read from ``arguments``: 'config_path', 'format',
        'output_folder', 'output_prefix'.

        Raises:
            KeyError: when arguments['format'] is not an ExportFormat name.
        """
        config_path = (arguments['config_path']
                       if arguments['config_path'] else 'config.ini')
        self.config = load_config(config_path)
        self.file_format = ExportFormat[arguments['format']]
        self.output_folder = arguments['output_folder']
        self.output_prefix = arguments['output_prefix']
        # exist_ok avoids the check-then-create race of the old
        # os.path.exists() guard.
        os.makedirs(self.output_folder, exist_ok=True)
        self.db = PttDatabase(dbtype=self.config['Database']['Type'],
                              dbname=self.config['Database']['Name'])
        self.db_session = self.db.get_session()

    def _lookup_ip_asn(self, ip):
        """Return the IpAsn row for ``ip``, or None when the ip is unknown."""
        return self.db_session.query(IpAsn).filter_by(ip=ip).first()

    def _output_path(self, filename):
        """Build '<folder>/<prefix><filename>.<ext>' for the current format.

        Bug fix: the old template ('{prefix}(unknown).{file_format}') never
        used the filename that every caller passed in.
        """
        return os.path.join(
            self.output_folder,
            '{prefix}{filename}.{file_format}'.format(
                prefix=self.output_prefix,
                filename=filename,
                file_format=self.file_format.name))

    @staticmethod
    def _format_push_datetime(push):
        """Render push.push_datetime as 'MM/DD HH:MM:SS', '' when absent.

        Some rows store the datetime as a string; those are parsed locally
        instead of being written back onto the ORM object (the old code
        mutated the session object as a side effect).
        """
        value = push.push_datetime
        if value is None:
            return ''
        if isinstance(value, str):
            value = datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
        return value.strftime('%m/%d %H:%M:%S')

    @log('Get Data')
    def _get_export_rows(self):
        """Collect Article/Push/User sheets as lists of rows.

        Returns:
            OrderedDict mapping sheet name -> rows; the first row of each
            sheet is its header.
        """
        article_rows = [[
            'Atricle.web_id', 'Article.board', 'Atricle.author',
            'Atricle.title', 'Atricle.cotent', 'Atricle.post_ip',
            'Atricle.post_ip.asn', 'Atricle.post_ip.asn_date',
            'Atricle.post_ip.asn_registry', 'Atricle.post_ip.asn_cidr',
            'Atricle.post_ip.asn_country_code',
            'Atricle.post_ip.asn_description', 'Article.post_datetime',
            'Article.last_modified_time'
        ]]
        push_rows = [[
            'Push.article_web_id', 'Push.username', 'Push.tag',
            'Push.content', 'Push.ip', 'Push.ip.asn', 'Push.ip.asn_cidr',
            'Push.ip.asn_country_code', 'Push.ip.asn_date',
            'Push.ip.asn_description', 'Push.ip.asn_registry', 'Push.datatime'
        ]]
        user_rows = [[
            'User.username', 'User.login_times', 'User.valid_article_count',
            'User.last_login_datetime', 'User.last_login_ip',
            'User.last_login_ip.asn', 'User.last_login_ip.asn_date',
            'User.last_login_ip.asn_registry', 'User.last_login_ip.asn_cidr',
            'User.last_login_ip.asn_country_code',
            'User.last_login_ip.asn_description'
        ]]
        data = OrderedDict()

        article_list = self.db_session.query(Article).order_by(
            Article.post_datetime).all()
        for article in article_list:
            # history[0] is the latest snapshot of the article.
            last_history = article.history[0]
            article_row = [
                article.web_id,
                article.board.name or '',
                article.user.username or '',
                last_history.title or '',
                last_history.content or ''
            ]
            article_ip_asn = self._lookup_ip_asn(article.post_ip)
            if article_ip_asn:
                article_row += [
                    article_ip_asn.ip or '',
                    article_ip_asn.asn or '',
                    str(article_ip_asn.asn_date or ''),
                    article_ip_asn.asn_registry or '',
                    article_ip_asn.asn_cidr or '',
                    article_ip_asn.asn_country_code or '',
                    article_ip_asn.asn_description or ''
                ]
            else:
                article_row += [article.post_ip or '', '', '', '', '', '', '']
            article_row += [
                str(article.post_datetime or ''),
                str(last_history.end_at or '')
            ]
            article_rows.append(article_row)

            for push in last_history.push_list:
                push_row = [
                    article.web_id,
                    push.user.username or '',
                    push.push_tag or '',
                    push.push_content or ''
                ]
                push_ip_asn = self._lookup_ip_asn(push.push_ip)
                if push_ip_asn:
                    # NOTE: push asn field order differs from Article's;
                    # it matches the Push header above.
                    push_row += [
                        push_ip_asn.ip or '',
                        push_ip_asn.asn or '',
                        push_ip_asn.asn_cidr or '',
                        push_ip_asn.asn_country_code or '',
                        str(push_ip_asn.asn_date or ''),
                        push_ip_asn.asn_description or '',
                        push_ip_asn.asn_registry or ''
                    ]
                else:
                    push_row += [push.push_ip or '', '', '', '', '', '', '']
                push_row.append(self._format_push_datetime(push))
                push_rows.append(push_row)

        user_list = self.db_session.query(User).all()
        for user in user_list:
            if user.last_record:
                user_last_record = user.last_record[0]
                user_row = [
                    user.username or '',
                    user.login_times or '',
                    user.valid_article_count or '',
                    str(user_last_record.last_login_datetime or ''),
                    user_last_record.last_login_ip or ''
                ]
                user_ip_asn = self._lookup_ip_asn(
                    user_last_record.last_login_ip)
                # Bug fix: the ip may be missing from IpAsn; the old code
                # dereferenced user_ip_asn unconditionally.
                if user_ip_asn:
                    user_row += [
                        user_ip_asn.asn or '',
                        str(user_ip_asn.asn_date or ''),
                        user_ip_asn.asn_registry or '',
                        user_ip_asn.asn_cidr or '',
                        user_ip_asn.asn_country_code or '',
                        user_ip_asn.asn_description or ''
                    ]
                else:
                    user_row += ['', '', '', '', '', '']
            else:
                # Bug fix: keep the username even without a login record
                # (the json exporter already did; the csv one dropped it).
                user_row = [user.username or ''] + [''] * 10
            user_rows.append(user_row)

        data.update({'Article': article_rows})
        data.update({'Push': push_rows})
        data.update({'User': user_rows})
        return data

    @log('Get Data')
    def _get_export_json(self):
        """Collect the same data as _get_export_rows(), as lists of dicts
        keyed by the exported column names (for the json format)."""
        article_rows = []
        push_rows = []
        user_rows = []
        data = OrderedDict()

        article_list = self.db_session.query(Article).order_by(
            Article.post_datetime).all()
        for article in article_list:
            last_history = article.history[0]
            article_row = {
                'Atricle.web_id': article.web_id,
                'Article.board': article.board.name or '',
                'Atricle.author': article.user.username or '',
                'Atricle.title': last_history.title or '',
                'Atricle.cotent': last_history.content or ''
            }
            article_ip_asn = self._lookup_ip_asn(article.post_ip)
            if article_ip_asn:
                article_row.update({
                    'Atricle.post_ip': article_ip_asn.ip or '',
                    'Atricle.post_ip.asn': article_ip_asn.asn or '',
                    'Atricle.post_ip.asn_date':
                        str(article_ip_asn.asn_date or ''),
                    'Atricle.post_ip.asn_registry':
                        article_ip_asn.asn_registry or '',
                    'Atricle.post_ip.asn_cidr':
                        article_ip_asn.asn_cidr or '',
                    'Atricle.post_ip.asn_country_code':
                        article_ip_asn.asn_country_code or '',
                    'Atricle.post_ip.asn_description':
                        article_ip_asn.asn_description or ''
                })
            else:
                article_row.update({
                    'Atricle.post_ip': article.post_ip or '',
                    'Atricle.post_ip.asn': '',
                    'Atricle.post_ip.asn_date': '',
                    'Atricle.post_ip.asn_registry': '',
                    'Atricle.post_ip.asn_cidr': '',
                    'Atricle.post_ip.asn_country_code': '',
                    'Atricle.post_ip.asn_description': ''
                })
            article_row.update({
                'Article.post_datetime': str(article.post_datetime or ''),
                'Article.last_modified_time': str(last_history.end_at or '')
            })
            article_rows.append(article_row)

            for push in last_history.push_list:
                push_row = {
                    'Push.article_web_id': article.web_id,
                    'Push.username': push.user.username or '',
                    'Push.tag': push.push_tag or '',
                    'Push.content': push.push_content or ''
                }
                push_ip_asn = self._lookup_ip_asn(push.push_ip)
                if push_ip_asn:
                    push_row.update({
                        'Push.ip': push_ip_asn.ip or '',
                        'Push.ip.asn': push_ip_asn.asn or '',
                        'Push.ip.asn_cidr': push_ip_asn.asn_cidr or '',
                        'Push.ip.asn_country_code':
                            push_ip_asn.asn_country_code or '',
                        'Push.ip.asn_date': str(push_ip_asn.asn_date or ''),
                        'Push.ip.asn_description':
                            push_ip_asn.asn_description or '',
                        'Push.ip.asn_registry':
                            push_ip_asn.asn_registry or ''
                    })
                else:
                    push_row.update({
                        'Push.ip': push.push_ip or '',
                        'Push.ip.asn': '',
                        'Push.ip.asn_cidr': '',
                        'Push.ip.asn_country_code': '',
                        'Push.ip.asn_date': '',
                        'Push.ip.asn_description': '',
                        'Push.ip.asn_registry': ''
                    })
                push_row.update(
                    {'Push.datatime': self._format_push_datetime(push)})
                push_rows.append(push_row)

        user_list = self.db_session.query(User).all()
        for user in user_list:
            # Start from an all-empty record so every key is always present.
            user_row = {
                'User.username': user.username or '',
                'User.login_times': '',
                'User.valid_article_count': '',
                'User.last_login_datetime': '',
                'User.last_login_ip': '',
                'User.last_login_ip.asn': '',
                'User.last_login_ip.asn_date': '',
                'User.last_login_ip.asn_registry': '',
                'User.last_login_ip.asn_cidr': '',
                'User.last_login_ip.asn_country_code': '',
                'User.last_login_ip.asn_description': ''
            }
            if user.last_record:
                user_last_record = user.last_record[0]
                user_row.update({
                    'User.login_times': user.login_times or '',
                    'User.valid_article_count':
                        user.valid_article_count or '',
                    'User.last_login_datetime':
                        str(user_last_record.last_login_datetime or ''),
                    'User.last_login_ip':
                        user_last_record.last_login_ip or ''
                })
                user_ip_asn = self._lookup_ip_asn(
                    user_last_record.last_login_ip)
                # Bug fix: guard against an ip that is absent from IpAsn
                # (the old code dereferenced user_ip_asn unconditionally).
                if user_ip_asn:
                    user_row.update({
                        'User.last_login_ip.asn': user_ip_asn.asn or '',
                        'User.last_login_ip.asn_date':
                            str(user_ip_asn.asn_date or ''),
                        'User.last_login_ip.asn_registry':
                            user_ip_asn.asn_registry or '',
                        'User.last_login_ip.asn_cidr':
                            user_ip_asn.asn_cidr or '',
                        'User.last_login_ip.asn_country_code':
                            user_ip_asn.asn_country_code or '',
                        'User.last_login_ip.asn_description':
                            user_ip_asn.asn_description or ''
                    })
            user_rows.append(user_row)

        data.update({'Article': article_rows})
        data.update({'Push': push_rows})
        data.update({'User': user_rows})
        return data

    @log('Export Json')
    def _export_json(self):
        """Write all sheets into one json file in the output folder."""
        data = self._get_export_json()
        output_filename = 'Ptt_report_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))
        json_path = self._output_path(output_filename)
        with open(json_path, 'w', encoding='utf-8') as jsonfile:
            json.dump(data, jsonfile, indent=4, sort_keys=True)

    @log('Export CSV')
    def _export_csv(self):
        """Write one csv file per sheet into the output folder."""
        data = self._get_export_rows()
        for sheet, rows in data.items():
            output_filename = 'Ptt_{sheet}_report_{export_datetime}'.format(
                sheet=sheet,
                export_datetime=datetime.now().strftime('%Y-%m-%d'))
            csv_path = self._output_path(output_filename)
            # newline='' is required by the csv module; without it every
            # row is followed by a blank line on Windows.
            with open(csv_path, 'w', encoding='utf-8',
                      newline='') as csvfile:
                csv.writer(csvfile, delimiter=',').writerows(rows)

    @log('Export Ods')
    def _export_ods(self):
        """Write all sheets into one ods spreadsheet in the output folder."""
        output_filename = 'Ptt_report_{export_datetime}'.format(
            export_datetime=datetime.now().strftime('%Y-%m-%d'))
        output_path = self._output_path(output_filename)
        save_data(output_path, self._get_export_rows())

    @log()
    def go(self, arguments: Dict[str, str]):
        """Entry point: initialize from CLI arguments and run the export.

        Raises:
            ValueError: when self.file_format is not a supported format.
        """
        self._init_helper(arguments)
        if self.file_format == ExportFormat.ods:
            self._export_ods()
        elif self.file_format == ExportFormat.csv:
            self._export_csv()
        elif self.file_format == ExportFormat.json:
            self._export_json()
        else:
            raise ValueError('File format error.')