def get_newest_index(
        api,
        index_type: int,
        board: str = None,
        # BBS
        search_type: int = 0,
        search_condition: str = None,
        search_list: list = None) -> int:
    """Return the newest index of a BBS board, a web board or the mailbox.

    The BBS and MAIL branches drive the terminal session through
    ``api.connect_core.send``; the WEB branch downloads the board page from
    https://www.ptt.cc with ``requests`` and reads its pager.

    Args:
        api: Object exposing ``config``, ``connect_core``, ``cursor`` and
            ``_check_board``.
        index_type: One of ``data_type.index_type.BBS``, ``WEB`` or ``MAIL``.
        board: Board name (used by the BBS and WEB branches).
        search_type: ``data_type.post_search_type`` value that goes with
            ``search_condition`` (BBS only).
        search_condition: Optional search text applied before the newest
            index is read (BBS only).
        search_list: Optional iterable of ``(search_type, search_condition)``
            pairs applied after ``search_condition`` (BBS only).

    Returns:
        BBS: 0 when the first target (``i18n.NoPost``) matches, otherwise
        the value of ``_get_newest_index(api)``.
        WEB: the page number found in the "上頁" pager link plus one.
        MAIL: the larger of the cursor-line index and the mailbox capacity,
        or 0 when the mailbox target never matched in three attempts.

    Raises:
        exceptions.NoSuchBoard: ``send`` returned a negative index, or the
            web request did not return an OK status.
        exceptions.NoSearchResult: A search was applied and the newest index
            equals the unfiltered newest index.
        exceptions.UnknownError: The web pager has no "上頁" link, or the
            mailbox cursor line could not be parsed.
        ValueError: ``index_type`` is not BBS, WEB or MAIL.
    """
    if index_type == data_type.index_type.BBS:

        check_value.check(api.config, str, 'Board', board)

        api._check_board(board)

        check_value.check(
            api.config, int, 'SearchType', search_type,
            value_class=data_type.post_search_type)
        if search_condition is not None:
            check_value.check(
                api.config, str, 'SearchCondition', search_condition)

        if search_list is not None:
            check_value.check(api.config, list, 'search_list', search_list)
        # NOTE(review): looks redundant with the value_class check above;
        # kept because check_value is not visible from here.
        check_value.check(api.config, int, 'SearchType', search_type)

        # Step 1: enter the board and get past any welcome animation.
        cmd = ''.join([
            command.GoMainMenu,
            'qs',
            board,
            command.Enter,
        ])

        target_list = [
            connect_core.TargetUnit(
                i18n.AnyKeyContinue,
                '任意鍵',
                response=' ',
            ),
            connect_core.TargetUnit(
                [
                    '動畫播放中',
                ],
                '互動式動畫播放中',
                response=command.Ctrl_C,
                log_level=log.level.DEBUG
            ),
            connect_core.TargetUnit(
                [
                    '進板成功',
                ],
                screens.Target.InBoard,
                break_detect=True,
                log_level=log.level.DEBUG
            ),
        ]

        api.connect_core.send(cmd, target_list)

        # Key sent to the board list for each search type. Compared with
        # ``==`` in order, exactly like the original if/elif chain.
        search_keys = (
            (data_type.post_search_type.KEYWORD, '/'),
            (data_type.post_search_type.AUTHOR, 'a'),
            (data_type.post_search_type.PUSH, 'Z'),
            (data_type.post_search_type.MARK, 'G'),
            (data_type.post_search_type.MONEY, 'A'),
        )

        def append_search(commands, kind, condition):
            # An unknown kind sends no key, but the condition and Enter are
            # still sent (same as before).
            for known_kind, key in search_keys:
                if kind == known_kind:
                    commands.append(key)
                    break
            commands.append(condition)
            commands.append(command.Enter)

        # Step 2: apply the searches, then jump to the last entry.
        cmd_list = []

        # -1 means "no search applied, nothing to compare against".
        normal_newest_index = -1
        if search_condition is not None:
            normal_newest_index = get_newest_index(
                api, index_type, board=board)
            append_search(cmd_list, search_type, search_condition)

        if search_list is not None:
            if normal_newest_index == -1:
                normal_newest_index = get_newest_index(
                    api, index_type, board=board)

            for search_type_, search_condition_ in search_list:
                append_search(cmd_list, search_type_, search_condition_)

        cmd_list.append('1')
        cmd_list.append(command.Enter)
        cmd_list.append('$')

        cmd = ''.join(cmd_list)

        target_list = [
            connect_core.TargetUnit(
                i18n.NoPost,
                '沒有文章...',
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.Success,
                screens.Target.InBoard,
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.Success,
                screens.Target.InBoardWithCursor,
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.NoSuchBoard,
                screens.Target.MainMenu_Exiting,
                exceptions_=exceptions.NoSuchBoard(api.config, board)),
        ]
        index = api.connect_core.send(cmd, target_list)
        if index < 0:
            raise exceptions.NoSuchBoard(api.config, board)

        if index == 0:
            # First target matched: the board has no posts.
            return 0

        newest_index = _get_newest_index(api)

        # The search did not narrow the list at all.
        if normal_newest_index == newest_index:
            raise exceptions.NoSearchResult()

    elif index_type == data_type.index_type.WEB:
        # Web version: read the "previous page" link of the board pager.
        previous_page = None
        url = 'https://www.ptt.cc/bbs/' + board
        r = requests.get(url, cookies={'over18': '1'})

        if r.status_code != requests.codes.ok:
            raise exceptions.NoSuchBoard(api.config, board)
        soup = BeautifulSoup(r.text, 'html.parser')

        for data in soup.select('div.btn-group.btn-group-paging a'):
            if '上頁' in data.text:
                href = data.get('href')
                previous_page = int(href.split('index')[1].split('.')[0])

        if previous_page is None:
            raise exceptions.UnknownError('')
        newest_index = previous_page + 1

    elif index_type == data_type.index_type.MAIL:

        cmd = ''.join([
            command.GoMainMenu,
            command.Ctrl_Z,
            'm',
            command.Ctrl_F * 50,
        ])

        target_list = [
            connect_core.TargetUnit(
                i18n.MailBox,
                screens.Target.InMailBox,
                break_detect=True,
            ),
            connect_core.TargetUnit(
                i18n.NoMail,
                screens.Target.CursorToGoodbye,
                break_detect=True,
                log_level=log.level.DEBUG),
        ]

        def get_index(api):
            # Capacity is queried first, the screen is read afterwards
            # (same order as before).
            current_capacity, _ = _api_util.get_mailbox_capacity(api)
            last_screen = api.connect_core.get_screen_queue()[-1]
            cursor_lines = [
                x for x in last_screen.split('\n')
                if x.strip().startswith(api.cursor)
            ]
            if not cursor_lines:
                raise exceptions.UnknownError(last_screen)

            match = re.search(r'(\d+)', cursor_lines[0])
            if match is None:
                raise exceptions.UnknownError(last_screen)

            return max(int(match.group(0)), current_capacity)

        newest_index = 0
        for _ in range(3):
            index = api.connect_core.send(
                cmd,
                target_list,
            )

            if index == 0:
                newest_index = get_index(api)
                break

    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(f'Unsupported index_type: {index_type}')

    return newest_index
def get_post(
        api,
        board: str,
        post_aid: str = None,
        post_index: int = 0,
        search_type: int = 0,
        search_condition: str = None,
        query: bool = False) -> data_type.PostInfo:
    """Fetch one post from a board and parse it into ``data_type.PostInfo``.

    The post is selected by ``post_aid`` when given, otherwise by
    ``post_index`` (optionally inside a search). The function first opens
    the post's query screen to read list-level data (author, title, AID,
    web URL, money, list date, push number), then, unless ``query`` is
    true or the post is locked, reads the full content through
    ``_api_util.get_content`` and parses header, body, IP and pushes.

    Args:
        api: Object exposing ``config``, ``connect_core``, ``cursor`` and
            ``Unconfirmed``.
        board: Board name. When it is ``allpost`` (case-insensitive) the
            board is replaced by the one found in the post header.
        post_aid: Article ID, without the leading ``#``.
        post_index: Index in the (possibly searched) post list; 0 means
            "not given".
        search_type: ``data_type.post_search_type`` value used together
            with ``search_condition``.
        search_condition: Optional search text applied before jumping to
            ``post_index``.
        query: When true, return after the query screen without reading
            the post content.

    Returns:
        A ``data_type.PostInfo``. ``format_check`` is False whenever the
        content could not be read or a header/content/IP section did not
        match the expected format; in that case the fields parsed so far
        are still filled in.

    Raises:
        exceptions.UnknownError: The cursor line could not be located on
            the screen.
    """
    cmd_list = [
        command.GoMainMenu,
        'qs',
        board,
        command.Enter,
        command.Ctrl_C * 2,
        command.Space,
    ]

    if post_aid is not None:
        cmd_list.append('#' + post_aid)

    elif post_index != 0:
        if search_condition is not None:
            # Compared with ``==`` in order, like the original elif chain;
            # an unknown search type sends no key.
            search_keys = (
                (data_type.post_search_type.KEYWORD, '/'),
                (data_type.post_search_type.AUTHOR, 'a'),
                (data_type.post_search_type.PUSH, 'Z'),
                (data_type.post_search_type.MARK, 'G'),
                (data_type.post_search_type.MONEY, 'A'),
            )
            for known_type, key in search_keys:
                if search_type == known_type:
                    cmd_list.append(key)
                    break

            cmd_list.append(search_condition)
            cmd_list.append(command.Enter)

        cmd_list.append(str(post_index))

    cmd_list.append(command.Enter)
    cmd_list.append(command.QueryPost)

    cmd = ''.join(cmd_list)

    target_list = [
        connect_core.TargetUnit(
            [
                i18n.CatchPost,
                i18n.Success,
            ],
            screens.Target.QueryPost,
            break_detect=True,
            refresh=False,
            log_level=log.level.DEBUG
        ),
        connect_core.TargetUnit(
            [
                i18n.PostDeleted,
                i18n.Success,
            ],
            screens.Target.InBoard,
            break_detect=True,
            log_level=log.level.DEBUG
        ),
        connect_core.TargetUnit(
            i18n.NoSuchBoard,
            screens.Target.MainMenu_Exiting,
            exceptions_=exceptions.NoSuchBoard(api.config, board)
        ),
    ]

    index = api.connect_core.send(cmd, target_list)
    ori_screen = api.connect_core.get_screen_queue()[-1]

    post_author = None
    post_title = None
    if index < 0 or index == 1:
        # The post has been deleted: only the list line is available.
        log.log(api.config, log.level.DEBUG, i18n.PostDeleted)
        log.show_value(
            api.config,
            log.level.DEBUG,
            'OriScreen',
            ori_screen
        )

        cursor_line = [
            line for line in ori_screen.split('\n')
            if line.startswith(api.cursor)
        ]

        if len(cursor_line) != 1:
            raise exceptions.UnknownError(ori_screen)

        cursor_line = cursor_line[0]
        log.show_value(
            api.config,
            log.level.DEBUG,
            'CursorLine',
            cursor_line
        )

        pattern_result = re.search(r'[\d]+\/[\d]+', cursor_line)
        if pattern_result is None:
            list_date = None
        else:
            list_date = pattern_result.group(0)[-5:]

        # Sample list lines:
        # > 79843     9/11 -             □ (本文已被吃掉)<
        # > 76060     8/28 -             □ (本文已被刪除) [weida7332]
        # "[id]" marks deletion by the author, "<id>" by a moderator.
        pattern_result = re.search(r'\[[\w]+\]', cursor_line)
        if pattern_result is not None:
            post_del_status = data_type.post_delete_status.AUTHOR
        else:
            pattern_result = re.search(r'<[\w]+>', cursor_line)
            post_del_status = data_type.post_delete_status.MODERATOR

        if pattern_result is not None:
            post_author = pattern_result.group(0)[1:-1]
        else:
            post_author = None
            post_del_status = data_type.post_delete_status.UNKNOWN

        log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date)
        log.show_value(api.config, log.level.DEBUG,
                       'PostAuthor', post_author)
        log.show_value(api.config, log.level.DEBUG,
                       'post_del_status', post_del_status)

        return data_type.PostInfo(
            board=board,
            author=post_author,
            list_date=list_date,
            delete_status=post_del_status,
            format_check=True
        )

    elif index == 0:

        lock_post = False
        cursor_lines = [
            line for line in ori_screen.split('\n')
            if line.strip().startswith(api.cursor)
        ]
        if not cursor_lines:
            # Reported the same way as in the deleted-post branch, instead
            # of printing the screen and re-raising IndexError.
            raise exceptions.UnknownError(ori_screen)
        cursor_line = cursor_lines[0]

        # The author is the last word before the title marker.
        post_author = cursor_line
        if '□' in post_author:
            post_author = post_author[:post_author.find('□')].strip()
        elif 'R:' in post_author:
            post_author = post_author[:post_author.find('R:')].strip()
        elif ' 轉 ' in post_author:
            post_author = post_author[:post_author.find('轉')].strip()
        elif ' 鎖 ' in post_author:
            post_author = post_author[:post_author.find('鎖')].strip()
            lock_post = True
        post_author = post_author[post_author.rfind(' '):].strip()

        # The title is everything after the title marker.
        post_title = cursor_line
        if ' □ ' in post_title:
            post_title = post_title[post_title.find('□') + 1:].strip()
        elif ' R:' in post_title:
            post_title = post_title[post_title.find('R:'):].strip()
        elif ' 轉 ' in post_title:
            post_title = post_title[post_title.find('轉') + 1:].strip()
            post_title = f'Fw: {post_title}'
        elif ' 鎖 ' in post_title:
            post_title = post_title[post_title.find('鎖') + 1:].strip()

        # Restrict URL / money parsing to the boxed query panel.
        ori_screen_temp = ori_screen[ori_screen.find('┌──────────'):]
        ori_screen_temp = ori_screen_temp[
            :ori_screen_temp.find('└─────────────')
        ]

        aid_line = [
            line for line in ori_screen.split('\n')
            if line.startswith('│ 文章代碼(AID)')
        ]

        # When no single AID line is found, post_aid keeps the caller's
        # value.
        if len(aid_line) == 1:
            aid_line = aid_line[0]
            pattern_result = re.search(r'#[\w|-]+', aid_line)
            post_aid = pattern_result.group(0)[1:]

        pattern_result = re.search(
            r'文章網址: https:[\S]+html', ori_screen_temp)
        if pattern_result is None:
            post_web = None
        else:
            post_web = pattern_result.group(0)[6:]

        pattern_result = re.search(
            r'這一篇文章值 [\d]+ Ptt幣', ori_screen_temp)
        if pattern_result is None:
            # Special posts have no price.
            post_money = -1
        else:
            post_money = pattern_result.group(0)[7:]
            post_money = post_money[:post_money.find(' ')]
            post_money = int(post_money)

        pattern_result = re.search(r'[\d]+\/[\d]+', cursor_line)
        if pattern_result is None:
            list_date = None
        else:
            list_date = pattern_result.group(0)[-5:]

        # Sample list lines:
        # >  7485   9 8/09 CodingMan    □ [閒聊] PTT Library 更新
        # > 79189 M 1 9/17 LittleCalf   □ [公告] 禁言退文公告
        # >781508 +爆 9/17 jodojeda     □ [新聞] 國人吃魚少 學者:應把吃魚當成輕鬆愉快
        # >781406 +X1 9/17 kingofage111 R: [申請] ReDmango 請辭Gossiping板主職務
        if post_index == 0:
            pattern_result = re.search(r'[\d]+', cursor_line)
            if pattern_result is not None:
                post_index = int(pattern_result.group(0))

        # Columns 7-10 of the list line hold the status marks and the
        # push number.
        push_number = cursor_line[7:11].split(' ')
        push_number = list(filter(None, push_number))

        if len(push_number) == 0:
            push_number = None
        else:
            push_number = push_number[-1]

            # Drop status marks in front of the number, one per step.
            if push_number.startswith('+') or push_number.startswith('~'):
                push_number = push_number[1:]
            if push_number.lower().startswith('m'):
                push_number = push_number[1:]
            if push_number.lower().startswith('!'):
                push_number = push_number[1:]
            if push_number.lower().startswith('s'):
                push_number = push_number[1:]
            if push_number.lower().startswith('='):
                push_number = push_number[1:]

            if len(push_number) == 0:
                push_number = None

        log.show_value(api.config, log.level.DEBUG,
                       'PostAuthor', post_author)
        log.show_value(api.config, log.level.DEBUG, 'PostTitle', post_title)
        log.show_value(api.config, log.level.DEBUG, 'PostAID', post_aid)
        log.show_value(api.config, log.level.DEBUG, 'PostWeb', post_web)
        log.show_value(api.config, log.level.DEBUG, 'PostMoney', post_money)
        log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date)
        log.show_value(api.config, log.level.DEBUG,
                       'PushNumber', push_number)

    if lock_post:
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            title=post_title,
            web_url=post_web,
            money=post_money,
            list_date=list_date,
            format_check=True,
            push_number=push_number,
            lock=True,
        )
        return post

    if query:
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            title=post_title,
            web_url=post_web,
            money=post_money,
            list_date=list_date,
            format_check=True,
            push_number=push_number,
        )
        return post

    origin_post, has_control_code = _api_util.get_content(api)

    if origin_post is None:
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            title=post_title,
            web_url=post_web,
            money=post_money,
            list_date=list_date,
            control_code=has_control_code,
            format_check=False,
            push_number=push_number,
            unconfirmed=api.Unconfirmed,
        )
        return post

    content_start = '───────────────────────────────────────'
    content_end = [
        '--\n※ 發信站: 批踢踢實業坊(ptt.cc)',
        '--\n※ 發信站: 批踢踢兔(ptt2.cc)',
        '--\n※ 發信站: 新批踢踢(ptt2.twbbs.org.tw)',
    ]

    post_author_pattern_new = re.compile(r'作者  (.+) 看板')
    post_author_pattern_old = re.compile(r'作者  (.+)')
    board_pattern = re.compile(r'看板  (.+)')

    post_date = None
    post_content = []
    ip = None
    location = None
    push_list = []

    def build_post(format_check):
        # Snapshot of everything parsed so far; reads the enclosing
        # variables at call time.
        return data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            date=post_date,
            title=post_title,
            web_url=post_web,
            money=post_money,
            content=post_content,
            ip=ip,
            push_list=push_list,
            list_date=list_date,
            control_code=has_control_code,
            format_check=format_check,
            location=location,
            push_number=push_number,
            origin_post=origin_post,
            unconfirmed=api.Unconfirmed,
        )

    # Format check; posts edited into a non-standard shape cannot be
    # parsed and are returned with format_check=False.
    origin_post_lines = origin_post.split('\n')

    author_line = origin_post_lines[0]
    if board.lower() == 'allpost':
        board_line = author_line[author_line.find(')') + 1:]
        pattern_result = board_pattern.search(board_line)
        if pattern_result is not None:
            # Only the board is taken from this match; the author parsed
            # from the list line must not be overwritten with it.
            board_temp = pattern_result.group(0)
            board_temp = board_temp[2:].strip()
            if len(board_temp) > 0:
                board = board_temp
                log.show_value(
                    api.config,
                    log.level.DEBUG,
                    i18n.Board,
                    board
                )

    pattern_result = post_author_pattern_new.search(author_line)
    if pattern_result is None:
        pattern_result = post_author_pattern_old.search(author_line)
        if pattern_result is None:
            log.show_value(
                api.config,
                log.level.DEBUG,
                i18n.SubstandardPost,
                i18n.Author
            )
            return build_post(False)
    post_author = pattern_result.group(0)
    post_author = post_author[:post_author.rfind(')') + 1]
    post_author = post_author[4:].strip()

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Author,
        post_author
    )

    # A post shorter than three lines has no title/date line; treat it as
    # substandard instead of raising IndexError.
    title_line = origin_post_lines[1] if len(origin_post_lines) > 1 else ''
    pattern_result = re.search(r'標題  (.+)', title_line)
    if pattern_result is None:
        log.show_value(
            api.config,
            log.level.DEBUG,
            i18n.SubstandardPost,
            i18n.Title
        )
        return build_post(False)
    post_title = pattern_result.group(0)
    post_title = post_title[4:].strip()

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Title,
        post_title
    )

    date_line = origin_post_lines[2] if len(origin_post_lines) > 2 else ''
    pattern_result = re.search(r'時間  (.+)', date_line)
    if pattern_result is None:
        log.show_value(
            api.config,
            log.level.DEBUG,
            i18n.SubstandardPost,
            i18n.Date
        )
        return build_post(False)
    post_date = pattern_result.group(0)
    post_date = post_date[4:].strip()

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Date,
        post_date
    )

    content_fail = True
    if content_start in origin_post:
        post_content = origin_post
        post_content = post_content[
            post_content.find(content_start) + len(content_start) + 1:
        ]
        for end_mark in content_end:
            if end_mark in post_content:
                content_fail = False

                # "+ 3" keeps the leading "--\n" of the end marker in the
                # content.
                post_content = post_content[
                    :post_content.rfind(end_mark) + 3
                ]
                # From here on only the footer (info lines and pushes) is
                # scanned.
                origin_post_lines = origin_post[origin_post.find(end_mark):]
                origin_post_lines = origin_post_lines.split('\n')
                break

    if content_fail:
        log.show_value(
            api.config,
            log.level.DEBUG,
            i18n.SubstandardPost,
            i18n.Content
        )
        return build_post(False)

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Content,
        post_content
    )

    info_lines = [
        line for line in origin_post_lines
        if line.startswith('※') or line.startswith('◆')
    ]

    pattern = re.compile(r'[\d]+\.[\d]+\.[\d]+\.[\d]+')
    pattern_p2 = re.compile(r'[\d]+-[\d]+-[\d]+-[\d]+')
    for line in reversed(info_lines):
        log.show_value(
            api.config,
            log.level.DEBUG,
            'IP Line',
            line
        )

        # type 1
        # ※ 編輯: CodingMan (111.243.146.98 臺灣)
        # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 111.243.146.98 (臺灣)

        # type 2
        # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 116.241.32.178
        # ※ 編輯: kill77845 (114.136.55.237), 12/08/2018 16:47:59

        # type 3
        # ※ 發信站: 批踢踢實業坊(ptt.cc)
        # ◆ From: 211.20.78.69
        # ※ 編輯: JCC 來自: 211.20.78.69 (06/20 10:22)
        # ※ 編輯: JCC (118.163.28.150), 12/03/2015 14:25:35

        pattern_result = pattern.search(line)
        if pattern_result is not None:
            ip = pattern_result.group(0)
            location_temp = line[line.find(ip) + len(ip):].strip()
            location_temp = location_temp.replace('(', '')
            location_temp = location_temp[:location_temp.rfind(')')]
            location_temp = location_temp.strip()
            # Only a single word after the IP counts as a location.
            if ' ' not in location_temp and len(location_temp) > 0:
                location = location_temp
                log.show_value(api.config, log.level.DEBUG,
                               'Location', location)
            break

        # Dash-separated form, e.g. 111-243-146-98.
        pattern_result = pattern_p2.search(line)
        if pattern_result is not None:
            ip = pattern_result.group(0)
            ip = ip.replace('-', '.')
            break
    if api.config.host == data_type.host_type.PTT1:
        if ip is None:
            log.show_value(
                api.config,
                log.level.DEBUG,
                i18n.SubstandardPost,
                'IP'
            )
            return build_post(False)
    log.show_value(api.config, log.level.DEBUG, 'IP', ip)

    push_author_pattern = re.compile(r'[推|噓|→] [\w| ]+:')
    push_date_pattern = re.compile(r'[\d]+/[\d]+ [\d]+:[\d]+')
    push_ip_pattern = re.compile(r'[\d]+\.[\d]+\.[\d]+\.[\d]+')

    push_list = []

    for line in origin_post_lines:
        if line.startswith('推'):
            push_type = data_type.push_type.PUSH
        elif line.startswith('噓 '):
            push_type = data_type.push_type.BOO
        elif line.startswith('→ '):
            push_type = data_type.push_type.ARROW
        else:
            continue

        result = push_author_pattern.search(line)
        if result is None:
            # Does not match the push format.
            continue
        push_author = result.group(0)[2:-1].strip()
        log.show_value(
            api.config,
            log.level.DEBUG,
            [
                i18n.Push,
                i18n.ID,
            ],
            push_author
        )

        result = push_date_pattern.search(line)
        if result is None:
            continue
        push_date = result.group(0)
        log.show_value(
            api.config,
            log.level.DEBUG,
            [
                i18n.Push,
                i18n.Date,
            ],
            push_date
        )

        push_ip = None
        result = push_ip_pattern.search(line)
        if result is not None:
            push_ip = result.group(0)
            log.show_value(
                api.config,
                log.level.DEBUG,
                [
                    i18n.Push,
                    'IP',
                ],
                push_ip
            )

        push_content = line[
            line.find(push_author) + len(push_author):
        ]

        if api.config.host == data_type.host_type.PTT1:
            push_content = push_content[
                :push_content.rfind(push_date)
            ]
        else:
            # On the other host two extra characters sit before the date:
            # → CodingMan:What is Ptt?              推 10/04 13:25
            push_content = push_content[
                :push_content.rfind(push_date) - 2
            ]
        if push_ip is not None:
            push_content = push_content.replace(push_ip, '')
        push_content = push_content[
            push_content.find(':') + 1:
        ].strip()
        log.show_value(
            api.config,
            log.level.DEBUG,
            [
                i18n.Push,
                i18n.Content,
            ],
            push_content
        )

        current_push = data_type.PushInfo(
            push_type,
            push_author,
            push_content,
            push_ip,
            push_date
        )
        push_list.append(current_push)

    return build_post(True)
def crawl_board(
        self,
        crawl_type: int,
        post_handler,
        board: str,
        # BBS version
        start_index: int = 0,
        end_index: int = 0,
        start_aid: str = None,
        end_aid: str = None,
        search_type: int = 0,
        search_condition: str = None,
        query: bool = False,
        # web version
        start_page: int = 0,
        end_page: int = 0) -> tuple:
    """Crawl a range of posts from a board and feed each to ``post_handler``.

    With ``crawl_type == data_type.crawl_type.BBS`` posts are fetched one by
    one through ``self._get_post``; each index gets at most two attempts.
    Any other crawl type uses the web version: every page from
    ``start_page`` to ``end_page`` of https://www.ptt.cc/bbs/<board>/ is
    downloaded and each list entry is turned into a ``data_type.PostInfo``.

    Args:
        crawl_type: A ``data_type.crawl_type`` value.
        post_handler: Callable invoked with each ``data_type.PostInfo``.
        board: Board name; must not be empty.
        start_index: First post index (BBS). 0 means "not given".
        end_index: Last post index, inclusive (BBS).
        start_aid: AID of the first post (BBS); used together with
            ``end_aid`` instead of the index pair.
        end_aid: AID of the last post (BBS).
        search_type: ``data_type.post_search_type`` value (BBS).
        search_condition: Search text (BBS). For the PUSH search type it
            must be an integer string within -100..110.
        query: Passed through to ``self._get_post`` (BBS).
        start_page: First page number (web).
        end_page: Last page number, inclusive (web).

    Returns:
        ``(error_post_list, del_post_list)``. In BBS mode the first list
        holds the AID (or the index when no AID is known) of posts that
        could not be fetched or failed the format check, the second the
        indexes of deleted posts. In web mode the first list is always
        empty and the second holds the titles that start with ``(``.

    Raises:
        ValueError: Empty board, conflicting or missing range arguments, or
            an invalid PUSH search condition.
        exceptions.Requirelogin: BBS crawl without being logged in.
        exceptions.HostNotSupport: Web crawl while the host is PTT2.
        exceptions.NoSuchBoard: A web page request did not return OK.
        Any exception raised by ``self._get_post`` on the second attempt is
        re-raised unchanged.
    """
    self._one_thread()

    self.config.log_last_value = None

    check_value.check(
        self.config, int, 'crawl_type', crawl_type,
        value_class=data_type.crawl_type)
    check_value.check(self.config, str, 'Board', board)

    if len(board) == 0:
        raise ValueError(
            log.merge(
                self.config,
                [i18n.Board, i18n.ErrorParameter, board]))

    show_progress = self.config.log_level == log.level.INFO

    if crawl_type == data_type.crawl_type.BBS:
        if not self._login_status:
            raise exceptions.Requirelogin(i18n.Requirelogin)

        check_value.check(self.config, int, 'SearchType', search_type)
        if search_condition is not None:
            check_value.check(
                self.config, str, 'SearchCondition', search_condition)
        if start_aid is not None:
            check_value.check(self.config, str, 'StartAID', start_aid)
        if end_aid is not None:
            check_value.check(self.config, str, 'EndAID', end_aid)

        aid_given = start_aid is not None or end_aid is not None

        if aid_given and (start_index != 0 or end_index != 0):
            raise ValueError(
                log.merge(
                    self.config,
                    ['AID', 'Index', i18n.ErrorParameter, i18n.BothInput]))

        if aid_given and search_condition is not None:
            raise ValueError(
                log.merge(
                    self.config,
                    [
                        'AID',
                        'SearchCondition',
                        i18n.ErrorParameter,
                        i18n.BothInput
                    ]))

        if search_type == data_type.post_search_type.PUSH:
            try:
                push_threshold = int(search_condition)
            except (TypeError, ValueError):
                # TypeError covers search_condition=None, which used to
                # escape as an unhandled TypeError.
                raise ValueError(
                    log.merge(
                        self.config,
                        [
                            'SearchCondition',
                            i18n.ErrorParameter,
                        ]))

            if not (-100 <= push_threshold <= 110):
                raise ValueError(
                    log.merge(
                        self.config,
                        [
                            'SearchCondition',
                            i18n.ErrorParameter,
                        ]))

        if start_index != 0:
            newest_index = self._get_newest_index(
                data_type.index_type.BBS,
                board=board,
                search_type=search_type,
                search_condition=search_condition)

            check_value.check_index_range(
                self.config,
                'start_index', start_index,
                'end_index', end_index,
                max_value=newest_index)
        elif start_aid is not None and end_aid is not None:
            # Translate the AID pair into an index range.
            start_index = self.get_post(
                board, post_aid=start_aid, query=True).index
            end_index = self.get_post(
                board, post_aid=end_aid, query=True).index

            check_value.check_index_range(
                self.config,
                'start_index', start_index,
                'end_index', end_index)
        else:
            raise ValueError(
                log.merge(self.config, [i18n.ErrorParameter, i18n.NoInput]))

        log.show_value(
            self.config, log.level.DEBUG, 'StartIndex', start_index)
        log.show_value(
            self.config, log.level.DEBUG, 'EndIndex', end_index)

        error_post_list = []
        del_post_list = []
        if show_progress:
            progress = progressbar.ProgressBar(
                max_value=end_index - start_index + 1,
                redirect_stdout=True)

        for index in range(start_index, end_index + 1):

            # Two attempts per post; the second failure propagates.
            for attempt in range(2):
                need_continue = False
                post = None
                try:
                    post = self._get_post(
                        board,
                        post_index=index,
                        search_type=search_type,
                        search_condition=search_condition,
                        query=query)
                except (exceptions.ParseError,
                        exceptions.UnknownError,
                        exceptions.NoSuchBoard,
                        exceptions.NoMatchTargetError):
                    if attempt == 1:
                        raise
                    need_continue = True
                except (exceptions.ConnectionClosed,
                        exceptions.UseTooManyResources):
                    if attempt == 1:
                        raise
                    # The connection is gone: log in again before retrying.
                    log.log(
                        self.config, log.level.INFO, i18n.RestoreConnection)
                    self._login(
                        self._ID,
                        self._Password,
                        self.config.kick_other_login)
                    need_continue = True

                if post is None or not post.pass_format_check:
                    need_continue = True

                if need_continue:
                    log.log(
                        self.config,
                        log.level.DEBUG,
                        'Wait for retry repost')
                    time.sleep(0.1)
                    continue

                break

            if show_progress:
                progress.update(index - start_index)
            if post is None:
                error_post_list.append(index)
                continue
            if not post.pass_format_check:
                if post.aid is not None:
                    error_post_list.append(post.aid)
                else:
                    error_post_list.append(index)
                continue
            if post.delete_status != data_type.post_delete_status.NOT_DELETED:
                del_post_list.append(index)
            post_handler(post)
        if show_progress:
            progress.finish()

        return error_post_list, del_post_list

    if self.config.host == data_type.host_type.PTT2:
        raise exceptions.HostNotSupport(lib_util.get_current_func_name())

    # Web version crawler
    # https://www.ptt.cc/bbs/index.html

    # 1. Get the total number of pages.
    newest_index = self._get_newest_index(
        data_type.index_type.WEB, board=board)
    # 2. Check that start_page and end_page lie within 1..newest_index.
    check_value.check_index_range(
        self.config,
        'StartPage', start_page,
        'EndPage', end_page,
        max_value=newest_index)

    # 3. Parse every list entry (deleted ones included) into a
    #    data_type.PostInfo.
    error_post_list = []
    del_post_list = []
    _url = 'https://www.ptt.cc/bbs/'
    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=end_page - start_page + 1,
            redirect_stdout=True)

    def deleted_post(post_title):
        # Classify a list title into a post_delete_status value.
        if not post_title.startswith('('):
            return data_type.post_delete_status.NOT_DELETED
        if '本文' in post_title:
            return data_type.post_delete_status.AUTHOR
        if post_title.startswith('(已被'):
            return data_type.post_delete_status.MODERATOR
        return data_type.post_delete_status.UNKNOWN

    def deleted_post_author(post_title, default):
        # "[id]" is used on "(本文..." titles when present, "<id>"
        # otherwise. Falls back to the scraped author when no marker
        # exists instead of raising IndexError.
        if post_title.startswith('(本文') and '[' in post_title:
            opening, closing = '[', ']'
        else:
            opening, closing = '<', '>'
        if opening not in post_title:
            return default
        return post_title.split(opening)[1].split(closing)[0]

    # Stop at end_page: the range check and the progress bar above are both
    # sized by end_page, not by the newest page.
    for index in range(start_page, end_page + 1):
        log.show_value(self.config, log.level.DEBUG, 'CurrentPage', index)

        url = _url + board + '/index' + str(index) + '.html'
        r = requests.get(url, cookies={'over18': '1'})
        if r.status_code != requests.codes.ok:
            raise exceptions.NoSuchBoard(self.config, board)
        soup = BeautifulSoup(r.text, 'html.parser')

        for div in soup.select('div.r-ent'):
            web = div.select('div.title a')
            author = div.select('div.author')[0].text
            title = div.select('div.title')[0].text.strip('\n').strip()
            # Deleted entries have no link.
            href = web[0].get('href') if web else ''

            if title.startswith('('):
                del_post_list.append(title)
                author = deleted_post_author(title, author)

            # 4. Hand the assembled post to the handler.
            post = data_type.PostInfo(
                board=board,
                author=author,
                title=title,
                web_url='https://www.ptt.cc' + href,
                delete_status=deleted_post(title))
            post_handler(post)

        # 5. Show the progress bar.
        if show_progress:
            progress.update(index - start_page)

    log.show_value(
        self.config, log.level.DEBUG, 'DelPostList', del_post_list)

    if show_progress:
        progress.finish()

    return error_post_list, del_post_list
def get_newest_index(
        api,
        index_type: int,
        board: str = None,
        # BBS
        search_type: int = 0,
        search_condition: str = None) -> int:
    """Return the newest post index of a board (BBS) or page index (web).

    BBS: enters the board through ``api.connect_core.send``, optionally
    applies one search, jumps to the end of the list and reads the newest
    index off the last screen. WEB: downloads the board page from
    https://www.ptt.cc and reads the "上頁" pager link.

    Args:
        api: Object exposing ``config``, ``connect_core`` and
            ``_check_board``.
        index_type: ``data_type.index_type.BBS`` or
            ``data_type.index_type.WEB``.
        board: Board name.
        search_type: ``data_type.post_search_type`` value that goes with
            ``search_condition`` (BBS only).
        search_condition: Optional search text (BBS only).

    Returns:
        BBS: 0 when the first target (``i18n.NoPost``) matches, otherwise
        the largest number on the screen whose predecessors are also on the
        screen. WEB: the page number in the "上頁" link plus one.

    Raises:
        exceptions.NoSuchBoard: ``send`` returned a negative index, or the
            web request did not return an OK status.
        exceptions.UnknownError: No index could be found on the screen, or
            the web pager has no "上頁" link.
        ValueError: ``index_type`` is neither BBS nor WEB.
    """
    if index_type == data_type.index_type.BBS:

        api._check_board(board)

        check_value.check(
            api.config, int, 'SearchType', search_type,
            value_class=data_type.post_search_type)
        if search_condition is not None:
            check_value.check(
                api.config, str, 'SearchCondition', search_condition)
        # NOTE(review): looks redundant with the value_class check above;
        # kept because check_value is not visible from here.
        check_value.check(api.config, int, 'SearchType', search_type)

        cmd_list = [
            command.GoMainMenu,
            'qs',
            board,
            command.Enter,
            command.Ctrl_C * 2,
            command.Space,
        ]

        if search_condition is not None:
            # Compared with ``==`` in order, like the original elif chain;
            # an unknown search type sends no key.
            search_keys = (
                (data_type.post_search_type.KEYWORD, '/'),
                (data_type.post_search_type.AUTHOR, 'a'),
                (data_type.post_search_type.PUSH, 'Z'),
                (data_type.post_search_type.MARK, 'G'),
                (data_type.post_search_type.MONEY, 'A'),
            )
            for known_type, key in search_keys:
                if search_type == known_type:
                    cmd_list.append(key)
                    break

            cmd_list.append(search_condition)
            cmd_list.append(command.Enter)

        # Jump to the first entry, then to the end of the list.
        cmd_list.append('1')
        cmd_list.append(command.Enter)
        cmd_list.append('$')

        cmd = ''.join(cmd_list)

        target_list = [
            connect_core.TargetUnit(
                i18n.NoPost,
                '沒有文章...',
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.Success,
                screens.Target.InBoard,
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.Success,
                screens.Target.InBoardWithCursor,
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.NoSuchBoard,
                screens.Target.MainMenu_Exiting,
                exceptions_=exceptions.NoSuchBoard(api.config, board)),
        ]
        index = api.connect_core.send(cmd, target_list)
        if index < 0:
            raise exceptions.NoSuchBoard(api.config, board)

        if index == 0:
            # First target matched: the board has no posts.
            return 0

        last_screen = api.connect_core.get_screen_queue()[-1]
        all_index = re.findall(r'\d+ ', last_screen)

        if len(all_index) == 0:
            # Dump the screens the same way as the error path below,
            # instead of a bare print().
            screens.show(api.config, api.connect_core.get_screen_queue())
            raise exceptions.UnknownError(i18n.UnknownError)

        # int() ignores the trailing space captured by the pattern.
        all_index = sorted(map(int, all_index), reverse=True)

        # A candidate is accepted when the numbers directly below it (up to
        # max_check_range - 1 of them) also appear on the screen.
        max_check_range = 6
        newest_index = 0
        for candidate in all_index:
            check_range = min(candidate, max_check_range)
            if all(str(candidate - i) in last_screen
                   for i in range(1, check_range)):
                log.show_value(
                    api.config,
                    log.level.DEBUG,
                    i18n.FindNewestIndex,
                    candidate)
                newest_index = candidate
                break

        if newest_index == 0:
            screens.show(api.config, api.connect_core.get_screen_queue())
            raise exceptions.UnknownError(i18n.UnknownError)

    elif index_type == data_type.index_type.WEB:
        # The original condition was ``elif data_type.index_type.WEB:``,
        # which tested the constant itself rather than index_type.
        previous_page = None
        url = 'https://www.ptt.cc/bbs/' + board
        r = requests.get(url, cookies={'over18': '1'})

        if r.status_code != requests.codes.ok:
            raise exceptions.NoSuchBoard(api.config, board)
        soup = BeautifulSoup(r.text, 'html.parser')

        for data in soup.select('div.btn-group.btn-group-paging a'):
            if '上頁' in data.text:
                href = data.get('href')
                previous_page = int(href.split('index')[1].split('.')[0])

        if previous_page is None:
            raise exceptions.UnknownError('')
        newest_index = previous_page + 1

    else:
        raise ValueError(f'Unsupported index_type: {index_type}')

    return newest_index
def get_board_info(
        api,
        board: str,
        call_by_others: bool) -> 'data_type.BoardInfo':
    """Collect the settings of ``board`` into a ``data_type.BoardInfo``.

    Enters the board, reads the online-user count from the third screen
    line, then opens the board-setting screen (key ``i``) and parses each
    setting out of the screen text.

    Args:
        api: API object providing ``config`` and ``connect_core``.
        board: Board name.
        call_by_others: When true the connection targets log at DEBUG
            level, otherwise at INFO level.

    Returns:
        The populated ``data_type.BoardInfo``.

    Raises:
        exceptions.NoSuchBoard: the screen does not look like the requested
            board, or the reported board name differs from ``board``.
    """
    cmd = ''.join([
        command.GoMainMenu,
        'qs',
        board,
        command.Enter,
        command.Ctrl_C * 2,
        command.Space,
    ])

    log_level = log.level.DEBUG if call_by_others else log.level.INFO

    target_list = [
        connect_core.TargetUnit(
            i18n.IntoBoard,
            ['文章選讀', '進板畫面'],
            break_detect=True,
            log_level=log_level),
    ]
    api.connect_core.send(cmd, target_list)

    board_screen = api.connect_core.get_screen_queue()[-1]
    screen_lines = board_screen.split('\n')
    if len(screen_lines) < 3:
        # Too short to contain the header line holding the user count.
        raise exceptions.NoSuchBoard(api.config, board)
    nuser = screen_lines[2]

    if '[靜]' in nuser:
        online_user = 0
    else:
        if '編號' not in nuser or '人氣' not in nuser:
            raise exceptions.NoSuchBoard(api.config, board)
        found = re.search(r'[\d]+', nuser)
        if found is None:
            raise exceptions.NoSuchBoard(api.config, board)
        # Subtract one to leave ourselves out of the count.
        online_user = int(found.group(0)) - 1
    log.show_value(api.config, log.level.DEBUG, '人氣', online_user)

    target_list = [
        connect_core.TargetUnit(
            i18n.ReadingBoardInfo,
            '任意鍵繼續',
            break_detect=True,
            log_level=log_level),
    ]
    api.connect_core.send('i', target_list)

    info_screen = api.connect_core.get_screen_queue()[-1]

    def flag(label, phrase):
        # True when the given phrase is present on the setting screen.
        value = phrase in info_screen
        log.show_value(api.config, log.level.DEBUG, label, value)
        return value

    def limit(label, pattern):
        # Integer captured by pattern on the setting screen, 0 if absent.
        limit_match = re.search(pattern, info_screen)
        value = int(limit_match.group(1)) if limit_match is not None else 0
        log.show_value(api.config, log.level.DEBUG, label, value)
        return value

    found = re.search(r'《(.+)》看板設定', info_screen)
    if found is None:
        # Without the title the board name cannot be confirmed.
        raise exceptions.NoSuchBoard(api.config, board)
    boardname = found.group(1).strip()
    log.show_value(api.config, log.level.DEBUG, '看板名稱', boardname)

    if boardname != board:
        raise exceptions.NoSuchBoard(api.config, board)

    chinese_des = None
    found = re.search(r'中文敘述: (.+)', info_screen)
    if found is not None:
        chinese_des = found.group(1).strip()
    log.show_value(api.config, log.level.DEBUG, '中文敘述', chinese_des)

    moderators = []
    found = re.search(r'板主名單: (.+)', info_screen)
    if found is not None:
        moderators = found.group(1).strip().split('/')
    log.show_value(api.config, log.level.DEBUG, '板主名單', moderators)

    open_status = flag('公開狀態', '公開狀態(是否隱形): 公開')
    into_top_ten_when_hide = flag('隱板時可以進入十大排行榜', '隱板時 可以 進入十大排行榜')
    non_board_members_post = flag('非看板會員發文', '開放 非看板會員發文')
    reply_post = flag('回應文章', '開放 回應文章')
    self_del_post = flag('自刪文章', '開放 自刪文章')
    push_post = flag('推薦文章', '開放 推薦文章')
    boo_post = flag('噓文', '開放 噓文')

    # The screen shows either "開放 快速連推文章" or
    # "限制 快速連推文章, 最低間隔時間: 5 秒".
    fast_push = flag('快速連推文章', '開放 快速連推文章')
    if not fast_push:
        min_interval = limit('最低間隔時間', r'最低間隔時間: ([\d]+)')
    else:
        min_interval = 0

    # "推文時 自動 記錄來源 IP" versus "推文時 不會 記錄來源 IP".
    push_record_ip = flag('記錄來源 IP', '推文時 自動 記錄來源 IP')
    # "推文時 對齊 開頭" versus "推文時 不用對齊 開頭".
    push_aligned = flag('對齊開頭', '推文時 對齊 開頭')
    moderator_can_del_illegal_content = flag(
        '板主可刪除部份違規文字', '板主 可 刪除部份違規文字')
    tran_post_auto_recorded_and_require_post_permissions = flag(
        '轉錄文章 會 自動記錄,且 需要 發文權限',
        '轉錄文章 會 自動記錄,且 需要 發文權限')

    # Cool mode is on unless the screen says it is not set.
    cool_mode = '未 設為冷靜模式' not in info_screen
    log.show_value(api.config, log.level.DEBUG, '冷靜模式', cool_mode)

    require18 = flag('禁止未滿十八歲進入', '禁止 未滿十八歲進入')

    require_login_time = limit('發文限制登入次數', r'登入次數 ([\d]+) 次以上')
    require_illegal_post = limit('發文限制退文篇數', r'退文篇數 ([\d]+) 篇以下')

    board_info = data_type.BoardInfo(
        boardname,
        online_user,
        chinese_des,
        moderators,
        open_status,
        into_top_ten_when_hide,
        non_board_members_post,
        reply_post,
        self_del_post,
        push_post,
        boo_post,
        fast_push,
        min_interval,
        push_record_ip,
        push_aligned,
        moderator_can_del_illegal_content,
        tran_post_auto_recorded_and_require_post_permissions,
        cool_mode,
        require18,
        require_login_time,
        require_illegal_post,
    )
    return board_info
def get_newest_index(
        api,
        index_type: int,
        search_type: int = 0,
        search_condition: str = None,
        search_list: list = None,
        # BBS only
        board: str = None) -> int:
    """Return the newest index of a board (BBS) or of the mailbox (MAIL).

    Args:
        api: API object providing ``config``, ``connect_core``, ``cursor``,
            ``_check_board`` and ``_goto_board``.
        index_type: ``data_type.index_type.BBS`` or
            ``data_type.index_type.MAIL``.
        search_type: Search kind forwarded to
            ``_api_util.get_search_condition_cmd``.
        search_condition: Search text forwarded to the same helper.
        search_list: Additional searches forwarded to the same helper.
        board: Board name (BBS only).

    Returns:
        The newest index.  0 is returned for BBS when the first target
        ('沒有文章...') matches, and for MAIL when the mailbox target never
        matches in three attempts.

    Raises:
        exceptions.NoSuchBoard: the board could not be entered.
        exceptions.NoSearchResult: the index seen through the search equals
            the unfiltered newest index reported by the search helper.
        ValueError: ``index_type`` is neither BBS nor MAIL.
    """
    if index_type == data_type.index_type.BBS:
        check_value.check(api.config, str, 'Board', board)

        api._check_board(board)
        api._goto_board(board)

        cmd_list, normal_newest_index = _api_util.get_search_condition_cmd(
            api,
            index_type,
            search_type,
            search_condition,
            search_list,
            board)

        # Jump to entry 1 and then to the last entry ('$').
        cmd_list.append('1')
        cmd_list.append(command.Enter)
        cmd_list.append('$')
        cmd = ''.join(cmd_list)

        target_list = [
            connect_core.TargetUnit(
                i18n.NoPost,
                '沒有文章...',
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.Success,
                screens.Target.InBoard,
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.Success,
                screens.Target.InBoardWithCursor,
                break_detect=True,
                log_level=log.level.DEBUG),
            connect_core.TargetUnit(
                i18n.NoSuchBoard,
                screens.Target.MainMenu_Exiting,
                exceptions_=exceptions.NoSuchBoard(api.config, board)),
        ]
        index = api.connect_core.send(cmd, target_list)
        if index < 0:
            raise exceptions.NoSuchBoard(api.config, board)

        if index == 0:
            return 0

        newest_index = _get_newest_index(api)

        if normal_newest_index == newest_index:
            raise exceptions.NoSearchResult()

    elif index_type == data_type.index_type.MAIL:
        cmd_list = [command.GoMainMenu, command.Ctrl_Z, 'm']

        search_cmd_list, normal_newest_index = \
            _api_util.get_search_condition_cmd(
                api,
                index_type,
                search_type,
                search_condition,
                search_list,
                board)
        cmd_list.extend(search_cmd_list)
        cmd_list.append(command.Ctrl_F * 50)
        cmd = ''.join(cmd_list)

        target_list = [
            connect_core.TargetUnit(
                i18n.MailBox,
                screens.Target.InMailBox,
                break_detect=True),
            connect_core.TargetUnit(
                i18n.NoMail,
                screens.Target.CursorToGoodbye,
                break_detect=True,
                log_level=log.level.DEBUG),
        ]

        def read_mail_index():
            # Read the index on the cursor line of the current screen.
            current_capacity, _ = _api_util.get_mailbox_capacity(api)
            last_screen = api.connect_core.get_screen_queue()[-1]
            cursor_line = [
                x for x in last_screen.split('\n')
                if x.strip().startswith(api.cursor)
            ][0]
            list_index = int(re.search(r'(\d+)', cursor_line).group(0))

            if search_type == 0 and search_list is None:
                # Without a search, the larger of the listed index and the
                # reported mailbox capacity is used.
                return max(list_index, current_capacity)
            return list_index

        max_attempts = 3
        for attempt in range(max_attempts):
            index = api.connect_core.send(cmd, target_list)

            if index == 0:
                newest_index = read_mail_index()
                if normal_newest_index == newest_index:
                    if attempt == max_attempts - 1:
                        raise exceptions.NoSearchResult()
                    continue
                break
            newest_index = 0

    else:
        raise ValueError('unsupported index_type: {!r}'.format(index_type))

    return newest_index
def get_board_info(
        api,
        board: str,
        get_post_kind: bool,
        get_board_limit: bool,
        call_by_others: bool) -> 'data_type.BoardInfo':
    """Collect the settings of ``board`` into a ``data_type.BoardInfo``.

    Enters the board, reads the online-user count from the list header,
    opens the board-setting screen (key ``i``) and parses each setting.
    Optionally also reads the posting limits (key ``I``) and the available
    post kinds (by starting a post with Ctrl-P and cancelling it).

    Args:
        api: API object providing ``config``, ``connect_core`` and
            ``_goto_board``.
        board: Board name (compared case-insensitively).
        get_post_kind: When true, collect the list of post kinds.
        get_board_limit: When true, collect login / rejected-post limits.
        call_by_others: When true the setting-screen target logs at DEBUG
            level, otherwise at INFO level.

    Returns:
        The populated ``data_type.BoardInfo``.

    Raises:
        exceptions.NoSuchBoard: the screen does not look like the requested
            board, or the reported board name differs from ``board``.
        exceptions.NoPermission: posting is not permitted, so the post
            kinds cannot be read.
    """
    log_level = log.level.DEBUG if call_by_others else log.level.INFO

    api._goto_board(board, refresh=True)

    board_screen = api.connect_core.get_screen_queue()[-1]

    # The list header is the line carrying all three column captions.
    nuser = None
    for line in board_screen.split('\n'):
        if '編號' in line and '日 期' in line and '人氣' in line:
            nuser = line
            break
    if nuser is None:
        raise exceptions.NoSuchBoard(api.config, board)

    if '[靜]' in nuser:
        online_user = 0
    else:
        if '編號' not in nuser or '人氣' not in nuser:
            raise exceptions.NoSuchBoard(api.config, board)
        found = re.search(r'[\d]+', nuser)
        if found is None:
            raise exceptions.NoSuchBoard(api.config, board)
        # Subtract one to leave ourselves out of the count.
        online_user = int(found.group(0)) - 1
    log.show_value(api.config, log.level.DEBUG, '人氣', online_user)

    target_list = [
        connect_core.TargetUnit(
            i18n.ReadingBoardInfo,
            '任意鍵繼續',
            break_detect=True,
            log_level=log_level),
    ]
    api.connect_core.send('i', target_list)

    info_screen = api.connect_core.get_screen_queue()[-1]

    def flag(label, phrase):
        # True when the given phrase is present on the setting screen.
        value = phrase in info_screen
        log.show_value(api.config, log.level.DEBUG, label, value)
        return value

    def limit(label, pattern):
        # Integer captured by pattern on the setting screen, 0 if absent.
        limit_match = re.search(pattern, info_screen)
        value = int(limit_match.group(1)) if limit_match is not None else 0
        log.show_value(api.config, log.level.DEBUG, label, value)
        return value

    found = re.search(r'《(.+)》看板設定', info_screen)
    if found is None:
        # Without the title the board name cannot be confirmed.
        raise exceptions.NoSuchBoard(api.config, board)
    boardname = found.group(1).strip()
    log.show_value(api.config, log.level.DEBUG, '看板名稱', boardname)

    if boardname.lower() != board.lower():
        raise exceptions.NoSuchBoard(api.config, board)

    chinese_des = None
    found = re.search(r'中文敘述: (.+)', info_screen)
    if found is not None:
        chinese_des = found.group(1).strip()
    log.show_value(api.config, log.level.DEBUG, '中文敘述', chinese_des)

    moderators = []
    found = re.search(r'板主名單: (.+)', info_screen)
    if found is not None:
        moderator_line = found.group(1).strip()
        if '(無)' not in moderator_line:
            # Keep only names whose every character is a single byte in
            # Big5; entries with multi-byte characters are dropped.
            moderators = [
                name for name in moderator_line.split('/')
                if all(len(c.encode('big5')) <= 1 for c in name)
            ]
    log.show_value(api.config, log.level.DEBUG, '板主名單', moderators)

    open_status = flag('公開狀態', '公開狀態(是否隱形): 公開')
    into_top_ten_when_hide = flag('隱板時可以進入十大排行榜', '隱板時 可以 進入十大排行榜')
    non_board_members_post = flag('非看板會員發文', '開放 非看板會員發文')
    reply_post = flag('回應文章', '開放 回應文章')
    self_del_post = flag('自刪文章', '開放 自刪文章')
    push_post = flag('推薦文章', '開放 推薦文章')
    boo_post = flag('噓文', '開放 噓文')

    # The screen shows either "開放 快速連推文章" or
    # "限制 快速連推文章, 最低間隔時間: 5 秒".
    fast_push = flag('快速連推文章', '開放 快速連推文章')
    if not fast_push:
        min_interval = limit('最低間隔時間', r'最低間隔時間: ([\d]+)')
    else:
        min_interval = 0

    # "推文時 自動 記錄來源 IP" versus "推文時 不會 記錄來源 IP".
    push_record_ip = flag('記錄來源 IP', '推文時 自動 記錄來源 IP')
    # "推文時 對齊 開頭" versus "推文時 不用對齊 開頭".
    push_aligned = flag('對齊開頭', '推文時 對齊 開頭')
    moderator_can_del_illegal_content = flag(
        '板主可刪除部份違規文字', '板主 可 刪除部份違規文字')
    tran_post_auto_recorded_and_require_post_permissions = flag(
        '轉錄文章 會 自動記錄,且 需要 發文權限',
        '轉錄文章 會 自動記錄,且 需要 發文權限')

    # Cool mode is on unless the screen says it is not set.
    cool_mode = '未 設為冷靜模式' not in info_screen
    log.show_value(api.config, log.level.DEBUG, '冷靜模式', cool_mode)

    require18 = flag('禁止未滿十八歲進入', '禁止 未滿十八歲進入')

    require_login_time = limit('發文限制登入次數', r'登入次數 ([\d]+) 次以上')
    require_illegal_post = limit('發文限制退文篇數', r'退文篇數 ([\d]+) 篇以下')

    kind_list = None
    login_limit = None
    retirement_limit = None

    # Target used to confirm we are back on the board list after Ctrl-C.
    def leave_to_board():
        back_targets = [
            connect_core.TargetUnit(
                i18n.Done,
                screens.Target.InBoard,
                break_detect=True),
        ]
        api.connect_core.send(
            ''.join([command.Ctrl_C, command.Ctrl_C]), back_targets)

    if get_board_limit:
        api._goto_board(board)

        # 'I' shows the posting limits; when the screen says there is no
        # special limit both values are zero.
        target_list = [
            connect_core.TargetUnit(
                i18n.Done,
                '無特別限制',
                break_detect=True),
        ]
        index = api.connect_core.send('I', target_list)

        if index != 0:
            limit_screen = api.connect_core.get_screen_queue()[-1]
            for screen_line in limit_screen.split('\n'):
                if '登入次數' in screen_line and '次以上' in screen_line:
                    found = re.search(r'登入次數 (\d+) 次以上', screen_line)
                    login_limit = (
                        int(found.group(1)) if found is not None else 0)
                if '退文篇數' in screen_line and '篇以下' in screen_line:
                    found = re.search(r'退文篇數 (\d+) 篇以下', screen_line)
                    retirement_limit = (
                        int(found.group(1)) if found is not None else 0)
                if login_limit is not None and retirement_limit is not None:
                    break

            leave_to_board()
        else:
            login_limit = 0
            retirement_limit = 0

    if get_post_kind:
        api._goto_board(board)

        # Start a post (Ctrl-P) to make the BBS list the post kinds.
        target_list = [
            connect_core.TargetUnit(
                i18n.NoPermission,
                '無法發文: 未達看板要求權限',
                break_detect=True),
            connect_core.TargetUnit(
                i18n.Done,
                '或不選)',
                break_detect=True),
        ]
        index = api.connect_core.send(command.Ctrl_P, target_list)
        if index == 0:
            # No permission to post on this board.
            raise exceptions.NoPermission(i18n.NoPermission)

        kind_screen = api.connect_core.get_screen_queue()[-1]
        for screen_line in kind_screen.split('\n'):
            if '種類:' in screen_line:
                # Each kind is listed as "<digit>.<name>".
                kind_list = re.findall(r'\d\.([^\ ]*)', screen_line)
                break

        leave_to_board()

    board_info = data_type.BoardInfo(
        boardname,
        online_user,
        chinese_des,
        moderators,
        open_status,
        into_top_ten_when_hide,
        non_board_members_post,
        reply_post,
        self_del_post,
        push_post,
        boo_post,
        fast_push,
        min_interval,
        push_record_ip,
        push_aligned,
        moderator_can_del_illegal_content,
        tran_post_auto_recorded_and_require_post_permissions,
        cool_mode,
        require18,
        require_login_time,
        require_illegal_post,
        kind_list,
        login_limit,
        retirement_limit)
    return board_info
def get_post_index(
        api,
        board: str,
        aid: str) -> int:
    """Return the list index of the post identified by ``aid`` on ``board``.

    The board is entered, the article code is looked up with ``#<aid>`` and
    the index is read from the screen line holding the cursor.

    Args:
        api: API object providing ``config``, ``connect_core``, ``cursor``
            and ``_goto_board``.
        board: Board name.
        aid: Article code of the post.

    Returns:
        The post index, or 0 when the leading token read from the screen
        is '★'.

    Raises:
        exceptions.NoSuchBoard: the connection reported a negative result.
        exceptions.NoSuchPost, exceptions.CanNotUseSearchPostCode: attached
            to the corresponding targets handed to the connection.
    """
    api._goto_board(board)

    cmd = ''.join(['#', aid, command.Enter])

    no_such_post = i18n.replace(i18n.NoSuchPost, board, aid)

    target_list = [
        connect_core.TargetUnit(
            no_such_post,
            '找不到這個文章代碼',
            log_level=log.level.DEBUG,
            exceptions_=exceptions.NoSuchPost(board, aid)
        ),
        # Screen text shown when article-code search is unavailable.
        connect_core.TargetUnit(
            i18n.CanNotUseSearchPostCodeF,
            '此狀態下無法使用搜尋文章代碼(AID)功能',
            exceptions_=exceptions.CanNotUseSearchPostCode()
        ),
        connect_core.TargetUnit(
            i18n.NoPost,
            '沒有文章...',
            exceptions_=exceptions.NoSuchPost(board, aid)
        ),
        connect_core.TargetUnit(
            i18n.Success,
            screens.Target.InBoard,
            break_detect=True,
            log_level=log.level.DEBUG
        ),
        connect_core.TargetUnit(
            i18n.Success,
            screens.Target.InBoardWithCursor,
            break_detect=True,
            log_level=log.level.DEBUG
        ),
        connect_core.TargetUnit(
            i18n.NoSuchBoard,
            screens.Target.MainMenu_Exiting,
            exceptions_=exceptions.NoSuchBoard(api.config, board)
        )
    ]

    index = api.connect_core.send(cmd, target_list)
    ori_screen = api.connect_core.get_screen_queue()[-1]
    if index < 0:
        raise exceptions.NoSuchBoard(api.config, board)

    screen_list = ori_screen.split('\n')
    cursor_line = [x for x in screen_list if x.startswith(api.cursor)][0]
    previous_line = screen_list[screen_list.index(cursor_line) - 1]

    if '編號' in previous_line and '人氣:' in previous_line:
        # The cursor sits right under the list header: read the index from
        # the cursor line itself (minus its first character).
        index_text = cursor_line[1:].strip()
        index_fix = False
    else:
        # Otherwise read the line above the cursor and add one afterwards.
        index_text = previous_line.strip()
        index_fix = True

    # Take the first whitespace-separated token.  Splitting on runs of
    # whitespace replaces a replace-loop that could never terminate.
    tokens = index_text.split()
    first_token = tokens[0] if tokens else ''

    if first_token == '★':
        return 0

    post_index = int(first_token)
    if index_fix:
        post_index += 1
    return post_index
def get_bottom_post_list(api, board):
    """Return the posts marked with '★' at the end of ``board``.

    The board is opened at its end, every screen line with '★' among its
    first eight characters is counted, and that many posts are queried one
    after another, moving the cursor up between queries.  The collected
    posts are returned in reverse order of collection.

    Args:
        api: API object providing ``config``, ``connect_core``,
            ``_goto_board`` and ``get_post``.
        board: Board name.

    Returns:
        A list of the objects returned by ``api.get_post`` (empty when no
        '★' line is found).
    """
    api._goto_board(board, end=True)

    end_screen = api.connect_core.get_screen_queue()[-1]
    bottom_length = len([
        row for row in end_screen.split('\n') if '★' in row[:8]
    ])

    if bottom_length == 0:
        log.log(api.config, log.level.INFO, i18n.CatchBottomPostSuccess)
        return list()

    # The first query is issued in place; every later one first leaves the
    # query view (Enter) and moves one entry up.
    cmd = ''.join([command.QueryPost])

    target_list = [
        connect_core.TargetUnit(
            [i18n.CatchPost, i18n.Success],
            screens.Target.QueryPost,
            break_detect=True,
            refresh=False,
            log_level=log.level.DEBUG),
        connect_core.TargetUnit(
            [i18n.PostDeleted, i18n.Success],
            screens.Target.InBoard,
            break_detect=True,
            log_level=log.level.DEBUG),
        connect_core.TargetUnit(
            i18n.NoSuchBoard,
            screens.Target.MainMenu_Exiting,
            exceptions_=exceptions.NoSuchBoard(api.config, board)),
    ]

    collected = []
    remaining = bottom_length
    while remaining > 0:
        remaining -= 1

        api.connect_core.send(cmd, target_list)
        query_screen = api.connect_core.get_screen_queue()[-1]

        # Only the article code is needed from the parsed query screen.
        (_lock, _author, _title, post_aid, _web, _money,
         _list_date, _push_number, _index) = _api_util.parse_query_post(
            api, query_screen)

        collected.append(api.get_post(board, post_aid=post_aid, query=True))

        cmd = ''.join([command.Enter, command.Up, command.QueryPost])

    log.log(api.config, log.level.INFO, i18n.CatchBottomPostSuccess)

    return collected[::-1]
def get_post(api,
             board: str,
             post_aid: str = None,
             post_index: int = 0,
             search_type: int = 0,
             search_condition: str = None,
             search_list: list = None,
             query: bool = False) -> data_type.PostInfo:
    """Fetch one post from ``board`` and parse it into a ``PostInfo``.

    The post is addressed either by article code (``post_aid``) or by
    ``post_index`` (optionally through one or more searches).  The query
    screen supplies the list metadata; unless ``query`` is true or the post
    is locked, the full content is then read and parsed into author, title,
    date, body, IP/location and push list.  Whenever a part does not match
    the expected layout the data gathered so far is returned with
    ``format_check=False``.

    Args:
        api: API object providing ``config``, ``connect_core``, ``cursor``,
            ``Unconfirmed`` and ``_goto_board``.
        board: Board name.
        post_aid: Article code; takes precedence over ``post_index``.
        post_index: Index of the post; 0 means "not given".
        search_type: ``data_type.post_search_type`` value used with
            ``search_condition``.
        search_condition: Search text applied before jumping to the index.
        search_list: Iterable of ``(search_type, search_condition)`` pairs
            applied after ``search_condition``.
        query: When true, return right after the query screen is parsed.

    Returns:
        A ``data_type.PostInfo``.

    Raises:
        exceptions.UnknownError: a deleted post's screen does not contain
            exactly one cursor line.
        exceptions.NoSuchBoard: attached to the main-menu target handed to
            the connection.
    """
    api._goto_board(board)

    cmd_list = list()

    def append_search(kind, condition):
        # Append the key opening this kind of search, the text, and Enter.
        search_keys = (
            (data_type.post_search_type.KEYWORD, '/'),
            (data_type.post_search_type.AUTHOR, 'a'),
            (data_type.post_search_type.PUSH, 'Z'),
            (data_type.post_search_type.MARK, 'G'),
            (data_type.post_search_type.MONEY, 'A'),
        )
        for candidate, key in search_keys:
            if kind == candidate:
                cmd_list.append(key)
                break
        cmd_list.append(condition)
        cmd_list.append(command.Enter)

    if post_aid is not None:
        cmd_list.append('#' + post_aid)
    elif post_index != 0:
        if search_condition is not None:
            append_search(search_type, search_condition)

        if search_list is not None:
            for kind, condition in search_list:
                append_search(kind, condition)

        # Jump close to the target first, then to the target itself.
        cmd_list.append(str(max(1, post_index - 100)))
        cmd_list.append(command.Enter)
        cmd_list.append(str(post_index))

    cmd_list.append(command.Enter)
    cmd_list.append(command.QueryPost)
    cmd = ''.join(cmd_list)

    target_list = [
        connect_core.TargetUnit(
            [i18n.CatchPost, i18n.Success],
            screens.Target.QueryPost,
            break_detect=True,
            refresh=False,
            log_level=log.level.DEBUG),
        connect_core.TargetUnit(
            [i18n.PostDeleted, i18n.Success],
            screens.Target.InBoard,
            break_detect=True,
            log_level=log.level.DEBUG),
        connect_core.TargetUnit(
            i18n.NoSuchBoard,
            screens.Target.MainMenu_Exiting,
            exceptions_=exceptions.NoSuchBoard(api.config, board)),
    ]

    index = api.connect_core.send(cmd, target_list)
    ori_screen = api.connect_core.get_screen_queue()[-1]

    post_author = None
    post_title = None
    if index < 0 or index == 1:
        # The post was deleted: only the list line is available.
        log.log(api.config, log.level.DEBUG, i18n.PostDeleted)
        log.show_value(api.config, log.level.DEBUG, 'OriScreen', ori_screen)

        cursor_lines = [
            line for line in ori_screen.split('\n')
            if line.startswith(api.cursor)
        ]
        if len(cursor_lines) != 1:
            raise exceptions.UnknownError(ori_screen)

        cursor_line = cursor_lines[0]
        log.show_value(api.config, log.level.DEBUG, 'CursorLine',
                       cursor_line)

        found = re.search(r'[\d]+\/[\d]+', cursor_line)
        list_date = None if found is None else found.group(0)[-5:]

        # Example list lines:
        #   > 79843 9/11 - □ (本文已被吃掉)<
        #   > 76060 8/28 - □ (本文已被刪除) [weida7332]
        # "[name]" marks deletion by the author, "<name>" by a moderator.
        found = re.search(r'\[[\w]+\]', cursor_line)
        if found is not None:
            post_del_status = data_type.post_delete_status.AUTHOR
        else:
            found = re.search(r'<[\w]+>', cursor_line)
            post_del_status = data_type.post_delete_status.MODERATOR

        if found is not None:
            post_author = found.group(0)[1:-1]
        else:
            post_author = None
            post_del_status = data_type.post_delete_status.UNKNOWN

        log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date)
        log.show_value(api.config, log.level.DEBUG, 'PostAuthor',
                       post_author)
        log.show_value(api.config, log.level.DEBUG, 'post_del_status',
                       post_del_status)

        return data_type.PostInfo(board=board,
                                  author=post_author,
                                  list_date=list_date,
                                  delete_status=post_del_status,
                                  format_check=True)

    elif index == 0:
        (lock_post, post_author, post_title, post_aid, post_web,
         post_money, list_date, push_number, post_index) = \
            _api_util.parse_query_post(api, ori_screen)

        if lock_post:
            return data_type.PostInfo(board=board,
                                      aid=post_aid,
                                      index=post_index,
                                      author=post_author,
                                      title=post_title,
                                      web_url=post_web,
                                      money=post_money,
                                      list_date=list_date,
                                      format_check=True,
                                      push_number=push_number,
                                      lock=True)

    if query:
        return data_type.PostInfo(board=board,
                                  aid=post_aid,
                                  index=post_index,
                                  author=post_author,
                                  title=post_title,
                                  web_url=post_web,
                                  money=post_money,
                                  list_date=list_date,
                                  format_check=True,
                                  push_number=push_number)

    origin_post, has_control_code = _api_util.get_content(api)

    if origin_post is None:
        return data_type.PostInfo(board=board,
                                  aid=post_aid,
                                  index=post_index,
                                  author=post_author,
                                  title=post_title,
                                  web_url=post_web,
                                  money=post_money,
                                  list_date=list_date,
                                  control_code=has_control_code,
                                  format_check=False,
                                  push_number=push_number,
                                  unconfirmed=api.Unconfirmed)

    content_start = '───────────────────────────────────────'
    content_end = [
        '--\n※ 發信站: 批踢踢實業坊',
        '--\n※ 發信站: 批踢踢兔(ptt2.cc)',
        '--\n※ 發信站: 新批踢踢(ptt2.twbbs.org.tw)',
    ]

    post_author_pattern_new = re.compile(r'作者 (.+) 看板')
    post_author_pattern_old = re.compile(r'作者 (.+)')
    board_pattern = re.compile(r'看板 (.+)')

    post_date = None
    post_content = list()
    ip = None
    location = None
    push_list = list()

    def build_post(format_check):
        # Snapshot of everything parsed so far; reads the enclosing
        # function's variables at the moment of the call.
        return data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            date=post_date,
            title=post_title,
            web_url=post_web,
            money=post_money,
            content=post_content,
            ip=ip,
            push_list=push_list,
            list_date=list_date,
            control_code=has_control_code,
            format_check=format_check,
            location=location,
            push_number=push_number,
            origin_post=origin_post,
            unconfirmed=api.Unconfirmed,
        )

    def substandard(part):
        # Log which part failed validation and return the partial post.
        log.show_value(api.config, log.level.DEBUG,
                       i18n.SubstandardPost, part)
        return build_post(False)

    # Format validation: posts edited out of the standard layout cannot be
    # parsed and are returned with format_check=False.
    origin_post_lines = origin_post.split('\n')

    author_line = origin_post_lines[0]

    if board.lower() == 'allpost':
        # On ALLPOST the real board name follows the author on line one.
        board_line = author_line[author_line.find(')') + 1:]
        found = board_pattern.search(board_line)
        if found is not None:
            board_temp = found.group(0)[2:].strip()
            if len(board_temp) > 0:
                board = board_temp
                log.show_value(api.config, log.level.DEBUG, i18n.Board,
                               board)

    author_match = post_author_pattern_new.search(author_line)
    if author_match is None:
        author_match = post_author_pattern_old.search(author_line)
        if author_match is None:
            return substandard(i18n.Author)
    post_author = author_match.group(0)
    post_author = post_author[:post_author.rfind(')') + 1]
    post_author = post_author[4:].strip()
    log.show_value(api.config, log.level.DEBUG, i18n.Author, post_author)

    # A post with fewer than three lines has no title / date line; treat a
    # missing line as empty so it is reported as substandard.
    title_line = origin_post_lines[1] if len(origin_post_lines) > 1 else ''
    found = re.search(r'標題 (.+)', title_line)
    if found is None:
        return substandard(i18n.Title)
    post_title = found.group(0)[4:].strip()
    log.show_value(api.config, log.level.DEBUG, i18n.Title, post_title)

    date_line = origin_post_lines[2] if len(origin_post_lines) > 2 else ''
    found = re.search(r'時間 .{24}', date_line)
    if found is None:
        return substandard(i18n.Date)
    post_date = found.group(0)[4:].strip()
    log.show_value(api.config, log.level.DEBUG, i18n.Date, post_date)

    content_fail = True
    if content_start in origin_post:
        post_content = origin_post[
            origin_post.find(content_start) + len(content_start) + 1:]
        for end_marker in content_end:
            if end_marker in post_content:
                content_fail = False
                # The slice ends three characters into the marker, i.e.
                # just past its leading '--\n'.
                post_content = post_content[
                    :post_content.rfind(end_marker) + 3]
                # From here on only the part after the body is scanned.
                origin_post_lines = origin_post[
                    origin_post.find(end_marker):].split('\n')
                break

    if content_fail:
        return substandard(i18n.Content)
    log.show_value(api.config, log.level.DEBUG, i18n.Content, post_content)

    info_lines = [
        line for line in origin_post_lines
        if line.startswith('※') or line.startswith('◆')
    ]

    ip_pattern = re.compile(r'[\d]+\.[\d]+\.[\d]+\.[\d]+')
    ip_pattern_dashed = re.compile(r'[\d]+-[\d]+-[\d]+-[\d]+')
    for line in reversed(info_lines):
        log.show_value(api.config, log.level.DEBUG, 'IP Line', line)

        # Known layouts:
        # type 1
        #   ※ 編輯: CodingMan (111.243.146.98 臺灣)
        #   ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 111.243.146.98 (臺灣)
        # type 2
        #   ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 116.241.32.178
        #   ※ 編輯: kill77845 (114.136.55.237), 12/08/2018 16:47:59
        # type 3
        #   ※ 發信站: 批踢踢實業坊(ptt.cc)
        #   ◆ From: 211.20.78.69
        #   ※ 編輯: JCC 來自: 211.20.78.69 (06/20 10:22)
        #   ※ 編輯: JCC (118.163.28.150), 12/03/2015 14:25:35
        found = ip_pattern.search(line)
        if found is not None:
            ip = found.group(0)
            location_temp = line[line.find(ip) + len(ip):].strip()
            location_temp = location_temp.replace('(', '')
            location_temp = location_temp[:location_temp.rfind(')')]
            location_temp = location_temp.strip()
            if ' ' not in location_temp and len(location_temp) > 0:
                location = location_temp
                log.show_value(api.config, log.level.DEBUG, 'Location',
                               location)
            break

        found = ip_pattern_dashed.search(line)
        if found is not None:
            ip = found.group(0).replace('-', '.')
            break

    if api.config.host == data_type.host_type.PTT1 and ip is None:
        return substandard('IP')
    log.show_value(api.config, log.level.DEBUG, 'IP', ip)

    push_author_pattern = re.compile(r'[推|噓|→] [\w| ]+:')
    push_date_pattern = re.compile(r'[\d]+/[\d]+ [\d]+:[\d]+')
    push_ip_pattern = re.compile(r'[\d]+\.[\d]+\.[\d]+\.[\d]+')

    push_list = list()

    for line in origin_post_lines:
        if line.startswith('推'):
            push_type = data_type.push_type.PUSH
        elif line.startswith('噓 '):
            push_type = data_type.push_type.BOO
        elif line.startswith('→ '):
            push_type = data_type.push_type.ARROW
        else:
            continue

        found = push_author_pattern.search(line)
        if found is None:
            # Not in push format.
            continue
        push_author = found.group(0)[2:-1].strip()
        log.show_value(api.config, log.level.DEBUG,
                       [i18n.Push, i18n.ID], push_author)

        found = push_date_pattern.search(line)
        if found is None:
            continue
        push_date = found.group(0)
        log.show_value(api.config, log.level.DEBUG,
                       [i18n.Push, i18n.Date], push_date)

        push_ip = None
        found = push_ip_pattern.search(line)
        if found is not None:
            push_ip = found.group(0)
            log.show_value(api.config, log.level.DEBUG,
                           [i18n.Push, 'IP'], push_ip)

        push_content = line[line.find(push_author) + len(push_author):]
        if api.config.host == data_type.host_type.PTT1:
            push_content = push_content[:push_content.rfind(push_date)]
        else:
            # Other hosts put two more characters before the date, e.g.
            #   → CodingMan:What is Ptt? 推 10/04 13:25
            push_content = push_content[
                :push_content.rfind(push_date) - 2]
        if push_ip is not None:
            push_content = push_content.replace(push_ip, '')
        push_content = push_content[push_content.find(':') + 1:].strip()
        log.show_value(api.config, log.level.DEBUG,
                       [i18n.Push, i18n.Content], push_content)

        push_list.append(
            data_type.PushInfo(push_type, push_author, push_content,
                               push_ip, push_date))

    return build_post(True)