def get_post(self, post_id, group_id=None): if not self.login_check: logging.error('You should login first') return url = 'https://m.facebook.com/story.php?' + \ 'story_fbid=%s&id=1'%str(post_id) if group_id: url = 'https://m.facebook.com/groups/%s?'%str(group_id) + \ 'view=permalink&id='%str(post_id) req = self.session.get(url) soup = BeautifulSoup(req.text, 'lxml') post_content = soup.find('div', class_='z') author = post_content.find('h3').text content_lines = post_content.find('div', { 'data-ft': '{"tn":"*s"}' }).findAll('p') content = '' for content_line in content_lines: content += content_line.text content += '\n' time = post_content.find('footer').find('abbr').text post_info = data_type.PostInfo(post_id, author, content, time) if not post_content: logging.error( 'This post is not supported or you don\'t have acess authority' ) return post_info
def get_post(self, post_id, group_id=None): if not self.login_check: logging.error('You should login first') return url = 'https://mbasic.facebook.com/story.php?' + \ 'story_fbid=%s&id=1'%str(post_id) if group_id: url = 'https://mbasic.facebook.com/groups/%s?'%str(group_id) + \ 'view=permalink&id=%s'%str(post_id) req = self.session.get(url) soup = BeautifulSoup(req.text, 'lxml') post_content = soup.find('div', id='m_story_permalink_view') author = post_content.find('h3', recursive=True).text content = str(post_content.find('div', {'data-ft': '{"tn":"*s"}'})) content = content.replace('<br/> ', '\n') content = content.replace('<br/>', '\n') content = re.sub('<[^>]+> ', '', content) content = re.sub('<[^>]+>', '', content) time = post_content.find('footer').find('abbr').text post_image = post_content.find('div', {'data-ft': '{"tn":"H"}'}) images = [] link = None if post_image: for img_src in post_image.find_all('img', class_='s'): src = img_src.get('src') response = urllib.request.urlopen(src) img = np.asarray(bytearray(response.read()), dtype="uint8") img = cv2.imdecode(img, cv2.IMREAD_COLOR) images.append(img) link = post_image.find('a', id='u_0_2') if link: link = link.get('href') post_info = data_type.PostInfo(post_id, author, content, time, images, link) if not post_content: logging.error( 'This post is not supported or you don\'t have acess authority' ) return post_info
def crawl_board( self, crawl_type: int, post_handler, board: str, # BBS版本 start_index: int = 0, end_index: int = 0, start_aid: str = None, end_aid: str = None, search_type: int = 0, search_condition: str = None, query: bool = False, # 網頁版本 start_page: int = 0, end_page: int = 0) -> list: self._one_thread() self.config.log_last_value = None check_value.check(self.config, int, 'crawl_type', crawl_type, value_class=data_type.crawl_type) check_value.check(self.config, str, 'Board', board) if len(board) == 0: raise ValueError( log.merge(self.config, [i18n.Board, i18n.ErrorParameter, board])) if crawl_type == data_type.crawl_type.BBS: if not self._login_status: raise exceptions.Requirelogin(i18n.Requirelogin) check_value.check(self.config, int, 'SearchType', search_type) if search_condition is not None: check_value.check(self.config, str, 'SearchCondition', search_condition) if start_aid is not None: check_value.check(self.config, str, 'StartAID', start_aid) if end_aid is not None: check_value.check(self.config, str, 'EndAID', end_aid) if (start_aid is not None or end_aid is not None) and \ (start_index != 0 or end_index != 0): raise ValueError( log.merge( self.config, ['AID', 'Index', i18n.ErrorParameter, i18n.BothInput])) if (start_aid is not None or end_aid is not None) and \ (search_condition is not None): raise ValueError( log.merge(self.config, [ 'AID', 'SearchCondition', i18n.ErrorParameter, i18n.BothInput ])) if search_type == data_type.post_search_type.PUSH: try: S = int(search_condition) except ValueError: raise ValueError( log.merge(self.config, [ 'SearchCondition', i18n.ErrorParameter, ])) if not (-100 <= S <= 110): raise ValueError( log.merge(self.config, [ 'SearchCondition', i18n.ErrorParameter, ])) if start_index != 0: newest_index = self._get_newest_index( data_type.index_type.BBS, board=board, search_type=search_type, search_condition=search_condition) check_value.check_index_range(self.config, 'start_index', start_index, 'end_index', end_index, max_value=newest_index) elif start_aid is not None and end_aid is not None: start_index = self.get_post(board, post_aid=start_aid, query=True).index end_index = self.get_post(board, post_aid=end_aid, query=True).index check_value.check_index_range(self.config, 'start_index', start_index, 'end_index', end_index) else: raise ValueError( log.merge(self.config, [i18n.ErrorParameter, i18n.NoInput])) log.show_value(self.config, log.level.DEBUG, 'StartIndex', start_index) log.show_value(self.config, log.level.DEBUG, 'EndIndex', end_index) error_post_list = [] del_post_list = [] if self.config.log_level == log.level.INFO: PB = progressbar.ProgressBar(max_value=end_index - start_index + 1, redirect_stdout=True) for index in range(start_index, end_index + 1): for i in range(2): need_continue = False post = None try: post = self._get_post( board, post_index=index, search_type=search_type, search_condition=search_condition, query=query) except exceptions.ParseError as e: if i == 1: raise e need_continue = True except exceptions.UnknownError as e: if i == 1: raise e need_continue = True except exceptions.NoSuchBoard as e: if i == 1: raise e need_continue = True except exceptions.NoMatchTargetError as e: if i == 1: raise e need_continue = True except exceptions.ConnectionClosed as e: if i == 1: raise e log.log(self.config, log.level.INFO, i18n.RestoreConnection) self._login(self._ID, self._Password, self.config.kick_other_login) need_continue = True except exceptions.UseTooManyResources as e: if i == 1: raise e log.log(self.config, log.level.INFO, i18n.RestoreConnection) self._login(self._ID, self._Password, self.config.kick_other_login) need_continue = True if post is None: need_continue = True elif not post.pass_format_check: need_continue = True if need_continue: log.log(self.config, log.level.DEBUG, 'Wait for retry repost') time.sleep(0.1) continue break if self.config.log_level == log.level.INFO: PB.update(index - start_index) if post is None: error_post_list.append(index) continue if not post.pass_format_check: if post.aid is not None: error_post_list.append(post.aid) else: error_post_list.append(index) continue if post.delete_status != data_type.post_delete_status.NOT_DELETED: del_post_list.append(index) post_handler(post) if self.config.log_level == log.level.INFO: PB.finish() return error_post_list, del_post_list else: if self.config.host == data_type.host_type.PTT2: raise exceptions.HostNotSupport( lib_util.get_current_func_name()) # 網頁版本爬蟲 # https://www.ptt.cc/bbs/index.html # 1. 取得總共有幾頁 MaxPage newest_index = self._get_newest_index(data_type.index_type.WEB, board=board) # 2. 檢查 StartPage 跟 EndPage 有沒有在 1 ~ MaxPage 之間 check_value.check_index_range(self.config, 'StartPage', start_page, 'EndPage', end_page, max_value=newest_index) # 3. 把每篇文章(包括被刪除文章)欄位解析出來組合成 data_type.PostInfo error_post_list = [] del_post_list = [] # PostAID = "" _url = 'https://www.ptt.cc/bbs/' index = str(newest_index) if self.config.log_level == log.level.INFO: PB = progressbar.ProgressBar(max_value=end_page - start_page + 1, redirect_stdout=True) def deleted_post(post_title): if post_title.startswith('('): if '本文' in post_title: return data_type.post_delete_status.AUTHOR elif post_title.startswith('(已被'): return data_type.post_delete_status.MODERATOR else: return data_type.post_delete_status.UNKNOWN else: return data_type.post_delete_status.NOT_DELETED for index in range(start_page, newest_index + 1): log.show_value(self.config, log.level.DEBUG, 'CurrentPage', index) url = _url + board + '/index' + str(index) + '.html' r = requests.get(url, cookies={'over18': '1'}) if r.status_code != requests.codes.ok: raise exceptions.NoSuchBoard(self.config, board) soup = BeautifulSoup(r.text, 'html.parser') for div in soup.select('div.r-ent'): web = div.select('div.title a') post = { 'author': div.select('div.author')[0].text, 'title': div.select('div.title')[0].text.strip('\n').strip(), 'web': web[0].get('href') if web else '' } if post['title'].startswith('('): del_post_list.append(post['title']) if post['title'].startswith('(本文'): if '[' in post['title']: post['author'] = post['title'].split( '[')[1].split(']')[0] else: post['author'] = post['title'].split( '<')[1].split('>')[0] else: post['author'] = post['title'].split('<')[1].split( '>')[0] post = data_type.PostInfo( board=board, author=post['author'], title=post['title'], web_url='https://www.ptt.cc' + post['web'], delete_status=deleted_post(post['title'])) post_handler(post) if self.config.log_level == log.level.INFO: PB.update(index - start_page) log.show_value(self.config, log.level.DEBUG, 'DelPostList', del_post_list) # 4. 把組合出來的 Post 塞給 handler # 5. 顯示 progress bar if self.config.log_level == log.level.INFO: PB.finish() return error_post_list, del_post_list
def get_post( api, board: str, post_aid: str = None, post_index: int = 0, search_type: int = 0, search_condition: str = None, query: bool = False) -> data_type.PostInfo: cmd_list = [] cmd_list.append(command.GoMainMenu) cmd_list.append('qs') cmd_list.append(board) cmd_list.append(command.Enter) cmd_list.append(command.Ctrl_C * 2) cmd_list.append(command.Space) if post_aid is not None: cmd_list.append('#' + post_aid) elif post_index != 0: if search_condition is not None: if search_type == data_type.post_search_type.KEYWORD: cmd_list.append('/') elif search_type == data_type.post_search_type.AUTHOR: cmd_list.append('a') elif search_type == data_type.post_search_type.PUSH: cmd_list.append('Z') elif search_type == data_type.post_search_type.MARK: cmd_list.append('G') elif search_type == data_type.post_search_type.MONEY: cmd_list.append('A') cmd_list.append(search_condition) cmd_list.append(command.Enter) cmd_list.append(str(post_index)) cmd_list.append(command.Enter) cmd_list.append(command.QueryPost) cmd = ''.join(cmd_list) target_list = [ connect_core.TargetUnit( [ i18n.CatchPost, i18n.Success, ], screens.Target.QueryPost, break_detect=True, refresh=False, log_level=log.level.DEBUG ), connect_core.TargetUnit( [ i18n.PostDeleted, i18n.Success, ], screens.Target.InBoard, break_detect=True, log_level=log.level.DEBUG ), connect_core.TargetUnit( i18n.NoSuchBoard, screens.Target.MainMenu_Exiting, exceptions_=exceptions.NoSuchBoard(api.config, board) ), ] index = api.connect_core.send(cmd, target_list) ori_screen = api.connect_core.get_screen_queue()[-1] post_author = None post_title = None if index < 0 or index == 1: # 文章被刪除 log.log(api.config, log.level.DEBUG, i18n.PostDeleted) log.show_value( api.config, log.level.DEBUG, 'OriScreen', ori_screen ) cursor_line = [line for line in ori_screen.split( '\n') if line.startswith(api.cursor)] if len(cursor_line) != 1: raise exceptions.UnknownError(ori_screen) cursor_line = cursor_line[0] log.show_value( api.config, log.level.DEBUG, 'CursorLine', cursor_line ) pattern = re.compile('[\d]+\/[\d]+') pattern_result = pattern.search(cursor_line) if pattern_result is None: list_date = None else: list_date = pattern_result.group(0) list_date = list_date[-5:] pattern = re.compile('\[[\w]+\]') pattern_result = pattern.search(cursor_line) if pattern_result is not None: post_del_status = data_type.post_delete_status.AUTHOR else: pattern = re.compile('<[\w]+>') pattern_result = pattern.search(cursor_line) post_del_status = data_type.post_delete_status.MODERATOR # > 79843 9/11 - □ (本文已被吃掉)< # > 76060 8/28 - □ (本文已被刪除) [weida7332] # print(f'O=>{CursorLine}<') if pattern_result is not None: post_author = pattern_result.group(0)[1:-1] else: post_author = None post_del_status = data_type.post_delete_status.UNKNOWN log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date) log.show_value(api.config, log.level.DEBUG, 'PostAuthor', post_author) log.show_value(api.config, log.level.DEBUG, 'post_del_status', post_del_status) return data_type.PostInfo( board=board, author=post_author, list_date=list_date, delete_status=post_del_status, format_check=True ) elif index == 0: lock_post = False try: cursor_line = [line for line in ori_screen.split( '\n') if line.strip().startswith(api.cursor)][0] except Exception as e: print(api.cursor) print(ori_screen) raise e post_author = cursor_line if '□' in post_author: post_author = post_author[:post_author.find('□')].strip() elif 'R:' in post_author: post_author = post_author[:post_author.find('R:')].strip() elif ' 轉 ' in post_author: post_author = post_author[:post_author.find('轉')].strip() elif ' 鎖 ' in post_author: post_author = post_author[:post_author.find('鎖')].strip() lock_post = True post_author = post_author[post_author.rfind(' '):].strip() post_title = cursor_line if ' □ ' in post_title: post_title = post_title[post_title.find('□') + 1:].strip() elif ' R:' in post_title: post_title = post_title[post_title.find('R:'):].strip() elif ' 轉 ' in post_title: # print(f'[{PostTitle}]=========>') post_title = post_title[post_title.find('轉') + 1:].strip() post_title = f'Fw: {post_title}' # print(f'=========>[{PostTitle}]') elif ' 鎖 ' in post_title: post_title = post_title[post_title.find('鎖') + 1:].strip() ori_screen_temp = ori_screen[ori_screen.find('┌──────────'):] ori_screen_temp = ori_screen_temp[:ori_screen_temp.find( '└─────────────') ] aid_line = [line for line in ori_screen.split( '\n') if line.startswith('│ 文章代碼(AID)')] if len(aid_line) == 1: aid_line = aid_line[0] pattern = re.compile('#[\w|-]+') pattern_result = pattern.search(aid_line) post_aid = pattern_result.group(0)[1:] pattern = re.compile('文章網址: https:[\S]+html') pattern_result = pattern.search(ori_screen_temp) if pattern_result is None: post_web = None else: post_web = pattern_result.group(0)[6:] pattern = re.compile('這一篇文章值 [\d]+ Ptt幣') pattern_result = pattern.search(ori_screen_temp) if pattern_result is None: # 特殊文章無價格 post_money = -1 else: post_money = pattern_result.group(0)[7:] post_money = post_money[:post_money.find(' ')] post_money = int(post_money) pattern = re.compile('[\d]+\/[\d]+') pattern_result = pattern.search(cursor_line) if pattern_result is None: list_date = None else: list_date = pattern_result.group(0) list_date = list_date[-5:] # print(list_date) # > 7485 9 8/09 CodingMan □ [閒聊] PTT Library 更新 # > 79189 M 1 9/17 LittleCalf □ [公告] 禁言退文公告 # >781508 +爆 9/17 jodojeda □ [新聞] 國人吃魚少 學者:應把吃魚當成輕鬆愉快 # >781406 +X1 9/17 kingofage111 R: [申請] ReDmango 請辭Gossiping板主職務 if post_index == 0: pattern = re.compile('[\d]+') pattern_result = pattern.search(cursor_line) if pattern_result is not None: post_index = int(pattern_result.group(0)) push_number = cursor_line # print(f'2>{push_number}<') push_number = push_number[7:11] # print(PushNumber) push_number = push_number.split(' ') # print(PushNumber) push_number = list(filter(None, push_number)) # print(PushNumber) if len(push_number) == 0: push_number = None else: push_number = push_number[-1] # print(PushNumber) if push_number.startswith('+') or push_number.startswith('~'): push_number = push_number[1:] # print(PushNumber) if push_number.lower().startswith('m'): push_number = push_number[1:] # print(PushNumber) if push_number.lower().startswith('!'): push_number = push_number[1:] if push_number.lower().startswith('s'): push_number = push_number[1:] if push_number.lower().startswith('='): push_number = push_number[1:] if len(push_number) == 0: push_number = None # print(PushNumber) log.show_value(api.config, log.level.DEBUG, 'PostAuthor', post_author) log.show_value(api.config, log.level.DEBUG, 'PostTitle', post_title) log.show_value(api.config, log.level.DEBUG, 'PostAID', post_aid) log.show_value(api.config, log.level.DEBUG, 'PostWeb', post_web) log.show_value(api.config, log.level.DEBUG, 'PostMoney', post_money) log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date) log.show_value(api.config, log.level.DEBUG, 'PushNumber', push_number) if lock_post: post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, title=post_title, web_url=post_web, money=post_money, list_date=list_date, format_check=True, push_number=push_number, lock=True, ) return post if query: post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, title=post_title, web_url=post_web, money=post_money, list_date=list_date, format_check=True, push_number=push_number, ) return post origin_post, has_control_code = _api_util.get_content(api) if origin_post is None: post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, title=post_title, web_url=post_web, money=post_money, list_date=list_date, control_code=has_control_code, format_check=False, push_number=push_number, unconfirmed=api.Unconfirmed, ) return post # print('=' * 20) # print() # print('=' * 20) content_start = '───────────────────────────────────────' content_end = [] content_end.append('--\n※ 發信站: 批踢踢實業坊(ptt.cc)') content_end.append('--\n※ 發信站: 批踢踢兔(ptt2.cc)') content_end.append('--\n※ 發信站: 新批踢踢(ptt2.twbbs.org.tw)') post_author_pattern_new = re.compile('作者 (.+) 看板') post_author_pattern_old = re.compile('作者 (.+)') board_pattern = re.compile('看板 (.+)') post_date = None post_content = [] ip = None location = None push_list = [] # 格式確認,亂改的我也沒辦法Q_Q origin_post_lines = origin_post.split('\n') author_line = origin_post_lines[0] if board.lower() == 'allpost': board_line = author_line[author_line.find(')') + 1:] pattern_result = board_pattern.search(board_line) if pattern_result is not None: board_temp = post_author = pattern_result.group(0) board_temp = board_temp[2:].strip() if len(board_temp) > 0: board = board_temp log.show_value( api.config, log.level.DEBUG, i18n.Board, board ) pattern_result = post_author_pattern_new.search(author_line) if pattern_result is not None: post_author = pattern_result.group(0) post_author = post_author[:post_author.rfind(')') + 1] else: pattern_result = post_author_pattern_old.search(author_line) if pattern_result is None: log.show_value( api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Author ) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post post_author = pattern_result.group(0) post_author = post_author[:post_author.rfind(')') + 1] post_author = post_author[4:].strip() log.show_value( api.config, log.level.DEBUG, i18n.Author, post_author ) post_title_pattern = re.compile('標題 (.+)') title_line = origin_post_lines[1] pattern_result = post_title_pattern.search(title_line) if pattern_result is None: log.show_value( api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Title ) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post post_title = pattern_result.group(0) post_title = post_title[4:].strip() log.show_value( api.config, log.level.DEBUG, i18n.Title, post_title ) post_date_pattern = re.compile('時間 (.+)') date_line = origin_post_lines[2] pattern_result = post_date_pattern.search(date_line) if pattern_result is None: log.show_value( api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Date ) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post post_date = pattern_result.group(0) post_date = post_date[4:].strip() log.show_value( api.config, log.level.DEBUG, i18n.Date, post_date ) content_fail = True if content_start not in origin_post: # print('Type 1') content_fail = True else: post_content = origin_post post_content = post_content[ post_content.find(content_start) + len(content_start) + 1: ] # print('Type 2') # print(f'PostContent [{PostContent}]') for EC in content_end: # + 3 = 把 --\n 拿掉 # print(f'EC [{EC}]') if EC in post_content: content_fail = False post_content = post_content[ :post_content.rfind(EC) + 3 ] origin_post_lines = origin_post[origin_post.find(EC):] # post_content = post_content.strip() origin_post_lines = origin_post_lines.split('\n') break if content_fail: log.show_value( api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Content ) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post log.show_value( api.config, log.level.DEBUG, i18n.Content, post_content ) info_lines = [ line for line in origin_post_lines if line.startswith('※') or line.startswith('◆') ] pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+') pattern_p2 = re.compile('[\d]+-[\d]+-[\d]+-[\d]+') for line in reversed(info_lines): log.show_value( api.config, log.level.DEBUG, 'IP Line', line ) # type 1 # ※ 編輯: CodingMan (111.243.146.98 臺灣) # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 111.243.146.98 (臺灣) # type 2 # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 116.241.32.178 # ※ 編輯: kill77845 (114.136.55.237), 12/08/2018 16:47:59 # type 3 # ※ 發信站: 批踢踢實業坊(ptt.cc) # ◆ From: 211.20.78.69 # ※ 編輯: JCC 來自: 211.20.78.69 (06/20 10:22) # ※ 編輯: JCC (118.163.28.150), 12/03/2015 14:25:35 pattern_result = pattern.search(line) if pattern_result is not None: ip = pattern_result.group(0) location_temp = line[line.find(ip) + len(ip):].strip() location_temp = location_temp.replace('(', '') location_temp = location_temp[:location_temp.rfind(')')] location_temp = location_temp.strip() # print(f'=>[{LocationTemp}]') if ' ' not in location_temp and len(location_temp) > 0: location = location_temp log.show_value(api.config, log.level.DEBUG, 'Location', location) break pattern_result = pattern_p2.search(line) if pattern_result is not None: ip = pattern_result.group(0) ip = ip.replace('-', '.') # print(f'IP -> [{IP}]') break if api.config.host == data_type.host_type.PTT1: if ip is None: log.show_value( api.config, log.level.DEBUG, i18n.SubstandardPost, 'IP' ) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post log.show_value(api.config, log.level.DEBUG, 'IP', ip) push_author_pattern = re.compile('[推|噓|→] [\w| ]+:') push_date_pattern = re.compile('[\d]+/[\d]+ [\d]+:[\d]+') push_ip_pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+') push_list = [] for line in origin_post_lines: if line.startswith('推'): push_type = data_type.push_type.PUSH elif line.startswith('噓 '): push_type = data_type.push_type.BOO elif line.startswith('→ '): push_type = data_type.push_type.ARROW else: continue result = push_author_pattern.search(line) if result is None: # 不符合推文格式 continue push_author = result.group(0)[2:-1].strip() log.show_value(api.config, log.level.DEBUG, [ i18n.Push, i18n.ID, ], push_author ) result = push_date_pattern.search(line) if result is None: continue push_date = result.group(0) log.show_value(api.config, log.level.DEBUG, [ i18n.Push, i18n.Date, ], push_date ) push_ip = None result = push_ip_pattern.search(line) if result is not None: push_ip = result.group(0) log.show_value( api.config, log.level.DEBUG, [ i18n.Push, 'IP', ], push_ip ) push_content = line[ line.find(push_author) + len(push_author): ] # PushContent = PushContent.replace(PushDate, '') if api.config.host == data_type.host_type.PTT1: push_content = push_content[ :push_content.rfind(push_date) ] else: # → CodingMan:What is Ptt? 推 10/04 13:25 push_content = push_content[ :push_content.rfind(push_date) - 2 ] if push_ip is not None: push_content = push_content.replace(push_ip, '') push_content = push_content[ push_content.find(':') + 1: ].strip() log.show_value( api.config, log.level.DEBUG, [ i18n.Push, i18n.Content, ], push_content ) current_push = data_type.PushInfo( push_type, push_author, push_content, push_ip, push_date ) push_list.append(current_push) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=True, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post
def get_post(api, board: str, post_aid: str = None, post_index: int = 0, search_type: int = 0, search_condition: str = None, search_list: list = None, query: bool = False) -> data_type.PostInfo: api._goto_board(board) cmd_list = list() if post_aid is not None: cmd_list.append('#' + post_aid) elif post_index != 0: if search_condition is not None: if search_type == data_type.post_search_type.KEYWORD: cmd_list.append('/') elif search_type == data_type.post_search_type.AUTHOR: cmd_list.append('a') elif search_type == data_type.post_search_type.PUSH: cmd_list.append('Z') elif search_type == data_type.post_search_type.MARK: cmd_list.append('G') elif search_type == data_type.post_search_type.MONEY: cmd_list.append('A') cmd_list.append(search_condition) cmd_list.append(command.Enter) if search_list is not None: for search_type_, search_condition_ in search_list: if search_type_ == data_type.post_search_type.KEYWORD: cmd_list.append('/') elif search_type_ == data_type.post_search_type.AUTHOR: cmd_list.append('a') elif search_type_ == data_type.post_search_type.PUSH: cmd_list.append('Z') elif search_type_ == data_type.post_search_type.MARK: cmd_list.append('G') elif search_type_ == data_type.post_search_type.MONEY: cmd_list.append('A') cmd_list.append(search_condition_) cmd_list.append(command.Enter) cmd_list.append(str(max(1, post_index - 100))) cmd_list.append(command.Enter) cmd_list.append(str(post_index)) cmd_list.append(command.Enter) cmd_list.append(command.QueryPost) cmd = ''.join(cmd_list) target_list = [ connect_core.TargetUnit([ i18n.CatchPost, i18n.Success, ], screens.Target.QueryPost, break_detect=True, refresh=False, log_level=log.level.DEBUG), connect_core.TargetUnit([ i18n.PostDeleted, i18n.Success, ], screens.Target.InBoard, break_detect=True, log_level=log.level.DEBUG), connect_core.TargetUnit(i18n.NoSuchBoard, screens.Target.MainMenu_Exiting, exceptions_=exceptions.NoSuchBoard( api.config, board)), ] index = api.connect_core.send(cmd, target_list) ori_screen = api.connect_core.get_screen_queue()[-1] post_author = None post_title = None if index < 0 or index == 1: # 文章被刪除 log.log(api.config, log.level.DEBUG, i18n.PostDeleted) log.show_value(api.config, log.level.DEBUG, 'OriScreen', ori_screen) cursor_line = [ line for line in ori_screen.split('\n') if line.startswith(api.cursor) ] if len(cursor_line) != 1: raise exceptions.UnknownError(ori_screen) cursor_line = cursor_line[0] log.show_value(api.config, log.level.DEBUG, 'CursorLine', cursor_line) pattern = re.compile('[\d]+\/[\d]+') pattern_result = pattern.search(cursor_line) if pattern_result is None: list_date = None else: list_date = pattern_result.group(0) list_date = list_date[-5:] pattern = re.compile('\[[\w]+\]') pattern_result = pattern.search(cursor_line) if pattern_result is not None: post_del_status = data_type.post_delete_status.AUTHOR else: pattern = re.compile('<[\w]+>') pattern_result = pattern.search(cursor_line) post_del_status = data_type.post_delete_status.MODERATOR # > 79843 9/11 - □ (本文已被吃掉)< # > 76060 8/28 - □ (本文已被刪除) [weida7332] # print(f'O=>{CursorLine}<') if pattern_result is not None: post_author = pattern_result.group(0)[1:-1] else: post_author = None post_del_status = data_type.post_delete_status.UNKNOWN log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date) log.show_value(api.config, log.level.DEBUG, 'PostAuthor', post_author) log.show_value(api.config, log.level.DEBUG, 'post_del_status', post_del_status) return data_type.PostInfo(board=board, author=post_author, list_date=list_date, delete_status=post_del_status, format_check=True) elif index == 0: lock_post, post_author, post_title, post_aid, post_web, post_money, list_date, push_number, post_index = \ _api_util.parse_query_post( api, ori_screen) if lock_post: post = data_type.PostInfo(board=board, aid=post_aid, index=post_index, author=post_author, title=post_title, web_url=post_web, money=post_money, list_date=list_date, format_check=True, push_number=push_number, lock=True) return post if query: post = data_type.PostInfo(board=board, aid=post_aid, index=post_index, author=post_author, title=post_title, web_url=post_web, money=post_money, list_date=list_date, format_check=True, push_number=push_number) return post origin_post, has_control_code = _api_util.get_content(api) if origin_post is None: post = data_type.PostInfo(board=board, aid=post_aid, index=post_index, author=post_author, title=post_title, web_url=post_web, money=post_money, list_date=list_date, control_code=has_control_code, format_check=False, push_number=push_number, unconfirmed=api.Unconfirmed) return post # print('=' * 20) # print(origin_post) # print('=' * 20) content_start = '───────────────────────────────────────' content_end = list() content_end.append('--\n※ 發信站: 批踢踢實業坊') content_end.append('--\n※ 發信站: 批踢踢兔(ptt2.cc)') content_end.append('--\n※ 發信站: 新批踢踢(ptt2.twbbs.org.tw)') post_author_pattern_new = re.compile('作者 (.+) 看板') post_author_pattern_old = re.compile('作者 (.+)') board_pattern = re.compile('看板 (.+)') post_date = None post_content = list() ip = None location = None push_list = list() # 格式確認,亂改的我也沒辦法Q_Q origin_post_lines = origin_post.split('\n') author_line = origin_post_lines[0] if board.lower() == 'allpost': board_line = author_line[author_line.find(')') + 1:] pattern_result = board_pattern.search(board_line) if pattern_result is not None: board_temp = post_author = pattern_result.group(0) board_temp = board_temp[2:].strip() if len(board_temp) > 0: board = board_temp log.show_value(api.config, log.level.DEBUG, i18n.Board, board) pattern_result = post_author_pattern_new.search(author_line) if pattern_result is not None: post_author = pattern_result.group(0) post_author = post_author[:post_author.rfind(')') + 1] else: pattern_result = post_author_pattern_old.search(author_line) if pattern_result is None: log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Author) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post post_author = pattern_result.group(0) post_author = post_author[:post_author.rfind(')') + 1] post_author = post_author[4:].strip() log.show_value(api.config, log.level.DEBUG, i18n.Author, post_author) post_title_pattern = re.compile('標題 (.+)') title_line = origin_post_lines[1] pattern_result = post_title_pattern.search(title_line) if pattern_result is None: log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Title) post = data_type.PostInfo(board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed) return post post_title = pattern_result.group(0) post_title = post_title[4:].strip() log.show_value(api.config, log.level.DEBUG, i18n.Title, post_title) post_date_pattern = re.compile('時間 .{24}') date_line = origin_post_lines[2] pattern_result = post_date_pattern.search(date_line) if pattern_result is None: log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Date) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post post_date = pattern_result.group(0) post_date = post_date[4:].strip() log.show_value(api.config, log.level.DEBUG, i18n.Date, post_date) content_fail = True if content_start not in origin_post: # print('Type 1') content_fail = True else: post_content = origin_post post_content = post_content[post_content.find(content_start) + len(content_start) + 1:] # print('Type 2') # print(f'PostContent [{PostContent}]') for EC in content_end: # + 3 = 把 --\n 拿掉 # print(f'EC [{EC}]') if EC in post_content: content_fail = False post_content = post_content[:post_content.rfind(EC) + 3] origin_post_lines = origin_post[origin_post.find(EC):] # post_content = post_content.strip() origin_post_lines = origin_post_lines.split('\n') break if content_fail: log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost, i18n.Content) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post log.show_value(api.config, log.level.DEBUG, i18n.Content, post_content) info_lines = [ line for line in origin_post_lines if line.startswith('※') or line.startswith('◆') ] pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+') pattern_p2 = re.compile('[\d]+-[\d]+-[\d]+-[\d]+') for line in reversed(info_lines): log.show_value(api.config, log.level.DEBUG, 'IP Line', line) # type 1 # ※ 編輯: CodingMan (111.243.146.98 臺灣) # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 111.243.146.98 (臺灣) # type 2 # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 116.241.32.178 # ※ 編輯: kill77845 (114.136.55.237), 12/08/2018 16:47:59 # type 3 # ※ 發信站: 批踢踢實業坊(ptt.cc) # ◆ From: 211.20.78.69 # ※ 編輯: JCC 來自: 211.20.78.69 (06/20 10:22) # ※ 編輯: JCC (118.163.28.150), 12/03/2015 14:25:35 pattern_result = pattern.search(line) if pattern_result is not None: ip = pattern_result.group(0) location_temp = line[line.find(ip) + len(ip):].strip() location_temp = location_temp.replace('(', '') location_temp = location_temp[:location_temp.rfind(')')] location_temp = location_temp.strip() # print(f'=>[{LocationTemp}]') if ' ' not in location_temp and len(location_temp) > 0: location = location_temp log.show_value(api.config, log.level.DEBUG, 'Location', location) break pattern_result = pattern_p2.search(line) if pattern_result is not None: ip = pattern_result.group(0) ip = ip.replace('-', '.') # print(f'IP -> [{IP}]') break if api.config.host == data_type.host_type.PTT1: if ip is None: log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost, 'IP') post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=False, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post log.show_value(api.config, log.level.DEBUG, 'IP', ip) push_author_pattern = re.compile('[推|噓|→] [\w| ]+:') push_date_pattern = re.compile('[\d]+/[\d]+ [\d]+:[\d]+') push_ip_pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+') push_list = list() for line in origin_post_lines: if line.startswith('推'): push_type = data_type.push_type.PUSH elif line.startswith('噓 '): push_type = data_type.push_type.BOO elif line.startswith('→ '): push_type = data_type.push_type.ARROW else: continue result = push_author_pattern.search(line) if result is None: # 不符合推文格式 continue push_author = result.group(0)[2:-1].strip() log.show_value(api.config, log.level.DEBUG, [ i18n.Push, i18n.ID, ], push_author) result = push_date_pattern.search(line) if result is None: continue push_date = result.group(0) log.show_value(api.config, log.level.DEBUG, [ i18n.Push, i18n.Date, ], push_date) push_ip = None result = push_ip_pattern.search(line) if result is not None: push_ip = result.group(0) log.show_value(api.config, log.level.DEBUG, [ i18n.Push, 'IP', ], push_ip) push_content = line[line.find(push_author) + len(push_author):] # PushContent = PushContent.replace(PushDate, '') if api.config.host == data_type.host_type.PTT1: push_content = push_content[:push_content.rfind(push_date)] else: # → CodingMan:What is Ptt? 推 10/04 13:25 push_content = push_content[:push_content.rfind(push_date) - 2] if push_ip is not None: push_content = push_content.replace(push_ip, '') push_content = push_content[push_content.find(':') + 1:].strip() log.show_value(api.config, log.level.DEBUG, [ i18n.Push, i18n.Content, ], push_content) current_push = data_type.PushInfo(push_type, push_author, push_content, push_ip, push_date) push_list.append(current_push) post = data_type.PostInfo( board=board, aid=post_aid, index=post_index, author=post_author, date=post_date, title=post_title, web_url=post_web, money=post_money, content=post_content, ip=ip, push_list=push_list, list_date=list_date, control_code=has_control_code, format_check=True, location=location, push_number=push_number, origin_post=origin_post, unconfirmed=api.Unconfirmed, ) return post