예제 #1
0
 def get_post(self, post_id, group_id=None):
     if not self.login_check:
         logging.error('You should login first')
         return
     url = 'https://m.facebook.com/story.php?' + \
           'story_fbid=%s&id=1'%str(post_id)
     if group_id:
         url = 'https://m.facebook.com/groups/%s?'%str(group_id) + \
               'view=permalink&id='%str(post_id)
     req = self.session.get(url)
     soup = BeautifulSoup(req.text, 'lxml')
     post_content = soup.find('div', class_='z')
     author = post_content.find('h3').text
     content_lines = post_content.find('div', {
         'data-ft': '{"tn":"*s"}'
     }).findAll('p')
     content = ''
     for content_line in content_lines:
         content += content_line.text
         content += '\n'
     time = post_content.find('footer').find('abbr').text
     post_info = data_type.PostInfo(post_id, author, content, time)
     if not post_content:
         logging.error(
             'This post is not supported or you don\'t have acess authority'
         )
     return post_info
예제 #2
0
 def get_post(self, post_id, group_id=None):
     if not self.login_check:
         logging.error('You should login first')
         return
     url = 'https://mbasic.facebook.com/story.php?' + \
           'story_fbid=%s&id=1'%str(post_id)
     if group_id:
         url = 'https://mbasic.facebook.com/groups/%s?'%str(group_id) + \
               'view=permalink&id=%s'%str(post_id)
     req = self.session.get(url)
     soup = BeautifulSoup(req.text, 'lxml')
     post_content = soup.find('div', id='m_story_permalink_view')
     author = post_content.find('h3', recursive=True).text
     content = str(post_content.find('div', {'data-ft': '{"tn":"*s"}'}))
     content = content.replace('<br/> ', '\n')
     content = content.replace('<br/>', '\n')
     content = re.sub('<[^>]+> ', '', content)
     content = re.sub('<[^>]+>', '', content)
     time = post_content.find('footer').find('abbr').text
     post_image = post_content.find('div', {'data-ft': '{"tn":"H"}'})
     images = []
     link = None
     if post_image:
         for img_src in post_image.find_all('img', class_='s'):
             src = img_src.get('src')
             response = urllib.request.urlopen(src)
             img = np.asarray(bytearray(response.read()), dtype="uint8")
             img = cv2.imdecode(img, cv2.IMREAD_COLOR)
             images.append(img)
         link = post_image.find('a', id='u_0_2')
         if link:
             link = link.get('href')
     post_info = data_type.PostInfo(post_id, author, content, time, images,
                                    link)
     if not post_content:
         logging.error(
             'This post is not supported or you don\'t have acess authority'
         )
     return post_info
예제 #3
0
파일: PTT.py 프로젝트: AdamChangPlus/PyPtt
    def crawl_board(
            self,
            crawl_type: int,
            post_handler,
            board: str,
            # BBS版本
            start_index: int = 0,
            end_index: int = 0,
            start_aid: str = None,
            end_aid: str = None,
            search_type: int = 0,
            search_condition: str = None,
            query: bool = False,
            # 網頁版本
            start_page: int = 0,
            end_page: int = 0) -> list:

        self._one_thread()

        self.config.log_last_value = None

        check_value.check(self.config,
                          int,
                          'crawl_type',
                          crawl_type,
                          value_class=data_type.crawl_type)
        check_value.check(self.config, str, 'Board', board)

        if len(board) == 0:
            raise ValueError(
                log.merge(self.config,
                          [i18n.Board, i18n.ErrorParameter, board]))

        if crawl_type == data_type.crawl_type.BBS:
            if not self._login_status:
                raise exceptions.Requirelogin(i18n.Requirelogin)

            check_value.check(self.config, int, 'SearchType', search_type)
            if search_condition is not None:
                check_value.check(self.config, str, 'SearchCondition',
                                  search_condition)
            if start_aid is not None:
                check_value.check(self.config, str, 'StartAID', start_aid)
            if end_aid is not None:
                check_value.check(self.config, str, 'EndAID', end_aid)

            if (start_aid is not None or end_aid is not None) and \
                    (start_index != 0 or end_index != 0):
                raise ValueError(
                    log.merge(
                        self.config,
                        ['AID', 'Index', i18n.ErrorParameter, i18n.BothInput]))

            if (start_aid is not None or end_aid is not None) and \
                    (search_condition is not None):
                raise ValueError(
                    log.merge(self.config, [
                        'AID', 'SearchCondition', i18n.ErrorParameter,
                        i18n.BothInput
                    ]))

            if search_type == data_type.post_search_type.PUSH:
                try:
                    S = int(search_condition)
                except ValueError:
                    raise ValueError(
                        log.merge(self.config, [
                            'SearchCondition',
                            i18n.ErrorParameter,
                        ]))

                if not (-100 <= S <= 110):
                    raise ValueError(
                        log.merge(self.config, [
                            'SearchCondition',
                            i18n.ErrorParameter,
                        ]))

            if start_index != 0:
                newest_index = self._get_newest_index(
                    data_type.index_type.BBS,
                    board=board,
                    search_type=search_type,
                    search_condition=search_condition)

                check_value.check_index_range(self.config,
                                              'start_index',
                                              start_index,
                                              'end_index',
                                              end_index,
                                              max_value=newest_index)
            elif start_aid is not None and end_aid is not None:
                start_index = self.get_post(board,
                                            post_aid=start_aid,
                                            query=True).index
                end_index = self.get_post(board, post_aid=end_aid,
                                          query=True).index

                check_value.check_index_range(self.config, 'start_index',
                                              start_index, 'end_index',
                                              end_index)
            else:
                raise ValueError(
                    log.merge(self.config,
                              [i18n.ErrorParameter, i18n.NoInput]))

            log.show_value(self.config, log.level.DEBUG, 'StartIndex',
                           start_index)

            log.show_value(self.config, log.level.DEBUG, 'EndIndex', end_index)

            error_post_list = []
            del_post_list = []
            if self.config.log_level == log.level.INFO:
                PB = progressbar.ProgressBar(max_value=end_index -
                                             start_index + 1,
                                             redirect_stdout=True)
            for index in range(start_index, end_index + 1):

                for i in range(2):
                    need_continue = False
                    post = None
                    try:
                        post = self._get_post(
                            board,
                            post_index=index,
                            search_type=search_type,
                            search_condition=search_condition,
                            query=query)
                    except exceptions.ParseError as e:
                        if i == 1:
                            raise e
                        need_continue = True
                    except exceptions.UnknownError as e:
                        if i == 1:
                            raise e
                        need_continue = True
                    except exceptions.NoSuchBoard as e:
                        if i == 1:
                            raise e
                        need_continue = True
                    except exceptions.NoMatchTargetError as e:
                        if i == 1:
                            raise e
                        need_continue = True
                    except exceptions.ConnectionClosed as e:
                        if i == 1:
                            raise e
                        log.log(self.config, log.level.INFO,
                                i18n.RestoreConnection)
                        self._login(self._ID, self._Password,
                                    self.config.kick_other_login)
                        need_continue = True
                    except exceptions.UseTooManyResources as e:
                        if i == 1:
                            raise e
                        log.log(self.config, log.level.INFO,
                                i18n.RestoreConnection)
                        self._login(self._ID, self._Password,
                                    self.config.kick_other_login)
                        need_continue = True

                    if post is None:
                        need_continue = True
                    elif not post.pass_format_check:
                        need_continue = True

                    if need_continue:
                        log.log(self.config, log.level.DEBUG,
                                'Wait for retry repost')
                        time.sleep(0.1)
                        continue

                    break

                if self.config.log_level == log.level.INFO:
                    PB.update(index - start_index)
                if post is None:
                    error_post_list.append(index)
                    continue
                if not post.pass_format_check:
                    if post.aid is not None:
                        error_post_list.append(post.aid)
                    else:
                        error_post_list.append(index)
                    continue
                if post.delete_status != data_type.post_delete_status.NOT_DELETED:
                    del_post_list.append(index)
                post_handler(post)
            if self.config.log_level == log.level.INFO:
                PB.finish()

            return error_post_list, del_post_list

        else:
            if self.config.host == data_type.host_type.PTT2:
                raise exceptions.HostNotSupport(
                    lib_util.get_current_func_name())

            # 網頁版本爬蟲
            # https://www.ptt.cc/bbs/index.html

            # 1. 取得總共有幾頁 MaxPage
            newest_index = self._get_newest_index(data_type.index_type.WEB,
                                                  board=board)
            # 2. 檢查 StartPage 跟 EndPage 有沒有在 1 ~ MaxPage 之間

            check_value.check_index_range(self.config,
                                          'StartPage',
                                          start_page,
                                          'EndPage',
                                          end_page,
                                          max_value=newest_index)

            # 3. 把每篇文章(包括被刪除文章)欄位解析出來組合成 data_type.PostInfo
            error_post_list = []
            del_post_list = []
            # PostAID = ""
            _url = 'https://www.ptt.cc/bbs/'
            index = str(newest_index)
            if self.config.log_level == log.level.INFO:
                PB = progressbar.ProgressBar(max_value=end_page - start_page +
                                             1,
                                             redirect_stdout=True)

            def deleted_post(post_title):
                if post_title.startswith('('):
                    if '本文' in post_title:
                        return data_type.post_delete_status.AUTHOR
                    elif post_title.startswith('(已被'):
                        return data_type.post_delete_status.MODERATOR
                    else:
                        return data_type.post_delete_status.UNKNOWN
                else:
                    return data_type.post_delete_status.NOT_DELETED

            for index in range(start_page, newest_index + 1):
                log.show_value(self.config, log.level.DEBUG, 'CurrentPage',
                               index)

                url = _url + board + '/index' + str(index) + '.html'
                r = requests.get(url, cookies={'over18': '1'})
                if r.status_code != requests.codes.ok:
                    raise exceptions.NoSuchBoard(self.config, board)
                soup = BeautifulSoup(r.text, 'html.parser')

                for div in soup.select('div.r-ent'):
                    web = div.select('div.title a')
                    post = {
                        'author':
                        div.select('div.author')[0].text,
                        'title':
                        div.select('div.title')[0].text.strip('\n').strip(),
                        'web':
                        web[0].get('href') if web else ''
                    }
                    if post['title'].startswith('('):
                        del_post_list.append(post['title'])
                        if post['title'].startswith('(本文'):
                            if '[' in post['title']:
                                post['author'] = post['title'].split(
                                    '[')[1].split(']')[0]
                            else:
                                post['author'] = post['title'].split(
                                    '<')[1].split('>')[0]
                        else:
                            post['author'] = post['title'].split('<')[1].split(
                                '>')[0]

                    post = data_type.PostInfo(
                        board=board,
                        author=post['author'],
                        title=post['title'],
                        web_url='https://www.ptt.cc' + post['web'],
                        delete_status=deleted_post(post['title']))
                    post_handler(post)

                if self.config.log_level == log.level.INFO:
                    PB.update(index - start_page)

            log.show_value(self.config, log.level.DEBUG, 'DelPostList',
                           del_post_list)

            # 4. 把組合出來的 Post 塞給 handler

            # 5. 顯示 progress bar
            if self.config.log_level == log.level.INFO:
                PB.finish()

            return error_post_list, del_post_list
예제 #4
0
def get_post(
        api,
        board: str,
        post_aid: str = None,
        post_index: int = 0,
        search_type: int = 0,
        search_condition: str = None,
        query: bool = False) -> data_type.PostInfo:
    cmd_list = []
    cmd_list.append(command.GoMainMenu)
    cmd_list.append('qs')
    cmd_list.append(board)
    cmd_list.append(command.Enter)
    cmd_list.append(command.Ctrl_C * 2)
    cmd_list.append(command.Space)

    if post_aid is not None:
        cmd_list.append('#' + post_aid)

    elif post_index != 0:
        if search_condition is not None:
            if search_type == data_type.post_search_type.KEYWORD:
                cmd_list.append('/')
            elif search_type == data_type.post_search_type.AUTHOR:
                cmd_list.append('a')
            elif search_type == data_type.post_search_type.PUSH:
                cmd_list.append('Z')
            elif search_type == data_type.post_search_type.MARK:
                cmd_list.append('G')
            elif search_type == data_type.post_search_type.MONEY:
                cmd_list.append('A')

            cmd_list.append(search_condition)
            cmd_list.append(command.Enter)

        cmd_list.append(str(post_index))

    cmd_list.append(command.Enter)
    cmd_list.append(command.QueryPost)

    cmd = ''.join(cmd_list)

    target_list = [
        connect_core.TargetUnit(
            [
                i18n.CatchPost,
                i18n.Success,
            ],
            screens.Target.QueryPost,
            break_detect=True,
            refresh=False,
            log_level=log.level.DEBUG
        ),
        connect_core.TargetUnit(
            [
                i18n.PostDeleted,
                i18n.Success,
            ],
            screens.Target.InBoard,
            break_detect=True,
            log_level=log.level.DEBUG
        ),
        connect_core.TargetUnit(
            i18n.NoSuchBoard,
            screens.Target.MainMenu_Exiting,
            exceptions_=exceptions.NoSuchBoard(api.config, board)
        ),
    ]

    index = api.connect_core.send(cmd, target_list)
    ori_screen = api.connect_core.get_screen_queue()[-1]

    post_author = None
    post_title = None
    if index < 0 or index == 1:
        # 文章被刪除
        log.log(api.config, log.level.DEBUG, i18n.PostDeleted)

        log.show_value(
            api.config,
            log.level.DEBUG,
            'OriScreen',
            ori_screen
        )

        cursor_line = [line for line in ori_screen.split(
            '\n') if line.startswith(api.cursor)]

        if len(cursor_line) != 1:
            raise exceptions.UnknownError(ori_screen)

        cursor_line = cursor_line[0]
        log.show_value(
            api.config,
            log.level.DEBUG,
            'CursorLine',
            cursor_line
        )

        pattern = re.compile('[\d]+\/[\d]+')
        pattern_result = pattern.search(cursor_line)
        if pattern_result is None:
            list_date = None
        else:
            list_date = pattern_result.group(0)
            list_date = list_date[-5:]

        pattern = re.compile('\[[\w]+\]')
        pattern_result = pattern.search(cursor_line)
        if pattern_result is not None:
            post_del_status = data_type.post_delete_status.AUTHOR
        else:
            pattern = re.compile('<[\w]+>')
            pattern_result = pattern.search(cursor_line)
            post_del_status = data_type.post_delete_status.MODERATOR

        # > 79843     9/11 -             □ (本文已被吃掉)<
        # > 76060     8/28 -             □ (本文已被刪除) [weida7332]
        # print(f'O=>{CursorLine}<')
        if pattern_result is not None:
            post_author = pattern_result.group(0)[1:-1]
        else:
            post_author = None
            post_del_status = data_type.post_delete_status.UNKNOWN

        log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date)
        log.show_value(api.config, log.level.DEBUG,
                       'PostAuthor', post_author)
        log.show_value(api.config, log.level.DEBUG,
                       'post_del_status', post_del_status)

        return data_type.PostInfo(
            board=board,
            author=post_author,
            list_date=list_date,
            delete_status=post_del_status,
            format_check=True
        )

    elif index == 0:

        lock_post = False
        try:
            cursor_line = [line for line in ori_screen.split(
                '\n') if line.strip().startswith(api.cursor)][0]
        except Exception as e:
            print(api.cursor)
            print(ori_screen)
            raise e

        post_author = cursor_line
        if '□' in post_author:
            post_author = post_author[:post_author.find('□')].strip()
        elif 'R:' in post_author:
            post_author = post_author[:post_author.find('R:')].strip()
        elif ' 轉 ' in post_author:
            post_author = post_author[:post_author.find('轉')].strip()
        elif ' 鎖 ' in post_author:
            post_author = post_author[:post_author.find('鎖')].strip()
            lock_post = True
        post_author = post_author[post_author.rfind(' '):].strip()

        post_title = cursor_line
        if ' □ ' in post_title:
            post_title = post_title[post_title.find('□') + 1:].strip()
        elif ' R:' in post_title:
            post_title = post_title[post_title.find('R:'):].strip()
        elif ' 轉 ' in post_title:
            # print(f'[{PostTitle}]=========>')
            post_title = post_title[post_title.find('轉') + 1:].strip()
            post_title = f'Fw: {post_title}'
            # print(f'=========>[{PostTitle}]')
        elif ' 鎖 ' in post_title:
            post_title = post_title[post_title.find('鎖') + 1:].strip()

        ori_screen_temp = ori_screen[ori_screen.find('┌──────────'):]
        ori_screen_temp = ori_screen_temp[:ori_screen_temp.find(
            '└─────────────')
                          ]

        aid_line = [line for line in ori_screen.split(
            '\n') if line.startswith('│ 文章代碼(AID)')]

        if len(aid_line) == 1:
            aid_line = aid_line[0]
            pattern = re.compile('#[\w|-]+')
            pattern_result = pattern.search(aid_line)
            post_aid = pattern_result.group(0)[1:]

        pattern = re.compile('文章網址: https:[\S]+html')
        pattern_result = pattern.search(ori_screen_temp)
        if pattern_result is None:
            post_web = None
        else:
            post_web = pattern_result.group(0)[6:]

        pattern = re.compile('這一篇文章值 [\d]+ Ptt幣')
        pattern_result = pattern.search(ori_screen_temp)
        if pattern_result is None:
            # 特殊文章無價格
            post_money = -1
        else:
            post_money = pattern_result.group(0)[7:]
            post_money = post_money[:post_money.find(' ')]
            post_money = int(post_money)

        pattern = re.compile('[\d]+\/[\d]+')
        pattern_result = pattern.search(cursor_line)
        if pattern_result is None:
            list_date = None
        else:
            list_date = pattern_result.group(0)
            list_date = list_date[-5:]
        # print(list_date)

        # >  7485   9 8/09 CodingMan    □ [閒聊] PTT Library 更新
        # > 79189 M 1 9/17 LittleCalf   □ [公告] 禁言退文公告
        # >781508 +爆 9/17 jodojeda     □ [新聞] 國人吃魚少 學者:應把吃魚當成輕鬆愉快
        # >781406 +X1 9/17 kingofage111 R: [申請] ReDmango 請辭Gossiping板主職務

        if post_index == 0:
            pattern = re.compile('[\d]+')
            pattern_result = pattern.search(cursor_line)
            if pattern_result is not None:
                post_index = int(pattern_result.group(0))

        push_number = cursor_line
        # print(f'2>{push_number}<')
        push_number = push_number[7:11]
        # print(PushNumber)
        push_number = push_number.split(' ')
        # print(PushNumber)
        push_number = list(filter(None, push_number))
        # print(PushNumber)

        if len(push_number) == 0:
            push_number = None
        else:
            push_number = push_number[-1]
            # print(PushNumber)

            if push_number.startswith('+') or push_number.startswith('~'):
                push_number = push_number[1:]
                # print(PushNumber)
            if push_number.lower().startswith('m'):
                push_number = push_number[1:]
                # print(PushNumber)
            if push_number.lower().startswith('!'):
                push_number = push_number[1:]

            if push_number.lower().startswith('s'):
                push_number = push_number[1:]

            if push_number.lower().startswith('='):
                push_number = push_number[1:]

            if len(push_number) == 0:
                push_number = None

        # print(PushNumber)
        log.show_value(api.config, log.level.DEBUG,
                       'PostAuthor', post_author)
        log.show_value(api.config, log.level.DEBUG, 'PostTitle', post_title)
        log.show_value(api.config, log.level.DEBUG, 'PostAID', post_aid)
        log.show_value(api.config, log.level.DEBUG, 'PostWeb', post_web)
        log.show_value(api.config, log.level.DEBUG, 'PostMoney', post_money)
        log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date)
        log.show_value(api.config, log.level.DEBUG,
                       'PushNumber', push_number)

        if lock_post:
            post = data_type.PostInfo(
                board=board,
                aid=post_aid,
                index=post_index,
                author=post_author,
                title=post_title,
                web_url=post_web,
                money=post_money,
                list_date=list_date,
                format_check=True,
                push_number=push_number,
                lock=True,
            )
            return post

    if query:
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            title=post_title,
            web_url=post_web,
            money=post_money,
            list_date=list_date,
            format_check=True,
            push_number=push_number,
        )
        return post

    origin_post, has_control_code = _api_util.get_content(api)

    if origin_post is None:
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            title=post_title,
            web_url=post_web,
            money=post_money,
            list_date=list_date,
            control_code=has_control_code,
            format_check=False,
            push_number=push_number,
            unconfirmed=api.Unconfirmed,
        )
        return post

    # print('=' * 20)
    # print()
    # print('=' * 20)

    content_start = '───────────────────────────────────────'
    content_end = []
    content_end.append('--\n※ 發信站: 批踢踢實業坊(ptt.cc)')
    content_end.append('--\n※ 發信站: 批踢踢兔(ptt2.cc)')
    content_end.append('--\n※ 發信站: 新批踢踢(ptt2.twbbs.org.tw)')

    post_author_pattern_new = re.compile('作者  (.+) 看板')
    post_author_pattern_old = re.compile('作者  (.+)')
    board_pattern = re.compile('看板  (.+)')

    post_date = None
    post_content = []
    ip = None
    location = None
    push_list = []

    # 格式確認,亂改的我也沒辦法Q_Q
    origin_post_lines = origin_post.split('\n')

    author_line = origin_post_lines[0]

    if board.lower() == 'allpost':
        board_line = author_line[author_line.find(')') + 1:]
        pattern_result = board_pattern.search(board_line)
        if pattern_result is not None:
            board_temp = post_author = pattern_result.group(0)
            board_temp = board_temp[2:].strip()
            if len(board_temp) > 0:
                board = board_temp
                log.show_value(
                    api.config,
                    log.level.DEBUG,
                    i18n.Board,
                    board
                )
    pattern_result = post_author_pattern_new.search(author_line)
    if pattern_result is not None:
        post_author = pattern_result.group(0)
        post_author = post_author[:post_author.rfind(')') + 1]
    else:
        pattern_result = post_author_pattern_old.search(author_line)
        if pattern_result is None:
            log.show_value(
                api.config,
                log.level.DEBUG,
                i18n.SubstandardPost,
                i18n.Author
            )
            post = data_type.PostInfo(
                board=board,
                aid=post_aid,
                index=post_index,
                author=post_author,
                date=post_date,
                title=post_title,
                web_url=post_web,
                money=post_money,
                content=post_content,
                ip=ip,
                push_list=push_list,
                list_date=list_date,
                control_code=has_control_code,
                format_check=False,
                location=location,
                push_number=push_number,
                origin_post=origin_post,
                unconfirmed=api.Unconfirmed,
            )
            return post
        post_author = pattern_result.group(0)
        post_author = post_author[:post_author.rfind(')') + 1]
    post_author = post_author[4:].strip()

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Author,
        post_author
    )

    post_title_pattern = re.compile('標題  (.+)')

    title_line = origin_post_lines[1]
    pattern_result = post_title_pattern.search(title_line)
    if pattern_result is None:
        log.show_value(
            api.config,
            log.level.DEBUG,
            i18n.SubstandardPost,
            i18n.Title
        )
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            date=post_date,
            title=post_title,
            web_url=post_web,
            money=post_money,
            content=post_content,
            ip=ip,
            push_list=push_list,
            list_date=list_date,
            control_code=has_control_code,
            format_check=False,
            location=location,
            push_number=push_number,
            origin_post=origin_post,
            unconfirmed=api.Unconfirmed,
        )
        return post
    post_title = pattern_result.group(0)
    post_title = post_title[4:].strip()

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Title,
        post_title
    )

    post_date_pattern = re.compile('時間  (.+)')
    date_line = origin_post_lines[2]
    pattern_result = post_date_pattern.search(date_line)
    if pattern_result is None:
        log.show_value(
            api.config,
            log.level.DEBUG,
            i18n.SubstandardPost,
            i18n.Date
        )
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            date=post_date,
            title=post_title,
            web_url=post_web,
            money=post_money,
            content=post_content,
            ip=ip,
            push_list=push_list,
            list_date=list_date,
            control_code=has_control_code,
            format_check=False,
            location=location,
            push_number=push_number,
            origin_post=origin_post,
            unconfirmed=api.Unconfirmed,
        )
        return post
    post_date = pattern_result.group(0)
    post_date = post_date[4:].strip()

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Date,
        post_date
    )

    content_fail = True
    if content_start not in origin_post:
        # print('Type 1')
        content_fail = True
    else:
        post_content = origin_post
        post_content = post_content[
                       post_content.find(content_start) +
                       len(content_start) + 1:
                       ]
        # print('Type 2')
        # print(f'PostContent [{PostContent}]')
        for EC in content_end:
            # + 3 = 把 --\n 拿掉
            # print(f'EC [{EC}]')
            if EC in post_content:
                content_fail = False

                post_content = post_content[
                               :post_content.rfind(EC) + 3
                               ]
                origin_post_lines = origin_post[origin_post.find(EC):]
                # post_content = post_content.strip()
                origin_post_lines = origin_post_lines.split('\n')
                break

    if content_fail:
        log.show_value(
            api.config,
            log.level.DEBUG,
            i18n.SubstandardPost,
            i18n.Content
        )
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            date=post_date,
            title=post_title,
            web_url=post_web,
            money=post_money,
            content=post_content,
            ip=ip,
            push_list=push_list,
            list_date=list_date,
            control_code=has_control_code,
            format_check=False,
            location=location,
            push_number=push_number,
            origin_post=origin_post,
            unconfirmed=api.Unconfirmed,
        )
        return post

    log.show_value(
        api.config,
        log.level.DEBUG,
        i18n.Content,
        post_content
    )

    info_lines = [
        line for line in origin_post_lines if line.startswith('※') or
                                              line.startswith('◆')
    ]

    pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+')
    pattern_p2 = re.compile('[\d]+-[\d]+-[\d]+-[\d]+')
    for line in reversed(info_lines):
        log.show_value(
            api.config,
            log.level.DEBUG,
            'IP Line',
            line
        )

        # type 1
        # ※ 編輯: CodingMan (111.243.146.98 臺灣)
        # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 111.243.146.98 (臺灣)

        # type 2
        # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 116.241.32.178
        # ※ 編輯: kill77845 (114.136.55.237), 12/08/2018 16:47:59

        # type 3
        # ※ 發信站: 批踢踢實業坊(ptt.cc)
        # ◆ From: 211.20.78.69
        # ※ 編輯: JCC             來自: 211.20.78.69         (06/20 10:22)
        # ※ 編輯: JCC (118.163.28.150), 12/03/2015 14:25:35

        pattern_result = pattern.search(line)
        if pattern_result is not None:
            ip = pattern_result.group(0)
            location_temp = line[line.find(ip) + len(ip):].strip()
            location_temp = location_temp.replace('(', '')
            location_temp = location_temp[:location_temp.rfind(')')]
            location_temp = location_temp.strip()
            # print(f'=>[{LocationTemp}]')
            if ' ' not in location_temp and len(location_temp) > 0:
                location = location_temp
                log.show_value(api.config, log.level.DEBUG,
                               'Location', location)
            break

        pattern_result = pattern_p2.search(line)
        if pattern_result is not None:
            ip = pattern_result.group(0)
            ip = ip.replace('-', '.')
            # print(f'IP -> [{IP}]')
            break
    if api.config.host == data_type.host_type.PTT1:
        if ip is None:
            log.show_value(
                api.config,
                log.level.DEBUG,
                i18n.SubstandardPost,
                'IP'
            )
            post = data_type.PostInfo(
                board=board,
                aid=post_aid,
                index=post_index,
                author=post_author,
                date=post_date,
                title=post_title,
                web_url=post_web,
                money=post_money,
                content=post_content,
                ip=ip,
                push_list=push_list,
                list_date=list_date,
                control_code=has_control_code,
                format_check=False,
                location=location,
                push_number=push_number,
                origin_post=origin_post,
                unconfirmed=api.Unconfirmed,
            )
            return post
    log.show_value(api.config, log.level.DEBUG, 'IP', ip)

    push_author_pattern = re.compile('[推|噓|→] [\w| ]+:')
    push_date_pattern = re.compile('[\d]+/[\d]+ [\d]+:[\d]+')
    push_ip_pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+')

    push_list = []

    for line in origin_post_lines:
        if line.startswith('推'):
            push_type = data_type.push_type.PUSH
        elif line.startswith('噓 '):
            push_type = data_type.push_type.BOO
        elif line.startswith('→ '):
            push_type = data_type.push_type.ARROW
        else:
            continue

        result = push_author_pattern.search(line)
        if result is None:
            # 不符合推文格式
            continue
        push_author = result.group(0)[2:-1].strip()
        log.show_value(api.config, log.level.DEBUG, [
            i18n.Push,
            i18n.ID,
        ],
                       push_author
                       )

        result = push_date_pattern.search(line)
        if result is None:
            continue
        push_date = result.group(0)
        log.show_value(api.config, log.level.DEBUG, [
            i18n.Push,
            i18n.Date,
        ],
                       push_date
                       )

        push_ip = None
        result = push_ip_pattern.search(line)
        if result is not None:
            push_ip = result.group(0)
            log.show_value(
                api.config,
                log.level.DEBUG,
                [
                    i18n.Push,
                    'IP',
                ],
                push_ip
            )

        push_content = line[
                       line.find(push_author) + len(push_author):
                       ]
        # PushContent = PushContent.replace(PushDate, '')

        if api.config.host == data_type.host_type.PTT1:
            push_content = push_content[
                           :push_content.rfind(push_date)
                           ]
        else:
            # → CodingMan:What is Ptt?                                       推 10/04 13:25
            push_content = push_content[
                           :push_content.rfind(push_date) - 2
                           ]
        if push_ip is not None:
            push_content = push_content.replace(push_ip, '')
        push_content = push_content[
                       push_content.find(':') + 1:
                       ].strip()
        log.show_value(
            api.config,
            log.level.DEBUG, [
                i18n.Push,
                i18n.Content,
            ],
            push_content
        )

        current_push = data_type.PushInfo(
            push_type,
            push_author,
            push_content,
            push_ip,
            push_date
        )
        push_list.append(current_push)

    post = data_type.PostInfo(
        board=board,
        aid=post_aid,
        index=post_index,
        author=post_author,
        date=post_date,
        title=post_title,
        web_url=post_web,
        money=post_money,
        content=post_content,
        ip=ip,
        push_list=push_list,
        list_date=list_date,
        control_code=has_control_code,
        format_check=True,
        location=location,
        push_number=push_number,
        origin_post=origin_post,
        unconfirmed=api.Unconfirmed,
    )
    return post
예제 #5
0
def get_post(api,
             board: str,
             post_aid: str = None,
             post_index: int = 0,
             search_type: int = 0,
             search_condition: str = None,
             search_list: list = None,
             query: bool = False) -> data_type.PostInfo:
    api._goto_board(board)

    cmd_list = list()

    if post_aid is not None:
        cmd_list.append('#' + post_aid)

    elif post_index != 0:
        if search_condition is not None:
            if search_type == data_type.post_search_type.KEYWORD:
                cmd_list.append('/')
            elif search_type == data_type.post_search_type.AUTHOR:
                cmd_list.append('a')
            elif search_type == data_type.post_search_type.PUSH:
                cmd_list.append('Z')
            elif search_type == data_type.post_search_type.MARK:
                cmd_list.append('G')
            elif search_type == data_type.post_search_type.MONEY:
                cmd_list.append('A')

            cmd_list.append(search_condition)
            cmd_list.append(command.Enter)

        if search_list is not None:
            for search_type_, search_condition_ in search_list:

                if search_type_ == data_type.post_search_type.KEYWORD:
                    cmd_list.append('/')
                elif search_type_ == data_type.post_search_type.AUTHOR:
                    cmd_list.append('a')
                elif search_type_ == data_type.post_search_type.PUSH:
                    cmd_list.append('Z')
                elif search_type_ == data_type.post_search_type.MARK:
                    cmd_list.append('G')
                elif search_type_ == data_type.post_search_type.MONEY:
                    cmd_list.append('A')

                cmd_list.append(search_condition_)
                cmd_list.append(command.Enter)

        cmd_list.append(str(max(1, post_index - 100)))
        cmd_list.append(command.Enter)
        cmd_list.append(str(post_index))

    cmd_list.append(command.Enter)
    cmd_list.append(command.QueryPost)

    cmd = ''.join(cmd_list)

    target_list = [
        connect_core.TargetUnit([
            i18n.CatchPost,
            i18n.Success,
        ],
                                screens.Target.QueryPost,
                                break_detect=True,
                                refresh=False,
                                log_level=log.level.DEBUG),
        connect_core.TargetUnit([
            i18n.PostDeleted,
            i18n.Success,
        ],
                                screens.Target.InBoard,
                                break_detect=True,
                                log_level=log.level.DEBUG),
        connect_core.TargetUnit(i18n.NoSuchBoard,
                                screens.Target.MainMenu_Exiting,
                                exceptions_=exceptions.NoSuchBoard(
                                    api.config, board)),
    ]

    index = api.connect_core.send(cmd, target_list)
    ori_screen = api.connect_core.get_screen_queue()[-1]

    post_author = None
    post_title = None
    if index < 0 or index == 1:
        # 文章被刪除
        log.log(api.config, log.level.DEBUG, i18n.PostDeleted)

        log.show_value(api.config, log.level.DEBUG, 'OriScreen', ori_screen)

        cursor_line = [
            line for line in ori_screen.split('\n')
            if line.startswith(api.cursor)
        ]

        if len(cursor_line) != 1:
            raise exceptions.UnknownError(ori_screen)

        cursor_line = cursor_line[0]
        log.show_value(api.config, log.level.DEBUG, 'CursorLine', cursor_line)

        pattern = re.compile('[\d]+\/[\d]+')
        pattern_result = pattern.search(cursor_line)
        if pattern_result is None:
            list_date = None
        else:
            list_date = pattern_result.group(0)
            list_date = list_date[-5:]

        pattern = re.compile('\[[\w]+\]')
        pattern_result = pattern.search(cursor_line)
        if pattern_result is not None:
            post_del_status = data_type.post_delete_status.AUTHOR
        else:
            pattern = re.compile('<[\w]+>')
            pattern_result = pattern.search(cursor_line)
            post_del_status = data_type.post_delete_status.MODERATOR

        # > 79843     9/11 -             □ (本文已被吃掉)<
        # > 76060     8/28 -             □ (本文已被刪除) [weida7332]
        # print(f'O=>{CursorLine}<')
        if pattern_result is not None:
            post_author = pattern_result.group(0)[1:-1]
        else:
            post_author = None
            post_del_status = data_type.post_delete_status.UNKNOWN

        log.show_value(api.config, log.level.DEBUG, 'ListDate', list_date)
        log.show_value(api.config, log.level.DEBUG, 'PostAuthor', post_author)
        log.show_value(api.config, log.level.DEBUG, 'post_del_status',
                       post_del_status)

        return data_type.PostInfo(board=board,
                                  author=post_author,
                                  list_date=list_date,
                                  delete_status=post_del_status,
                                  format_check=True)

    elif index == 0:

        lock_post, post_author, post_title, post_aid, post_web, post_money, list_date, push_number, post_index = \
            _api_util.parse_query_post(
                api,
                ori_screen)

        if lock_post:
            post = data_type.PostInfo(board=board,
                                      aid=post_aid,
                                      index=post_index,
                                      author=post_author,
                                      title=post_title,
                                      web_url=post_web,
                                      money=post_money,
                                      list_date=list_date,
                                      format_check=True,
                                      push_number=push_number,
                                      lock=True)

            return post

    if query:
        post = data_type.PostInfo(board=board,
                                  aid=post_aid,
                                  index=post_index,
                                  author=post_author,
                                  title=post_title,
                                  web_url=post_web,
                                  money=post_money,
                                  list_date=list_date,
                                  format_check=True,
                                  push_number=push_number)
        return post

    origin_post, has_control_code = _api_util.get_content(api)

    if origin_post is None:
        post = data_type.PostInfo(board=board,
                                  aid=post_aid,
                                  index=post_index,
                                  author=post_author,
                                  title=post_title,
                                  web_url=post_web,
                                  money=post_money,
                                  list_date=list_date,
                                  control_code=has_control_code,
                                  format_check=False,
                                  push_number=push_number,
                                  unconfirmed=api.Unconfirmed)
        return post

    # print('=' * 20)
    # print(origin_post)
    # print('=' * 20)

    content_start = '───────────────────────────────────────'
    content_end = list()
    content_end.append('--\n※ 發信站: 批踢踢實業坊')
    content_end.append('--\n※ 發信站: 批踢踢兔(ptt2.cc)')
    content_end.append('--\n※ 發信站: 新批踢踢(ptt2.twbbs.org.tw)')

    post_author_pattern_new = re.compile('作者  (.+) 看板')
    post_author_pattern_old = re.compile('作者  (.+)')
    board_pattern = re.compile('看板  (.+)')

    post_date = None
    post_content = list()
    ip = None
    location = None
    push_list = list()

    # 格式確認,亂改的我也沒辦法Q_Q
    origin_post_lines = origin_post.split('\n')

    author_line = origin_post_lines[0]

    if board.lower() == 'allpost':
        board_line = author_line[author_line.find(')') + 1:]
        pattern_result = board_pattern.search(board_line)
        if pattern_result is not None:
            board_temp = post_author = pattern_result.group(0)
            board_temp = board_temp[2:].strip()
            if len(board_temp) > 0:
                board = board_temp
                log.show_value(api.config, log.level.DEBUG, i18n.Board, board)
    pattern_result = post_author_pattern_new.search(author_line)
    if pattern_result is not None:
        post_author = pattern_result.group(0)
        post_author = post_author[:post_author.rfind(')') + 1]
    else:
        pattern_result = post_author_pattern_old.search(author_line)
        if pattern_result is None:
            log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost,
                           i18n.Author)
            post = data_type.PostInfo(
                board=board,
                aid=post_aid,
                index=post_index,
                author=post_author,
                date=post_date,
                title=post_title,
                web_url=post_web,
                money=post_money,
                content=post_content,
                ip=ip,
                push_list=push_list,
                list_date=list_date,
                control_code=has_control_code,
                format_check=False,
                location=location,
                push_number=push_number,
                origin_post=origin_post,
                unconfirmed=api.Unconfirmed,
            )
            return post
        post_author = pattern_result.group(0)
        post_author = post_author[:post_author.rfind(')') + 1]
    post_author = post_author[4:].strip()

    log.show_value(api.config, log.level.DEBUG, i18n.Author, post_author)

    post_title_pattern = re.compile('標題  (.+)')

    title_line = origin_post_lines[1]
    pattern_result = post_title_pattern.search(title_line)
    if pattern_result is None:
        log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost,
                       i18n.Title)

        post = data_type.PostInfo(board=board,
                                  aid=post_aid,
                                  index=post_index,
                                  author=post_author,
                                  date=post_date,
                                  title=post_title,
                                  web_url=post_web,
                                  money=post_money,
                                  content=post_content,
                                  ip=ip,
                                  push_list=push_list,
                                  list_date=list_date,
                                  control_code=has_control_code,
                                  format_check=False,
                                  location=location,
                                  push_number=push_number,
                                  origin_post=origin_post,
                                  unconfirmed=api.Unconfirmed)
        return post
    post_title = pattern_result.group(0)
    post_title = post_title[4:].strip()

    log.show_value(api.config, log.level.DEBUG, i18n.Title, post_title)

    post_date_pattern = re.compile('時間  .{24}')
    date_line = origin_post_lines[2]
    pattern_result = post_date_pattern.search(date_line)
    if pattern_result is None:
        log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost,
                       i18n.Date)
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            date=post_date,
            title=post_title,
            web_url=post_web,
            money=post_money,
            content=post_content,
            ip=ip,
            push_list=push_list,
            list_date=list_date,
            control_code=has_control_code,
            format_check=False,
            location=location,
            push_number=push_number,
            origin_post=origin_post,
            unconfirmed=api.Unconfirmed,
        )
        return post
    post_date = pattern_result.group(0)
    post_date = post_date[4:].strip()

    log.show_value(api.config, log.level.DEBUG, i18n.Date, post_date)

    content_fail = True
    if content_start not in origin_post:
        # print('Type 1')
        content_fail = True
    else:
        post_content = origin_post
        post_content = post_content[post_content.find(content_start) +
                                    len(content_start) + 1:]
        # print('Type 2')
        # print(f'PostContent [{PostContent}]')
        for EC in content_end:
            # + 3 = 把 --\n 拿掉
            # print(f'EC [{EC}]')
            if EC in post_content:
                content_fail = False

                post_content = post_content[:post_content.rfind(EC) + 3]
                origin_post_lines = origin_post[origin_post.find(EC):]
                # post_content = post_content.strip()
                origin_post_lines = origin_post_lines.split('\n')
                break

    if content_fail:
        log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost,
                       i18n.Content)
        post = data_type.PostInfo(
            board=board,
            aid=post_aid,
            index=post_index,
            author=post_author,
            date=post_date,
            title=post_title,
            web_url=post_web,
            money=post_money,
            content=post_content,
            ip=ip,
            push_list=push_list,
            list_date=list_date,
            control_code=has_control_code,
            format_check=False,
            location=location,
            push_number=push_number,
            origin_post=origin_post,
            unconfirmed=api.Unconfirmed,
        )
        return post

    log.show_value(api.config, log.level.DEBUG, i18n.Content, post_content)

    info_lines = [
        line for line in origin_post_lines
        if line.startswith('※') or line.startswith('◆')
    ]

    pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+')
    pattern_p2 = re.compile('[\d]+-[\d]+-[\d]+-[\d]+')
    for line in reversed(info_lines):
        log.show_value(api.config, log.level.DEBUG, 'IP Line', line)

        # type 1
        # ※ 編輯: CodingMan (111.243.146.98 臺灣)
        # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 111.243.146.98 (臺灣)

        # type 2
        # ※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 116.241.32.178
        # ※ 編輯: kill77845 (114.136.55.237), 12/08/2018 16:47:59

        # type 3
        # ※ 發信站: 批踢踢實業坊(ptt.cc)
        # ◆ From: 211.20.78.69
        # ※ 編輯: JCC             來自: 211.20.78.69         (06/20 10:22)
        # ※ 編輯: JCC (118.163.28.150), 12/03/2015 14:25:35

        pattern_result = pattern.search(line)
        if pattern_result is not None:
            ip = pattern_result.group(0)
            location_temp = line[line.find(ip) + len(ip):].strip()
            location_temp = location_temp.replace('(', '')
            location_temp = location_temp[:location_temp.rfind(')')]
            location_temp = location_temp.strip()
            # print(f'=>[{LocationTemp}]')
            if ' ' not in location_temp and len(location_temp) > 0:
                location = location_temp
                log.show_value(api.config, log.level.DEBUG, 'Location',
                               location)
            break

        pattern_result = pattern_p2.search(line)
        if pattern_result is not None:
            ip = pattern_result.group(0)
            ip = ip.replace('-', '.')
            # print(f'IP -> [{IP}]')
            break
    if api.config.host == data_type.host_type.PTT1:
        if ip is None:
            log.show_value(api.config, log.level.DEBUG, i18n.SubstandardPost,
                           'IP')
            post = data_type.PostInfo(
                board=board,
                aid=post_aid,
                index=post_index,
                author=post_author,
                date=post_date,
                title=post_title,
                web_url=post_web,
                money=post_money,
                content=post_content,
                ip=ip,
                push_list=push_list,
                list_date=list_date,
                control_code=has_control_code,
                format_check=False,
                location=location,
                push_number=push_number,
                origin_post=origin_post,
                unconfirmed=api.Unconfirmed,
            )
            return post
    log.show_value(api.config, log.level.DEBUG, 'IP', ip)

    push_author_pattern = re.compile('[推|噓|→] [\w| ]+:')
    push_date_pattern = re.compile('[\d]+/[\d]+ [\d]+:[\d]+')
    push_ip_pattern = re.compile('[\d]+\.[\d]+\.[\d]+\.[\d]+')

    push_list = list()

    for line in origin_post_lines:
        if line.startswith('推'):
            push_type = data_type.push_type.PUSH
        elif line.startswith('噓 '):
            push_type = data_type.push_type.BOO
        elif line.startswith('→ '):
            push_type = data_type.push_type.ARROW
        else:
            continue

        result = push_author_pattern.search(line)
        if result is None:
            # 不符合推文格式
            continue
        push_author = result.group(0)[2:-1].strip()
        log.show_value(api.config, log.level.DEBUG, [
            i18n.Push,
            i18n.ID,
        ], push_author)

        result = push_date_pattern.search(line)
        if result is None:
            continue
        push_date = result.group(0)
        log.show_value(api.config, log.level.DEBUG, [
            i18n.Push,
            i18n.Date,
        ], push_date)

        push_ip = None
        result = push_ip_pattern.search(line)
        if result is not None:
            push_ip = result.group(0)
            log.show_value(api.config, log.level.DEBUG, [
                i18n.Push,
                'IP',
            ], push_ip)

        push_content = line[line.find(push_author) + len(push_author):]
        # PushContent = PushContent.replace(PushDate, '')

        if api.config.host == data_type.host_type.PTT1:
            push_content = push_content[:push_content.rfind(push_date)]
        else:
            # → CodingMan:What is Ptt?                                       推 10/04 13:25
            push_content = push_content[:push_content.rfind(push_date) - 2]
        if push_ip is not None:
            push_content = push_content.replace(push_ip, '')
        push_content = push_content[push_content.find(':') + 1:].strip()
        log.show_value(api.config, log.level.DEBUG, [
            i18n.Push,
            i18n.Content,
        ], push_content)

        current_push = data_type.PushInfo(push_type, push_author, push_content,
                                          push_ip, push_date)
        push_list.append(current_push)

    post = data_type.PostInfo(
        board=board,
        aid=post_aid,
        index=post_index,
        author=post_author,
        date=post_date,
        title=post_title,
        web_url=post_web,
        money=post_money,
        content=post_content,
        ip=ip,
        push_list=push_list,
        list_date=list_date,
        control_code=has_control_code,
        format_check=True,
        location=location,
        push_number=push_number,
        origin_post=origin_post,
        unconfirmed=api.Unconfirmed,
    )
    return post