Example #1
    def wca_url_request_handler(self, target_url, post_data, timeout,
                                target_page_word, logpath):
        """Universal URL request format handler

        @@API that allows external calls
        If no need log, set log path to None
        :param target_url:          target request url
        :param post_data:           post way data
        :param timeout:             request timeout, suggest 30s
        :param target_page_word:    target page symbol word
        :param logpath:             log save path
        :return:                    request result response(raw)
        """
        try:
            response = self.opener.open(fullurl=target_url,
                                        data=post_data,
                                        timeout=timeout)
        except Exception as e:
            log_content = dl.BR_CB('%s response failed, error: %s' %
                                   (target_page_word, str(e)))
            self.wca_logprowork(logpath, log_content)
            exit(dl.PUB_E_RESPONSE_FAIL)  # unrecoverable error, exit directly

        if response.getcode() == dl.HTTP_REP_OK_CODE:
            log_content = target_page_word + ' response ok'
        else:
            log_content = dl.BR_CB(target_page_word +
                                   ' return code %d' % response.getcode())
        self.wca_logprowork(logpath, log_content)

        return response
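The handler above is a thin wrapper around urllib's OpenerDirector.open() plus a status-code check and logging. A minimal standalone sketch of the same pattern, using a plain opener and example.com as a stand-in target:

    import urllib.request

    def fetch(target_url, post_data=None, timeout=30):
        opener = urllib.request.build_opener()  # stands in for self.opener above
        try:
            response = opener.open(target_url, data=post_data, timeout=timeout)
        except Exception as e:
            print('response failed, error: %s' % e)
            return None
        # getcode() returns the HTTP status; 200 plays the role of dl.HTTP_REP_OK_CODE
        if response.getcode() == 200:
            print('response ok')
        return response

    response = fetch('https://example.com')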
Example #2
    def wca_camouflage_login(self):
        """Camouflage browser to login

        If login failed, program will exit here
        @@API that allows external calls
        :return:        status code
        """
        if WkvCwApi._login_once_flag:
            return dl.PUB_E_OK
        WkvCwApi._login_once_flag = True

        if self._gatherpostkey() != dl.PUB_E_OK:
            exit(dl.PUB_E_RESPONSE_FAIL)

        cookie_jar = self._get_chrome_cookie(dl.local_cache_cookie_path,
                                             dl.HTTPS_HOST_URL)
        self.cookieHandler = urllib.request.HTTPCookieProcessor(cookie_jar)
        self.opener = urllib.request.build_opener(self.cookieHandler)
        urllib.request.install_opener(self.opener)

        response = self.wca_url_request_handler(
            target_url=dl.LOGIN_REQUEST_API_URL,
            post_data=self.postway_data,
            timeout=30,
            target_page_word='login',
            logpath=None)
        # defensive check: the handler exits on failure itself, so this branch is rarely reached
        if response == dl.PUB_E_RESPONSE_FAIL:
            dl.LT_PRINT(
                dl.BR_CB('login response returned a boolean FALSE, exit'))
            exit(dl.PUB_E_RESPONSE_FAIL)

        web_src = response.read().decode("UTF-8", "ignore")
        dl.LT_PRINT(
            dl.BY_CB('response source: %s' %
                     web_src.encode("UTF-8").decode("unicode_escape")))

        login_info_pattern = re.compile(dl.LOGIN_INFO_REGEX, re.S)
        response_info = re.findall(login_info_pattern, web_src)
        if response_info:
            if response_info[0] != 'false':
                # an error value of 'false' means no error occurred
                dl.LT_PRINT(dl.BR_CB('login confirm raised an error, exit'))
                exit(dl.PUB_E_RESPONSE_FAIL)
            else:
                dl.LT_PRINT('login check response OK')
        else:
            dl.LT_PRINT('login confirm response has no error status field')
            exit(dl.PUB_E_RESPONSE_FAIL)

        return dl.PUB_E_OK
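The login path wires a cookie jar into a urllib opener and installs it process-wide, which is what makes every later request carry the session cookies. A minimal sketch with a fresh CookieJar instead of cookies imported from Chrome:

    import http.cookiejar
    import urllib.request

    cookie_jar = http.cookiejar.CookieJar()        # _get_chrome_cookie() fills this from disk instead
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
    opener = urllib.request.build_opener(cookie_handler)
    urllib.request.install_opener(opener)          # urlopen() now uses this opener everywhere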
Example #3
    def rtn_gather_essential_info(page_opt, whole_nbr):
        """Get input image count

        If user input number more than whole number, set target count is whole number
        Only intercative mode call this function
        :param page_opt:      select ranktop ordinary or r18 mode
        :param whole_nbr:   whole ranking crawl count
        :return:            crawl images count
        """
        img_cnt = 0

        if page_opt == dl.PAGE_ORDINARY:
            label = 'ordinary'
        elif page_opt == dl.PAGE_R18:
            label = 'r18'
        elif page_opt == dl.PAGE_R18G:
            label = 'r18g'
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        img_str = dl.LT_INPUT(dl.HL_CY('crawl %s valid target %d, enter the count you want: ' % (label, whole_nbr)))

        while not img_str.isdigit():
            img_str = dl.LT_INPUT(dl.HL_CY('input error, enter again(max is %d): ' % whole_nbr))
        img_cnt = int(img_str)
        if img_cnt <= 0:
            dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
            return dl.PUB_E_PARAM_FAIL

        if img_cnt > whole_nbr:
            img_cnt = whole_nbr

        return img_cnt
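Stripped of the dl helpers, the validate-then-clamp loop reduces to a few lines; whole_nbr here is a hypothetical total:

    whole_nbr = 50  # hypothetical ranking total
    img_str = input('enter the count you want: ')
    while not img_str.isdigit():                   # isdigit() also rejects negative input
        img_str = input('input error, enter again (max is %d): ' % whole_nbr)
    img_cnt = min(int(img_str), whole_nbr)         # clamp to the total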
Example #4
    def wca_commit_spansizer(whole_pattern, info_pattern, web_src):
        """A sizer for all of images in once commit item

        @@API that allows external calls
        After Pixiv 20181002 update, this method only support mode rtn
        :param whole_pattern:   whole info data regex compile pattern
        :param info_pattern:    image info regex compile pattern
        :param web_src:         webpage source
        :return:                original target url list & image info list dict
        """
        img_info_lst = []
        tgt_url_lst = []

        datasrc_pattern = re.compile(dl.DATASRC_REGEX, re.S)
        span_pattern = re.compile(dl.SPAN_REGEX, re.S)
        img_whole_info = re.findall(whole_pattern, web_src)

        # images come in 3 formats: jpg/png/gif
        # this crawler gives up gif and crawls png or jpg only
        # one pixiv repository may contain multiple images
        for item in img_whole_info:
            tmp_thumbnail = re.findall(datasrc_pattern, item)
            if not tmp_thumbnail:
                dl.LT_PRINT(dl.BR_CB('span sizer regex cannot get valid info'))
                return dl.PUB_E_FAIL

            thumbnail = tmp_thumbnail[0]
            judge_word = thumbnail[-18:]
            # check jpg/png or gif
            if judge_word == dl.JUDGE_NOGIF_WORD:
                span_word = re.findall(span_pattern, item)
                valid_word = thumbnail[44:-18]

                # check for multi-span (multi-image) commits
                if span_word:
                    for _px in range(int(span_word[0])):
                        info = re.findall(info_pattern, item)[0]
                        img_info_lst.append(info)
                        # extra page pointers, range 0~span-1
                        target_url = dl.ORIGINAL_IMAGE_HEAD + valid_word + dl.ORIGINAL_IMAGE_TAIL(
                            _px)
                        tgt_url_lst.append(target_url)
                # just one picture in this commit
                else:
                    info = re.findall(info_pattern, item)[0]
                    img_info_lst.append(info)
                    # only the _p0 page
                    target_url = dl.ORIGINAL_IMAGE_HEAD + valid_word + dl.ORIGINAL_IMAGE_TAIL(
                        0)
                    tgt_url_lst.append(target_url)
            # otherwise give up the gif format

        return {'url lst': tgt_url_lst, 'info lst': img_info_lst}
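The sizer is built on nested re.findall() passes: an outer pattern isolates each commit block, and inner patterns pull the thumbnail and the span count out of it. A toy sketch of that nesting, with made-up markup and patterns:

    import re

    web_src = '<li><img data-src="a.jpg"><span>3</span></li><li><img data-src="b.jpg"></li>'
    item_pattern = re.compile(r'<li>(.*?)</li>', re.S)
    datasrc_pattern = re.compile(r'data-src="(.*?)"', re.S)
    span_pattern = re.compile(r'<span>(\d+)</span>', re.S)

    for item in re.findall(item_pattern, web_src):
        thumbnail = re.findall(datasrc_pattern, item)[0]
        span_word = re.findall(span_pattern, item)
        pages = int(span_word[0]) if span_word else 1   # multi-image commit vs a single image
        print(thumbnail, pages)                         # a.jpg 3, then b.jpg 1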
Example #5
    def rtn_gather_rankingdata(self):
        """Crawl dailyRank list

        :return:        status code
        """
        response = self.wkv_cw_api.wca_url_request_handler(
            target_url=self.rtn_req_url,
            post_data=self.wkv_cw_api.getway_data,
            timeout=30,
            target_page_word='rankpage',
            logpath=self.logpath)

        # size info in webpage source
        web_src = response.read().decode("UTF-8", "ignore")
        imgitem_pattern = re.compile(dl.RANKING_SECTION_REGEX, re.S)
        info_pattern    = re.compile(dl.RANKING_INFO_REGEX, re.S)
        sizer_result    = self.wkv_cw_api.wca_commit_spansizer(imgitem_pattern, info_pattern, web_src)
        if sizer_result == dl.PUB_E_FAIL:
            return dl.PUB_E_FAIL
        url_lst         = sizer_result['url lst']
        img_info_lst    = sizer_result['info lst']

        # trim the url list down to the requested image count
        valid_url_cnt = len(url_lst)
        if self.ir_mode == dl.MODE_INTERACTIVE:
            img_nbr = self.rtn_gather_essential_info(self.page_opt, valid_url_cnt)
            if img_nbr == dl.PUB_E_PARAM_FAIL:
                return dl.PUB_E_FAIL
        elif self.ir_mode == dl.MODE_SERVER:
            img_nbr = valid_url_cnt             # server mode directly takes all alive targets
            dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
        else:
            dl.nolog_raise_arguerr()            # unknown mode, otherwise img_nbr is unbound
            return dl.PUB_E_FAIL
        self.rtn_target_urls = url_lst[:img_nbr]

        log_content = dl.BY_CB('crawl ranking top ' + str(img_nbr) + ', target table:')
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
        image_info_table = PrettyTable(
            ["ImageNumber", "ImageID", "ImageTitle", "ImageID+PageNumber", "AuthorID", "AuthorName"])
        for k, i in enumerate(img_info_lst[:img_nbr]):
            self.rtn_basepages.append(dl.BASEPAGE_URL(i[3]))        # used as the request referer header
            image_info_table.add_row([(k + 1), i[3], i[1], dl.FROM_URL_GET_IMG_NAME(self.rtn_target_urls[k]), i[4], i[2]])

        # damn emoji, the table dump may fail
        try:
            self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
        except Exception as e:
            dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))

        return dl.PUB_E_OK
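PrettyTable renders the target table that gets written to the log. A minimal sketch of the two calls used above, with made-up rows (pip install prettytable):

    from prettytable import PrettyTable

    image_info_table = PrettyTable(["ImageNumber", "ImageID", "ImageTitle"])
    image_info_table.add_row([1, 70337017, 'some title'])     # hypothetical row
    image_info_table.add_row([2, 70337123, 'another title'])  # hypothetical row
    print(str(image_info_table))  # the crawler hands str(table) to its log writer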
Example #6
        def create(self):
            """Create a new thread

            Use built-in queue to manage threads list
            :return:    status flag
            """
            with self.lock_t:               # guard the shared thread list
                self.queue_t.append(self)
            # if the system has insufficient memory
            # it will not be able to create more threads
            # this step will fail
            try:
                self.start()
            except Exception as e:
                log_content = dl.BR_CB("Error type: " + str(e))
                WkvCwApi.wca_logprowork(self.logpath, log_content)
                return dl.PUB_E_FAIL
            return dl.PUB_E_OK
Example #7
        def run(self):
            """Overwrite threading.thread run() method

            :return:    none
            """
            try:
                # package one image download into this thread
                # server mode is set by default here (actually it doesn't matter)
                WkvCwApi(2)._save_oneimage(self.index, self.url,
                    self.basepages, self.workdir, self.logpath)
            except Exception as e:
                log_content = dl.BR_CB("Error type: " + str(e))
                WkvCwApi.wca_logprowork(self.logpath, log_content)

            self.lock_t.acquire()           # adjust the thread queue, lock it
            self.queue_t.remove(self)       # remove the finished thread from the list
            if len(self.queue_t) == dl.SYSTEM_MAX_THREADS - 1:
                self.event_t.set()          # a slot freed up, wake the blocked creator
                self.event_t.clear()
            self.lock_t.release()
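Examples #6 and #7 together form a hand-rolled thread pool: a lock-protected list tracks live threads, and an event wakes the creator when a slot frees up. A self-contained sketch of the pattern, with MAX_THREADS standing in for dl.SYSTEM_MAX_THREADS and a sleep standing in for the download work:

    import threading
    import time

    MAX_THREADS = 4

    class Worker(threading.Thread):
        lock_t = threading.Lock()
        event_t = threading.Event()
        queue_t = []

        def create(self):
            with Worker.lock_t:                 # register before starting
                Worker.queue_t.append(self)
            try:
                self.start()
            except Exception as e:              # e.g. out of memory
                print('Error type: %s' % e)
                return False
            return True

        def run(self):
            time.sleep(0.1)                     # stand-in for one image download
            with Worker.lock_t:
                Worker.queue_t.remove(self)
                # mirrors the source: signal the creator that a slot opened up
                if len(Worker.queue_t) == MAX_THREADS - 1:
                    Worker.event_t.set()
                    Worker.event_t.clear()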
Example #8
    def wca_download_alltarget(self, logpath, urls, basepages, workdir):
        """Multi-process download all image

        @@API that allows external calls
        :param urls:        all original images urls
        :param basepages:   all referer basic pages
        :param workdir:     work directory
        :param logpath:     log save path
        :return:            none
        """
        thread_block_flag = False  # thread blocking flag
        alive_thread_cnt = queue_length = len(urls)
        log_content = dl.BY_CB('hit %d target(s), start download task(s)' %
                               queue_length)
        self.wca_logprowork(logpath, log_content)

        # catch timeouts and user interrupts, and exit the failed thread
        try:
            for i, one_url in enumerate(urls):
                self._MultiThreading.lock_t.acquire()
                if len(self._MultiThreading.queue_t) > dl.SYSTEM_MAX_THREADS:
                    thread_block_flag = True
                    self._MultiThreading.lock_t.release()
                    # if the number of created threads reaches the max limit
                    # the program blocks here until a running thread finishes
                    # when one thread ends, the next one is created
                    self._MultiThreading.event_t.wait()
                else:
                    self._MultiThreading.lock_t.release()

                # build overwrite threading.Thread object
                sub_thread = self._MultiThreading(i, one_url, basepages,
                                                  workdir, logpath)
                # set every download sub-thread's daemon property
                # if false, exiting one thread will not end the others
                # if true, quit one is quit all
                sub_thread.daemon = True
                # if create this sub-thread failed from function
                if sub_thread.create() == dl.PUB_E_FAIL:
                    log_content = dl.BR_CB('create a new sub-thread failed')
                    print(log_content)
                    return dl.PUB_E_FAIL

                if not thread_block_flag:
                    log_content = dl.BY_CB(
                        'created {:d} download target object(s)')
                else:
                    log_content = dl.BY_CB(
                        'created {:d} download target object(s), thread creation is blocked, please wait'
                    )
                dl.LT_FLUSH(log_content, i + 1)
            print(dl.BY_CB(', all threads have been loaded OK'))
            thread_block_flag = False

            # parent thread wait all sub-thread end
            # the count of all threads is 1 parent thread and n sub-thread(s)
            # when all pictures have been downloaded over, thread count is 1
            while alive_thread_cnt > 1:
                # global variable update
                self.alivethread_counter = threading.active_count()
                # when alive thread count change, print its value
                if alive_thread_cnt != self.alivethread_counter:
                    alive_thread_cnt = self.alivethread_counter  # update alive thread count
                    # display alive sub-thread count
                    # its number wouldn't more than thread max count
                    log_content = dl.BY_CB(
                        'currently remaining sub-thread(s):({:4d}/{:4d}), completed:({:4.1%})|({:5.2f}MB)'
                    )
                    dl.LT_FLUSH(log_content, alive_thread_cnt - 1, queue_length,
                                ((queue_length -
                                  (alive_thread_cnt - 1)) / queue_length),
                                (float(WkvCwApi._datastream_pool / 1024)))
            print(dl.BY_CB(', sub-threads execute finished'))
        except KeyboardInterrupt:
            print(dl.BY_CB(', user interrupt a thread, exit all threads'))
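After all threads are spawned, the parent does not join() them; it polls threading.active_count() until only the main thread is left. A compact, runnable sketch of that wait loop:

    import threading
    import time

    def work():
        time.sleep(0.2)  # stand-in for one download

    for _ in range(8):
        t = threading.Thread(target=work)
        t.daemon = True   # quit one is quit all, as in the source
        t.start()

    alive_thread_cnt = threading.active_count()
    while alive_thread_cnt > 1:                       # 1 == only the parent thread left
        alive_thread_cnt = threading.active_count()
        time.sleep(0.05)  # avoid a hot spin; the crawler prints progress here instead
    print('sub-threads execute finished')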
Example #9
    def rtn_target_confirm(self):
        """Input option and confirm target

        :return:        status code
        """
        req_url     = None      # request target ranking url
        rank_word   = None      # ranking word
        dwm_opt     = None      # daily/weekly/monthly

        if self.ir_mode == dl.MODE_INTERACTIVE:
            page_opt    = dl.LT_INPUT(dl.HL_CY('select ranking type, ordinary(1) | r18(2) | r18g(3): '))
            sex_opt     = dl.LT_INPUT(dl.HL_CY('select sex favor, normal(0) | male(1) | female(2): '))
        elif self.ir_mode == dl.MODE_SERVER:
            page_opt    = self.rtn_r18_arg
            sex_opt     = self.rtn_sex_opt
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        if page_opt == dl.PAGE_ORDINARY:
            if self.ir_mode == dl.MODE_INTERACTIVE:
                dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1) | weekly(2) | monthly(3) ordinary ranking type: '))
            elif self.ir_mode == dl.MODE_SERVER:
                dwm_opt = self.rtn_rank_type
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL

            if dwm_opt == dl.RANK_DAILY:
                if sex_opt == dl.SEX_NORMAL:
                    req_url     = dl.RANK_DAILY_URL
                    rank_word   = dl.DAILY_WORD
                elif sex_opt == dl.SEX_MALE:
                    req_url     = dl.RANK_DAILY_MALE_URL
                    rank_word   = dl.MALE_WORD
                elif sex_opt == dl.SEX_FEMALE:
                    req_url     = dl.RANK_DAILY_FEMALE_URL
                    rank_word   = dl.FEMALE_WORD
                else:
                    dl.nolog_raise_arguerr()
                    return dl.PUB_E_PARAM_FAIL
            elif dwm_opt == dl.RANK_WEEKLY:
                req_url     = dl.RANK_WEEKLY_URL
                rank_word   = dl.WEEKLY_WORD
            elif dwm_opt == dl.RANK_MONTHLY:
                req_url     = dl.RANK_MONTHLY_URL
                rank_word   = dl.MONTHLY_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL

        elif page_opt == dl.PAGE_R18:
            if self.ir_mode == dl.MODE_INTERACTIVE:
                dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1)/weekly(2) R18 ranking type: '))
            elif self.ir_mode == dl.MODE_SERVER:
                dwm_opt = self.rtn_rank_type
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL
            if dwm_opt == dl.RANK_DAILY:
                if sex_opt == dl.SEX_NORMAL:
                    req_url     = dl.RANK_DAILY_R18_URL
                    rank_word   = dl.DAILY_WORD
                elif sex_opt == dl.SEX_MALE:
                    req_url     = dl.RANK_DAILY_MALE_R18_URL
                    rank_word   = dl.MALE_WORD
                elif sex_opt == dl.SEX_FEMALE:
                    req_url     = dl.RANK_DAILY_FEMALE_R18_URL
                    rank_word   = dl.FEMALE_WORD
                else:
                    dl.nolog_raise_arguerr()
                    return dl.PUB_E_PARAM_FAIL
            elif dwm_opt == dl.RANK_WEEKLY:
                req_url     = dl.RANK_WEEKLY_R18_URL
                rank_word   = dl.WEEKLY_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL

        elif page_opt == dl.PAGE_R18G:
            req_url     = dl.RANK_R18G_URL
            rank_word   = dl.R18G_WORD
            dl.LT_PRINT(dl.BR_CB('warning: you chose the r18g rank, hope you know what it means'))

        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        log_content = dl.BY_CB('based on the selected options, set rank target url: [%s]' % req_url)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
        self.rtn_req_url    = req_url
        self.page_opt       = page_opt

        return dl.PUB_E_OK
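The if/elif ladder is essentially a lookup from (page type, rank type, sex option) to a URL and a ranking word. The same dispatch can be sketched as a dict, with illustrative stand-ins for the dl constants:

    # keys and values are illustrative stand-ins, not the real dl constants
    RANK_TABLE = {
        ('ordinary', 'daily', 'normal'): ('RANK_DAILY_URL', 'daily'),
        ('ordinary', 'daily', 'male'):   ('RANK_DAILY_MALE_URL', 'male'),
        ('r18',      'daily', 'normal'): ('RANK_DAILY_R18_URL', 'daily'),
    }

    selection = ('ordinary', 'daily', 'normal')
    try:
        req_url, rank_word = RANK_TABLE[selection]
    except KeyError:
        raise SystemExit('invalid ranking selection')  # plays the role of nolog_raise_arguerr()

A table like this keeps every valid combination in one place, so adding a ranking type is one entry instead of a new branch.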
Example #10
    def ira_crawl_allpage_target(self):
        """Package all gather urls

        :return:            status code
        """
        require_page_cnt = 0

        if self.ira_max_cnt <= dl.ONE_PAGE_COMMIT:
            require_page_cnt = 1
        else:
            require_page_cnt = int(self.ira_max_cnt / dl.ONE_PAGE_COMMIT)
            # remainder decision
            if self.ira_max_cnt % dl.ONE_PAGE_COMMIT != 0:
                require_page_cnt += 1

        # build the json data url
        iid_string_tail     = ''
        subpage_url_list    = []
        for ix in range(require_page_cnt):
            # one subpage includes only 6*8 valid images, the others are invalid
            tmp_tail_nbr = dl.ONE_PAGE_COMMIT * (ix + 1)
            tmp_tail_nbr = min(tmp_tail_nbr, self.ira_max_cnt)

            for index in self.ira_pure_idlist[(dl.ONE_PAGE_COMMIT * ix):tmp_tail_nbr]:
                iid_string_tail += dl.IDS_UNIT(index)
            subpage_url_list.append(dl.ALLREPOINFO_URL(self.user_input_id, iid_string_tail, 1 if ix == 0 else 0))
            iid_string_tail = ''                            # reset the cache for the next page

        # get all data from response xhr page into a temp list
        tmp_receive_list    = []
        tmp_ret             = []
        for i in range(require_page_cnt):
            tmp_ret = self.ira_crawl_subpage_data(i + 1, subpage_url_list[i])
            if not isinstance(tmp_ret, list):
                return dl.PUB_E_FAIL
            tmp_receive_list += tmp_ret

        repo_target_all_list = []
        for i in range(len(tmp_receive_list)):
            tmp_receive_list[i][1] = dl.UNICODE_ESCAPE(tmp_receive_list[i][1])
            tmp_receive_list[i][1] = dl.EMOJI_REPLACE(tmp_receive_list[i][1])
            # build original url without image format
            tmp = tmp_receive_list[i][2]
            tmp = tmp.replace('\\', '')
            tmp_receive_list[i][2] = dl.ORIGINAL_IMAGE_HEAD + tmp[-39:-7] + '.png'  # first original url
            repo_target_all_list.append(tmp_receive_list[i])

            # add other original image url by pageCount
            tmp_page_count_str = tmp_receive_list[i][3]
            if tmp_page_count_str.isdigit():
                index_page_count = int(tmp_page_count_str)
                if index_page_count != 1:
                    for px in range(index_page_count):
                        insert_item = [tmp_receive_list[i][0], 
                                        tmp_receive_list[i][1], 
                                        tmp_receive_list[i][2][:-5] + str(px) + '.png', 
                                        tmp_receive_list[i][3]]
                        repo_target_all_list.append(insert_item)
            else:
                log_content = dl.BR_CB('page count process error')
                self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
                return dl.PUB_E_FAIL
        del tmp_receive_list

        alive_target_cnt    = len(repo_target_all_list)
        require_img_nbr     = 0

        if self.ir_mode == dl.MODE_INTERACTIVE:
            require_img_str = dl.LT_INPUT(dl.HL_CY('crawl all repo %d, whole target(s): %d, enter the count you want: '
                % (self.ira_max_cnt, alive_target_cnt)))
            # if the user input isn't a number
            while not require_img_str.isdigit():
                dl.LT_PRINT(dl.BR_CB('input error, your input was not a decimal number'))
                require_img_str = dl.LT_INPUT(dl.HL_CY('enter again(max is %d): ' % alive_target_cnt))
            require_img_nbr = int(require_img_str)
            if require_img_nbr <= 0:
                dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
                return dl.PUB_E_PARAM_FAIL
            require_img_nbr = min(require_img_nbr, alive_target_cnt)

        elif self.ir_mode == dl.MODE_SERVER:
            require_img_nbr = alive_target_cnt
            dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        for i in repo_target_all_list[:require_img_nbr]:
            self.ira_target_capture.append(i[2])
            self.ira_basepages.append(dl.BASEPAGE_URL(i[0]))

        log_content = 'illustrator [%s] id [%s], require image(s): %d, target table:' \
            % (self.ira_author_name, self.user_input_id, require_img_nbr)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

        image_info_table = PrettyTable(["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
        for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
            image_info_table.add_row([(k + 1), i[0], i[1], dl.FROM_URL_GET_IMG_NAME(i[2])])

        # damn emoji, the table dump may fail
        try:
            self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
        except Exception as e:
            dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))
        del repo_target_all_list

        return dl.PUB_E_OK
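The require_page_cnt computation is ceiling division: one page per ONE_PAGE_COMMIT ids, plus one more for any remainder. In Python that collapses to a single expression (48 is taken from the 6*8 comment above and treated as an assumption):

    ONE_PAGE_COMMIT = 48          # 6*8 valid images per subpage, per the comment above
    ira_max_cnt = 130             # hypothetical artwork count

    require_page_cnt = max(1, -(-ira_max_cnt // ONE_PAGE_COMMIT))   # ceil(130/48) == 3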
Example #11
    def ira_crawl_subpage_data(self, index, index_url):
        """Crawl a subpage all data

        :param index:       request page index
        :param index_url:   index group url
        :return:            one page get info list(2-d)
        """
        response = self.wkv_cw_api.wca_url_request_handler(
            target_url=index_url,
            post_data=self.wkv_cw_api.getway_data,
            timeout=30,
            target_page_word='subpage %d' % index,
            logpath=self.logpath)

        # 20181002 update effect: the web source cannot be fetched; this web_src is raw json data returned by the server
        web_src = response.read().decode("UTF-8", "ignore")
        ## self.wkv_cw_api.wca_save_test_html('all-repo', 'E:\\OperationCache', web_src)

        error_status_pattern    = re.compile(dl.PAGE_REQUEST_SYM_REGEX, re.S)
        error_status_list       = re.findall(error_status_pattern, web_src)
        if not error_status_list:
            log_content = dl.BR_CB('regex get error status failed')
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_REGEX_FAIL
        error_status = error_status_list[0]
        if error_status == 'true':
            log_content = dl.BR_CB('subpage %d response failed' % index)
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_RESPONSE_FAIL

        # crawl one page items info
        page_target_pattern = re.compile(dl.PAGE_TGT_INFO_SQUARE_REGEX, re.S)
        page_tgt_info_tpe   = re.findall(page_target_pattern, web_src)
        if not page_tgt_info_tpe:
            log_content = dl.BR_CB('regex get target page info failed')
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_REGEX_FAIL

        # tuples transform to lists
        tmp_target_info_list = [list(item) for item in page_tgt_info_tpe]
        del page_tgt_info_tpe

        # check artwork type, delete gif
        tgt_info_comp_works = []
        illust_type_pattern = re.compile(dl.ILLUST_TYPE_REGEX, re.S)
        for k in range(len(tmp_target_info_list)):
            illust_type_sym = re.findall(illust_type_pattern, tmp_target_info_list[k][2])
            if len(illust_type_sym) == 0:
                log_content = dl.BR_CB('illust type process error')
                self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
                return dl.PUB_E_FAIL

            if illust_type_sym[0] == dl.GIF_TYPE_LABEL:
                continue

            # delete unused fields
            del tmp_target_info_list[k][2]
            del tmp_target_info_list[k][-2]

            tgt_info_comp_works.append(tmp_target_info_list[k])
        del tmp_target_info_list

        return tgt_info_comp_works
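Since the endpoint now returns raw JSON, the json module is a more robust alternative to regexes for the error flag and the per-work fields. A sketch over a made-up payload whose shape only mimics what the regexes above extract:

    import json

    web_src = '{"error": false, "body": {"works": {"70337017": {"title": "t", "pageCount": "2"}}}}'
    payload = json.loads(web_src)
    if payload['error']:                  # the regex version matches the 'true'/'false' literal
        raise SystemExit('subpage response failed')
    for illust_id, info in payload['body']['works'].items():
        print(illust_id, info['title'], info['pageCount'])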
Example #12
    def ira_gather_preloadinfo(self):
        """Crawler need to know how many images do you want

        This function will get author name base on author id
        :return:            status code
        """
        # request all of one illustrator's artworks
        response = self.wkv_cw_api.wca_url_request_handler(
            target_url=dl.AJAX_ALL_URL(self.user_input_id),
            post_data=self.wkv_cw_api.getway_data,
            timeout=30,
            target_page_word='ajaxpage',
            logpath=self.logpath)

        # get artworks id list
        web_src = response.read().decode("UTF-8", "ignore")
        ajax_idlist_pattern = re.compile(dl.AJAX_ALL_IDLIST_REGEX, re.S)
        ajax_idlist         = re.findall(ajax_idlist_pattern, web_src)
        if not ajax_idlist:
            log_content = dl.BR_CB('regex get ajax id list fail')
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_REGEX_FAIL

        number_pattern = re.compile(dl.NUMBER_REGEX, re.S)
        for index in ajax_idlist:
            if index.isdigit():
                self.ira_pure_idlist.append(index)
            else:
                # the id list may include some garbage, use the number regex to get a pure result
                one_pure_id = re.findall(number_pattern, index)
                if one_pure_id:
                    self.ira_pure_idlist.append(one_pure_id[0])

        # the website server requires the artwork ids sorted in descending order
        pure_idlist_nbr = []
        for index in self.ira_pure_idlist:
            pure_idlist_nbr.append(int(index))
        self.wkv_cw_api.wca_quick_sort(pure_idlist_nbr, 0, len(pure_idlist_nbr) - 1)

        self.ira_pure_idlist.clear()
        for index in reversed(pure_idlist_nbr):
            self.ira_pure_idlist.append(str(index))
        del pure_idlist_nbr
        self.ira_max_cnt = len(self.ira_pure_idlist)

        # get the author name from the member main page
        illust_mainpage_url = dl.USERS_ARTWORKS_URL(self.user_input_id)
        log_content = dl.HL_CY('crawl illustrator url: [%s]' % illust_mainpage_url)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

        response = self.wkv_cw_api.wca_url_request_handler(
            target_url=illust_mainpage_url,
            post_data=self.wkv_cw_api.getway_data,
            timeout=30,
            target_page_word='mainpage',
            logpath=self.logpath)

        # match illustrator name
        web_src = response.read().decode("UTF-8", "ignore")
        illust_name_pattern = re.compile(dl.ILLUST_NAME_REGEX(self.user_input_id), re.S)
        author_info         = re.findall(illust_name_pattern, web_src)
        if not author_info:
            # the illustrator name cannot be matched in the main page if login failed
            dl.LT_PRINT(dl.BR_CB("Regex parsing result error, no author info"))
            return dl.PUB_E_REGEX_FAIL

        self.ira_author_name = author_info[0]
        log_content = dl.HL_CY('check illustrator: [%s]' % self.ira_author_name)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

        return dl.PUB_E_OK
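The descending id sort round-trips through ints, a custom quicksort, and reversed(). The built-in sorted() achieves the same in one pass; a sketch over hypothetical ids:

    ira_pure_idlist = ['70337017', '9031', '70337123']
    # numeric sort, descending, back to strings in one pass
    ira_pure_idlist = [str(n) for n in sorted(map(int, ira_pure_idlist), reverse=True)]
    print(ira_pure_idlist)  # ['70337123', '70337017', '9031']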