Пример #1
0
    def target_confirm(self):
        """Choose the ranking page to crawl

        In interactive mode (ir_mode == 1) the ranking type and the ranking
        period are asked from the user; in server mode (ir_mode == 2) they
        are taken from self.rtn_r18_arg and self.rtn_rank_type.
        On any invalid choice (including an unsupported ir_mode) the text
        "Argument(s) error" is printed, nothing is written to the log and the
        returned url is None.
        :return:            request mainpage url (None on argument error), mode
        """
        req_url = None
        # stays None for an unsupported ir_mode, which then falls into the
        # argument error path instead of raising UnboundLocalError
        ormode = None

        if self.ir_mode == 1:
            self.pvmx.logprowork(self.logpath, 'Gather ranking list======>')
            ormode = dataload.logtime_input(
                'Select ranking type, ordinary(o|1) or r18(r|2): ')
        elif self.ir_mode == 2:
            ormode = self.rtn_r18_arg

        # per ranking type: period prompt, log template, period -> (url, word)
        if ormode in ('o', '1'):
            dwm_prompt = (
                'Select daily(1) | weekly(2) | monthly(3) ordinary ranking type: ')
            log_template = 'Crawler set target to %s rank top'
            targets = {
                '1': (dataload.DAILY_RANKING_URL, dataload.DAILY_WORD),
                '2': (dataload.WEEKLY_RANKING_URL, dataload.WEEKLY_WORD),
                '3': (dataload.MONTHLY_RANKING_URL, dataload.MONTHLY_WORD),
            }
        elif ormode in ('r', '2'):
            dwm_prompt = 'Select daily(1)/weekly(2) R18 ranking type: '
            log_template = 'Crawler set target to %s r18 rank top'
            # the R18 ranking offers no monthly list
            targets = {
                '1': (dataload.DAILY_RANKING_R18_URL, dataload.DAILY_WORD),
                '2': (dataload.WEEKLY_RANKING_R18_URL, dataload.WEEKLY_WORD),
            }
        else:
            dataload.logtime_print("Argument(s) error\n")
            return req_url, ormode

        # ormode matched, so ir_mode is 1 or 2 here
        if self.ir_mode == 1:
            dwm = dataload.logtime_input(dwm_prompt)
        else:
            dwm = self.rtn_rank_type

        target = targets.get(dwm)
        if target is None:
            # do not log a "set target to None" line for a rejected choice
            dataload.logtime_print("Argument(s) error\n")
            return req_url, ormode

        req_url, rank_word = target
        self.pvmx.logprowork(self.logpath, log_template % rank_word)

        return req_url, ormode
Пример #2
0
def main():
    """Interactive entry point of the crawler

    Prints the PixivAPILib help text, asks the user to confirm the launch and
    the use of the login data, performs the login and then keeps asking for a
    work mode until the user selects exit.
    :return:    none
    """
    refuse_words = ('N', 'No', 'n')

    print(PixivAPILib.__doc__)

    # first question: run the program at all?
    launch_reply = dataload.logtime_input(
        '%s lanuch, continue? (Y/N): ' % dataload.PROJECT_NAME)
    if launch_reply in refuse_words:
        dataload.logtime_print("User exit program\n")
        exit(0)

    # second question: permission to use the account for the login
    login_reply = dataload.logtime_input(
        'Crawler will use your Pixiv-ID and password to login to the website, agree? (Y/N): '
    )
    if login_reply in refuse_words:
        dataload.logtime_print("No ID and password crawler cannot work, exit")
        exit(0)

    # one API object, logged in once, shared by every task below
    api_instance = PixivAPILib()
    api_instance.camouflage_login()

    # task loop, left only through the exit mode
    while True:
        mode = dataload.logtime_input('Login finished, select mode: ')

        if mode in ('rtn', '1'):
            # ranking top N mode
            dataload.logtime_print('Mode: [Ranking Top N]')
            rtn(dataload.RANK_DIR, dataload.LOG_PATH,
                dataload.HTML_PATH, api_instance).start()
        elif mode in ('ira', '2'):
            # illustrator repositories all mode
            dataload.logtime_print('Mode: [Illustrator Repository All]')
            ira(dataload.REPO_DIR, dataload.LOG_NAME,
                dataload.HTML_NAME, api_instance).start()
        elif mode in ('help', '3'):
            # help page
            print(PixivAPILib.__doc__)
        elif mode in ('exit', '4'):
            # normal exit, the logo is printed last
            dataload.logtime_print("User exit program")
            dataload.crawler_logo()
            exit(0)
        else:
            # unknown input, ask again in the next cycle
            dataload.logtime_print("Argument(s) error")
Пример #3
0
 def __init__(self, workdir, log_name, html_name, pvmx, ir_mode, ext_id=''):
     """Set up the paths and the working state for one illustrator

     The illustrator id is asked from the user in interactive mode
     (ir_mode == 1) and taken from ext_id in server mode (ir_mode == 2).
     :param workdir:     work directory
     :param log_name:    log name
     :param html_name:   html name
     :param pvmx:        API library class instance
     :param ir_mode:     interactive mode (1) or server mode (2)
     :param ext_id:      external illustrator id, used in server mode only
     :raises ValueError: ir_mode is neither 1 nor 2
     """
     if ir_mode == 1:
         target_id = dataload.logtime_input(
                     'Target crawl illustrator pixiv-id: ')
     elif ir_mode == 2:
         target_id = ext_id
     else:
         # without this branch target_id would be unbound and the failure
         # would surface as an UnboundLocalError one line below
         raise ValueError(
             'unsupported ir_mode: %r (expected 1 or 2)' % (ir_mode,))
     self.user_input_id = target_id
     # paths are built by plain string concatenation, so workdir is used
     # as a prefix exactly as given
     self.workdir = workdir + 'illustrepo_' + self.user_input_id
     self.logpath = self.workdir + log_name
     self.htmlpath = self.workdir + html_name
     self.pvmx = pvmx
     self.ir_mode = ir_mode
     # working state, empty until it is filled in later
     self.author_name = None
     self.max_cnt = 0
     self.pure_idlist = []
     self.target_capture = []
     self.basepages = []
Пример #4
0
    def gather_essential_info(ormode, whole_nbr):
        """Ask the user how many images to crawl

        The prompt is repeated until the input is a decimal number. A number
        larger than whole_nbr is cut down to whole_nbr; zero ends the program.
        An unknown ormode ends the program as well.
        Only interactive mode calls this function
        :param ormode:      select ranktop ordinary or r18 mode
        :param whole_nbr:   whole ranking crawl count
        :return:            crawl images count
        """
        # pick the first prompt by ranking type
        if ormode == 'o' or ormode == '1':
            first_prompt = (
                'Gather whole ordinary valid target %d, enter you want: '
                % whole_nbr)
        elif ormode == 'r' or ormode == '2':
            first_prompt = (
                'Gather whole R18 vaild target %d, enter you want: '
                % whole_nbr)
        else:
            dataload.logtime_print("Argument(s) error\n")
            exit(-1)

        img_str = dataload.logtime_input(first_prompt).strip()
        # isdecimal() instead of isdigit(): isdigit() also accepts characters
        # such as superscript digits which int() rejects with ValueError
        while not img_str.isdecimal():
            dataload.logtime_print(
                'Input error, your input content was not a decimal number')
            img_str = dataload.logtime_input(
                'Enter again(max is %d): ' % whole_nbr).strip()

        img_cnt = int(img_str)
        if img_cnt > whole_nbr:
            # more than available, limit to the maximum
            img_cnt = whole_nbr
        elif img_cnt <= 0:
            dataload.logtime_print('What the f**k is wrong with you?')
            exit(-1)

        return img_cnt
Пример #5
0
 def __init__(self, workdir, log_name, html_name, pvmx):
     """Ask for the illustrator id and prepare paths and working state

     :param workdir:     work directory, used as prefix of the repo path
     :param log_name:    log name, appended to the repo path
     :param html_name:   html name, appended to the repo path
     :param pvmx:        API library class instance
     """
     illustrator_id = dataload.logtime_input(
         'Target crawl illustrator pixiv-id: ')
     repo_root = workdir + 'illustrepo_' + illustrator_id

     # identity and file locations of this crawl
     self.user_input_id = illustrator_id
     self.workdir = repo_root
     self.logpath = repo_root + log_name
     self.htmlpath = repo_root + html_name
     self.pvmx = pvmx

     # working state, empty until it is filled in later
     self.author_name, self.max_cnt = None, 0
     self.pure_idlist, self.target_capture, self.basepages = [], [], []
Пример #6
0
    def crawl_allpage_target(self):
        """Collect the image urls and basepage urls of the target illustrator

        Splits self.pure_idlist into requests of at most
        dataload.ONE_PAGE_COMMIT ids, fetches every request with
        self.crawl_onepage_data, turns each received item into one url per
        image page, asks (interactive mode) or decides (server mode) how many
        of them to take, fills self.target_capture and self.basepages and
        logs a table of the selected images.
        Received items are indexed as [0] image id, [1] title, [2] url,
        [3] page count string, as used by the table below.
        :return:            none
        """
        page_size = dataload.ONE_PAGE_COMMIT
        # number of info requests needed, at least one
        if self.max_cnt <= page_size:
            require_page_cnt = 1
        else:
            require_page_cnt = int(self.max_cnt / page_size)
            # a partly filled last request
            if self.max_cnt % page_size != 0:
                require_page_cnt += 1

        # build the request url of every page
        page_url_array = []
        for ix in range(require_page_cnt):
            head_nbr = page_size * ix
            # the last page must not run past the known id count
            tail_nbr = min(page_size * (ix + 1), self.max_cnt)
            iid_string_tail = ''.join(
                dataload.IDS_UNIT(index)
                for index in self.pure_idlist[head_nbr:tail_nbr])
            page_url_array.append(
                dataload.ALLREPOINFO_URL(self.user_input_id, iid_string_tail))

        # gather all data from the response pages into a temp list
        tmp_receive_list = []
        for page_nbr, page_url in enumerate(page_url_array, start=1):
            tmp_receive_list += self.crawl_onepage_data(page_nbr, page_url)

        # rewrite every item in place and expand multi-page works
        repo_target_all_list = []
        for item in tmp_receive_list:
            # transform title '\\uxxx' to unicode, then replace emoji
            item[1] = self.pvmx.replace_emoji(
                self.pvmx.unicode_escape(item[1]))
            # build the original image url: drop the '\' characters, keep
            # everything after the first 50 characters, assume png format
            raw_url = item[2].replace('\\', '')
            item[2] = dataload.ORIGINAL_IMAGE_HEAD + raw_url[50:] + '.png'
            repo_target_all_list.append(item)

            page_count_str = item[3]
            # isdecimal() instead of isdigit(): isdigit() accepts characters
            # that int() rejects with ValueError
            if not page_count_str.isdecimal():
                log_context = 'Page count process error!'
                self.pvmx.logprowork(self.logpath, log_context)
                exit(-1)
            # one extra entry per further page; the page number replaces the
            # single character in front of '.png'
            # NOTE(review): presumably that character is the page index 0
            for px in range(1, int(page_count_str)):
                repo_target_all_list.append(
                    [item[0], item[1],
                     item[2][:-5] + str(px) + '.png',
                     item[3]])
        del tmp_receive_list                                    # clear cache

        # decide how many of the collected targets to take
        alive_targetcnt = len(repo_target_all_list)
        require_img_nbr = 0
        if self.ir_mode == 1:
            require_img_str = dataload.logtime_input(
                'Gather all repo %d, whole target(s): %d, enter you want count: '
                        % (self.max_cnt, alive_targetcnt)).strip()
            # ask again until the input is a decimal number
            while not require_img_str.isdecimal():
                dataload.logtime_print(
                    'Input error, your input content was not a decimal number')
                require_img_str = dataload.logtime_input(
                    'Enter again(max is %d): ' % alive_targetcnt).strip()
            require_img_nbr = int(require_img_str)
            # more than available, limit to the maximum
            if require_img_nbr > alive_targetcnt:
                require_img_nbr = alive_targetcnt
            elif require_img_nbr <= 0:
                dataload.logtime_print('What the f**k is wrong with you?')
                exit(-1)
        # server mode directly catch all of alive targets
        elif self.ir_mode == 2:
            require_img_nbr = alive_targetcnt
            dataload.logtime_print('Server mode auto crawl all of alive targets')

        selected_targets = repo_target_all_list[:require_img_nbr]
        for item in selected_targets:
            self.target_capture.append(item[2])     # image url
            self.basepages.append(dataload.BASEPAGE_URL + item[0])  # basepage url

        # display author info
        log_context = ('Illustrator: ' + self.author_name + ' id: '
            + self.user_input_id + ' require image(s): '
            + str(require_img_nbr) + ', target table:')
        self.pvmx.logprowork(self.logpath, log_context)
        # use prettytable build a table save and print info list
        image_info_table = PrettyTable(
            ["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
        for number, item in enumerate(selected_targets, start=1):
            # page name: the url without its first 57 characters and '.png'
            image_info_table.add_row([number, item[0], item[1], item[2][57:-4]])
        # save with str format and no time word
        self.pvmx.logprowork(self.logpath, str(image_info_table), 'N')
        del repo_target_all_list            # clear cache
    def _login_preload(aes_file_path):
        """Load the stored login info, or ask for it and store it AES encrypted

        If the file exists, its three rows (IV param, encrypted username,
        encrypted password) are decrypted and shown to the user; answering
        N or n deletes the file and asks for new login info. If the file does
        not exist, the login info is asked for and a new file is written.
        AES and Random come from an external crypto package imported by the
        file.
        :param aes_file_path:       .aes_crypto_login.ini file path
        :return:                    username, password, url-encoded post data;
                                    username and password are str when read
                                    from the file and bytes when just typed in
        """
        def _ask_and_store_login():
            """Prompt for the login info, write it encrypted, return it as bytes"""
            new_username = dataload.logtime_input(
                'Enter your pixiv id(mailbox), must be a R18: ').encode(
                    'utf-8')
            new_passwd = getpass.getpass(
                dataload.realtime_logword(dataload.base_time) +
                'Enter your account password: ').encode('utf-8')

            # one random IV, a fresh cipher object per encrypted value
            generate_aes_iv_param = Random.new().read(AES.block_size)
            username_cipher = AES.new(dataload.AES_SECRET_KEY, AES.MODE_CFB,
                                      generate_aes_iv_param)
            username_encrypto = (generate_aes_iv_param +
                                 username_cipher.encrypt(new_username))
            passwd_cipher = AES.new(dataload.AES_SECRET_KEY, AES.MODE_CFB,
                                    generate_aes_iv_param)
            passwd_encrypto = (generate_aes_iv_param +
                               passwd_cipher.encrypt(new_passwd))

            # write bin value to file with b'\n' to wrap
            with open(aes_file_path, 'wb') as write_aes_file:
                write_aes_file.write(generate_aes_iv_param + b'\n')  # row 1
                write_aes_file.write(username_encrypto + b'\n')      # row 2
                write_aes_file.write(passwd_encrypto + b'\n')        # row 3
            return new_username, new_passwd

        if os.path.exists(aes_file_path):
            # read the three rows; the file is only read here
            with open(aes_file_path, 'rb') as read_aes_file:
                readline_cache = read_aes_file.readlines()
            # NOTE(review): rows are split on b'\n', but the random IV and
            # the ciphertext may themselves contain that byte, which would
            # shift the rows; the file format is left unchanged here
            # cut last char (b'\n') of every row
            read_aes_iv_param_raw = readline_cache[0][:-1]  # AES IV param
            read_user_mailbox_raw = readline_cache[1][:-1]  # username
            read_user_passwd_raw = readline_cache[2][:-1]   # password

            # rows 2 and 3 start with a copy of the IV, skip it to decrypt
            username_aes_decrypt_cipher = AES.new(dataload.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  read_aes_iv_param_raw)
            username = str(
                username_aes_decrypt_cipher.decrypt(
                    read_user_mailbox_raw[AES.block_size:]), 'UTF-8')
            password_aes_decrypt_cipher = AES.new(dataload.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  read_aes_iv_param_raw)
            passwd = str(
                password_aes_decrypt_cipher.decrypt(
                    read_user_passwd_raw[AES.block_size:]), 'UTF-8')

            # let the user check the stored username and password
            check = dataload.logtime_input(
                "Read user login information configuration ok, check this: \n"
                "[-> Username] %s\n[-> Password] %s\n"
                "Is that correct? (Y/N): " % (username, passwd))

            # stored info rejected: delete old AES file and record new info
            if check == 'N' or check == 'n':
                os.remove(aes_file_path)
                dataload.logtime_print(
                    "Well, you need hand-input your login data: ")
                username, passwd = _ask_and_store_login()
        # no AES file yet: ask for the login info and create the file
        else:
            dataload.logtime_print(
                "Create new AES encrypt file to storage your username and password: "
            )
            username, passwd = _ask_and_store_login()

        # build data string
        getway_register = [('user', username), ('pass', passwd)]
        getway_data = urllib.parse.urlencode(getway_register).encode(
            encoding='UTF8')

        return username, passwd, getway_data  # return login use 3 elements
Пример #8
0
    def target_confirm(self):
        """Choose the ranking page to crawl

        In interactive mode (ir_mode == 1) the ranking type, the sex favor
        and the ranking period are asked from the user; in server mode
        (ir_mode == 2) they are taken from self.rtn_r18_arg,
        self.rtn_mf_word and self.rtn_rank_type.
        The sex favor only matters for the daily ranking; it is ignored for
        the weekly and monthly ones.
        On any invalid choice (including an unsupported ir_mode) the text
        "Argument(s) error" is printed, nothing is written to the log and the
        returned url is None.
        :return:            request mainpage url (None on argument error), mode
        """
        req_url = None
        # both stay None for an unsupported ir_mode, which then falls into
        # the argument error path instead of raising UnboundLocalError
        ormode, mf_word = None, None

        if self.ir_mode == 1:
            self.pvmx.logprowork(self.logpath, 'Gather ranking list======>')
            # select rank R18 or not, then the sex favor
            ormode = dataload.logtime_input(
                'Select ranking type, ordinary(o|1) or r18(r|2): ')
            mf_word = dataload.logtime_input(
                'Select sex favor, normal(n|0) or male(m|1) or female(f|2): ')
        elif self.ir_mode == 2:
            ormode = self.rtn_r18_arg
            mf_word = self.rtn_mf_word

        # per ranking type: period prompt, log template,
        # daily targets by sex favor and the other periods -> (url, word)
        if ormode in ('o', '1'):
            dwm_prompt = (
                'Select daily(1) | weekly(2) | monthly(3) ordinary ranking type: ')
            log_template = 'Crawler set target to %s rank top'
            daily_normal = (dataload.DAILY_RANKING_URL, dataload.DAILY_WORD)
            daily_male = (dataload.DAILY_MALE_RANKING_URL, dataload.MALE_WORD)
            daily_female = (dataload.DAILY_FEMALE_RANKING_URL,
                            dataload.FEMALE_WORD)
            period_targets = {
                '2': (dataload.WEEKLY_RANKING_URL, dataload.WEEKLY_WORD),
                '3': (dataload.MONTHLY_RANKING_URL, dataload.MONTHLY_WORD),
            }
        elif ormode in ('r', '2'):
            dwm_prompt = 'Select daily(1)/weekly(2) R18 ranking type: '
            log_template = 'Crawler set target to %s r18 rank top'
            daily_normal = (dataload.DAILY_RANKING_R18_URL,
                            dataload.DAILY_WORD)
            daily_male = (dataload.DAILY_MALE_RANKING_R18_URL,
                          dataload.MALE_WORD)
            daily_female = (dataload.DAILY_FEMALE_RANKING_R18_URL,
                            dataload.FEMALE_WORD)
            # the R18 ranking offers no monthly list
            period_targets = {
                '2': (dataload.WEEKLY_RANKING_R18_URL, dataload.WEEKLY_WORD),
            }
        else:
            dataload.logtime_print("Argument(s) error\n")
            return req_url, ormode

        daily_targets = {
            '0': daily_normal, 'n': daily_normal,
            '1': daily_male, 'm': daily_male,
            '2': daily_female, 'f': daily_female,
        }

        # ormode matched, so ir_mode is 1 or 2 here
        if self.ir_mode == 1:
            dwm = dataload.logtime_input(dwm_prompt)
        else:
            dwm = self.rtn_rank_type

        # male or female favor can only be combined with the daily ranking
        if dwm == '1':
            target = daily_targets.get(mf_word)
        else:
            target = period_targets.get(dwm)

        if target is None:
            # do not log a "set target to None" line for a rejected choice
            dataload.logtime_print("Argument(s) error\n")
            return req_url, ormode

        req_url, rank_word = target
        self.pvmx.logprowork(self.logpath, log_template % rank_word)

        return req_url, ormode
Пример #9
0
def main():
    """main() function

    Without command line arguments the program runs in interactive mode (1):
    the user confirms the launch and the login, then selects work modes in a
    loop until exit.
    With command line arguments it runs in server mode (2) and executes one
    task selected by the options:
        -h, --help          print the help text and exit
        -m, --mode VALUE    work mode, '1' ranking top N (default),
                            '2' illustrator repository all, '3'/'help' help
        -r, --R18 VALUE     ranking type passed to the ranking mode
        -l, --list VALUE    ranking list type passed to the ranking mode
        -s, --sex VALUE     sex favor passed to the ranking mode
        -i, --id VALUE      illustrator id passed to the repository mode
    :return:    none
    """
    refuse_words = ('N', 'No', 'n')

    print(PixivAPILib.__doc__)

    # no external arguments: interactive mode
    if len(sys.argv) == 1:
        mode_interactive_server = 1
        # program work continue ask
        ask_res = dataload.logtime_input(
            '%s lanuch, continue? (Y/N): ' % dataload.PROJECT_NAME)
        if ask_res in refuse_words:
            dataload.logtime_print("User exit program\n")
            sys.exit(0)
        # website id and password require
        ask_res = dataload.logtime_input(
            'Crawler will use your Pixiv-ID and password to login to the website, agree? (Y/N): ')
        if ask_res in refuse_words:
            dataload.logtime_print("No ID and password crawler cannot work, exit")
            sys.exit(0)

        api_instance = PixivAPILib(mode_interactive_server)
        api_instance.camouflage_login()     # crawler simulated login
        # multiple task cycles
        while True:
            mode = dataload.logtime_input('Login finished, select mode: ')
            # ranking top N mode
            if mode in ('rtn', '1'):
                dataload.logtime_print('Mode: [Ranking Top N]')
                rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH,
                    dataload.HTML_PATH, api_instance, mode_interactive_server)
                rtn_instance.start()
            # illustrator repositories all mode
            elif mode in ('ira', '2'):
                dataload.logtime_print('Mode: [Illustrator Repository All]')
                ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME,
                    dataload.HTML_NAME, api_instance, mode_interactive_server)
                ira_instance.start()
            # help page
            elif mode in ('help', '3'):
                print(PixivAPILib.__doc__)
            # user normal exit program
            elif mode in ('exit', '4'):
                dataload.logtime_print("User exit program")
                dataload.crawler_logo()     # exit print logo
                sys.exit(0)
            # input parameter error, into next circle
            else:
                dataload.logtime_print("Argument(s) error")
    # external arguments given: server mode
    else:
        mode_interactive_server = 2
        try:
            # long options that take a value need the trailing '=',
            # otherwise '--mode 2' is parsed as '--mode' with an empty value
            opts, _ = getopt.getopt(
                sys.argv[1:], "hm:r:l:s:i:",
                ["help", "mode=", "R18=", "list=", "sex=", "id="])
        except getopt.GetoptError as err:
            dataload.logtime_print("Argument(s) error: %s" % err)
            sys.exit(2)

        catch_mode = '1'
        rtn_r18_opt = '1'
        rtn_list_type = '1'
        # NOTE(review): an empty sex favor is handed on unchanged when -s is
        # not given; verify that the ranking mode accepts it for daily lists
        rtn_mf_word = ''
        ira_illust_id = ''
        for opt, value in opts:
            if opt in ("-m", "--mode"):
                catch_mode = value
            elif opt in ("-r", "--R18"):
                rtn_r18_opt = value
            elif opt in ("-l", "--list"):
                rtn_list_type = value
            elif opt in ("-s", "--sex"):
                rtn_mf_word = value
            elif opt in ("-i", "--id"):
                ira_illust_id = value
            elif opt in ("-h", "--help"):
                print(PixivAPILib.__doc__)
                sys.exit(0)

        api_instance = PixivAPILib(mode_interactive_server)
        api_instance.camouflage_login()     # crawler simulated login

        # ranking top N mode
        if catch_mode == '1':
            dataload.logtime_print('Mode: [Ranking Top N]')
            rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH,
                dataload.HTML_PATH, api_instance, mode_interactive_server,
                rtn_r18_opt, rtn_list_type, rtn_mf_word)
            rtn_instance.start()
        # illustrator repositories all mode
        elif catch_mode == '2':
            dataload.logtime_print('Mode: [Illustrator Repository All]')
            ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME,
                dataload.HTML_NAME, api_instance, mode_interactive_server, ira_illust_id)
            ira_instance.start()
        # help page
        elif catch_mode == 'help' or catch_mode == '3':
            print(PixivAPILib.__doc__)
        # unknown mode: report it instead of finishing silently
        else:
            dataload.logtime_print("Argument(s) error")