def target_confirm(self):
    """Resolve which ranking page to crawl.

    In interactive mode (``self.ir_mode == 1``) the ranking family and the
    ranking period are asked for through ``dataload.logtime_input``; in
    server mode (``self.ir_mode == 2``) they are taken from
    ``self.rtn_r18_arg`` and ``self.rtn_rank_type``.

    :return: tuple ``(req_url, ormode)``; ``req_url`` is None when any of
             the choices was not recognised
    """
    # period choice -> (request url, word inserted into the log line)
    ordinary_ranks = {
        '1': (dataload.DAILY_RANKING_URL, dataload.DAILY_WORD),
        '2': (dataload.WEEKLY_RANKING_URL, dataload.WEEKLY_WORD),
        '3': (dataload.MONTHLY_RANKING_URL, dataload.MONTHLY_WORD),
    }
    # the R18 ranking offers no monthly list
    r18_ranks = {
        '1': (dataload.DAILY_RANKING_R18_URL, dataload.DAILY_WORD),
        '2': (dataload.WEEKLY_RANKING_R18_URL, dataload.WEEKLY_WORD),
    }

    # stays None for an unsupported ir_mode, which is then reported as an
    # argument error instead of raising UnboundLocalError
    ormode = None
    if self.ir_mode == 1:
        self.pvmx.logprowork(self.logpath, 'Gather ranking list======>')
        ormode = dataload.logtime_input(
            'Select ranking type, ordinary(o|1) or r18(r|2): ')
    elif self.ir_mode == 2:
        ormode = self.rtn_r18_arg

    if ormode in ('o', '1'):
        rank_table = ordinary_ranks
        period_prompt = ('Select daily(1) | weekly(2) | monthly(3) '
                         'ordinary ranking type: ')
        log_template = 'Crawler set target to %s rank top'
    elif ormode in ('r', '2'):
        rank_table = r18_ranks
        period_prompt = 'Select daily(1)/weekly(2) R18 ranking type: '
        log_template = 'Crawler set target to %s r18 rank top'
    else:
        # nothing was selected, so there is nothing meaningful to log
        dataload.logtime_print("Argument(s) error\n")
        return None, ormode

    # ir_mode is guaranteed to be 1 or 2 here, otherwise ormode would be None
    if self.ir_mode == 1:
        dwm = dataload.logtime_input(period_prompt)
    else:
        dwm = self.rtn_rank_type

    req_url, rank_word = rank_table.get(dwm, (None, None))
    if req_url is None:
        dataload.logtime_print("Argument(s) error\n")
    else:
        # only announce a target that was actually resolved
        self.pvmx.logprowork(self.logpath, log_template % rank_word)

    return req_url, ormode
def main():
    """Program entry point for the interactive crawler.

    Asks the user for confirmation, logs in through ``PixivAPILib`` and then
    loops over mode selections until the user chooses to exit.

    :return: none (the process is ended with SystemExit)
    """
    print(PixivAPILib.__doc__)

    # answers treated as a refusal, compared case-insensitively
    refusals = ('n', 'no')

    # program work continue ask
    ask_res = dataload.logtime_input(
        '%s lanuch, continue? (Y/N): ' % dataload.PROJECT_NAME)
    if ask_res.strip().lower() in refusals:
        dataload.logtime_print("User exit program\n")
        # SystemExit is raised directly: the exit() helper is injected by
        # the site module and is meant for the interactive shell only
        raise SystemExit(0)

    # website id and password require
    ask_res = dataload.logtime_input(
        'Crawler will use your Pixiv-ID and password to login to the website, agree? (Y/N): '
    )
    if ask_res.strip().lower() in refusals:
        dataload.logtime_print("No ID and password crawler cannot work, exit")
        raise SystemExit(0)

    api_instance = PixivAPILib()
    api_instance.camouflage_login()  # crawler simulated login

    # multiple task cycles
    while True:
        mode = dataload.logtime_input('Login finished, select mode: ')
        if mode in ('rtn', '1'):
            # ranking top N mode
            dataload.logtime_print('Mode: [Ranking Top N]')
            rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH,
                               dataload.HTML_PATH, api_instance)
            rtn_instance.start()
        elif mode in ('ira', '2'):
            # illustrator repositories all mode
            dataload.logtime_print('Mode: [Illustrator Repository All]')
            ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME,
                               dataload.HTML_NAME, api_instance)
            ira_instance.start()
        elif mode in ('help', '3'):
            # help page
            print(PixivAPILib.__doc__)
        elif mode in ('exit', '4'):
            # user normal exit program
            dataload.logtime_print("User exit program")
            dataload.crawler_logo()  # exit print logo
            raise SystemExit(0)
        else:
            # input parameter error, ask again on the next cycle
            dataload.logtime_print("Argument(s) error")
def __init__(self, workdir, log_name, html_name, pvmx, ir_mode, ext_id=''):
    """Prepare the crawl state for one illustrator.

    :param workdir:     work directory prefix
    :param log_name:    log name, appended to the work directory
    :param html_name:   html name, appended to the work directory
    :param pvmx:        API library class instance
    :param ir_mode:     1 for interactive mode, 2 for server mode
    :param ext_id:      illustrator id supplied externally, used in server mode
    :raises ValueError: if ir_mode is neither 1 nor 2
    """
    if ir_mode == 1:
        target_id = dataload.logtime_input(
            'Target crawl illustrator pixiv-id: ')
    elif ir_mode == 2:
        target_id = ext_id
    else:
        # previously surfaced as an UnboundLocalError on target_id
        raise ValueError(
            'ir_mode must be 1 (interactive) or 2 (server), got %r' % (ir_mode,))

    self.user_input_id = target_id
    # paths are built by plain concatenation, so workdir and the two names
    # are expected to carry their own separators
    self.workdir = workdir + 'illustrepo_' + self.user_input_id
    self.logpath = self.workdir + log_name
    self.htmlpath = self.workdir + html_name

    self.pvmx = pvmx
    self.ir_mode = ir_mode

    # state filled in by the other methods of the class
    self.author_name = None
    self.max_cnt = 0
    self.pure_idlist = []
    self.target_capture = []
    self.basepages = []
def gather_essential_info(ormode, whole_nbr):
    """Ask the user how many ranking images should be crawled.

    A request larger than ``whole_nbr`` is clamped to ``whole_nbr``.
    Only the interactive mode calls this function.

    :param ormode:      ranking family, ordinary ('o' or '1') or R18 ('r' or '2')
    :param whole_nbr:   number of targets that are available
    :return:            number of images to crawl
    :raises SystemExit: on an unknown ormode or a request of zero images
    """
    if ormode in ('o', '1'):
        prompt = 'Gather whole ordinary valid target %d, enter you want: '
    elif ormode in ('r', '2'):
        prompt = 'Gather whole R18 vaild target %d, enter you want: '
    else:
        dataload.logtime_print("Argument(s) error\n")
        # raise SystemExit directly instead of the site-provided exit()
        raise SystemExit(-1)

    img_str = dataload.logtime_input(prompt % whole_nbr)
    # isdecimal() matches exactly what int() can parse; isdigit() also
    # accepts characters such as superscript digits that make int() raise
    while not img_str.isdecimal():
        dataload.logtime_print(
            'Input error, your input content was not a decimal number')
        img_str = dataload.logtime_input(
            'Enter again(max is %d): ' % whole_nbr)

    img_cnt = int(img_str)
    if img_cnt > whole_nbr:
        # more than available was requested, fall back to everything
        img_cnt = whole_nbr
    elif img_cnt <= 0:
        dataload.logtime_print('What the f**k is wrong with you?')
        raise SystemExit(-1)

    return img_cnt
def __init__(self, workdir, log_name, html_name, pvmx):
    """Set up the crawl state for an illustrator chosen at the prompt.

    :param workdir:     work directory prefix
    :param log_name:    log name, appended to the work directory
    :param html_name:   html name, appended to the work directory
    :param pvmx:        API library class instance
    """
    self.pvmx = pvmx

    # the illustrator id is always read from the user
    self.user_input_id = dataload.logtime_input(
        'Target crawl illustrator pixiv-id: ')

    # every path is derived from the per-illustrator directory prefix
    repo_prefix = workdir + 'illustrepo_' + self.user_input_id
    self.workdir = repo_prefix
    self.logpath = repo_prefix + log_name
    self.htmlpath = repo_prefix + html_name

    # containers and counters filled in by the other methods
    self.author_name = None
    self.max_cnt = 0
    self.pure_idlist, self.target_capture, self.basepages = [], [], []
def crawl_allpage_target(self):
    """Collect the download urls of the illustrator's artworks.

    Builds one info-request url per batch of ``dataload.ONE_PAGE_COMMIT``
    artwork ids, fetches every batch with ``self.crawl_onepage_data``,
    expands multi-page artworks into one entry per page, decides how many
    entries to keep (asked in interactive mode, all of them in server mode)
    and stores the result in ``self.target_capture`` / ``self.basepages``.

    :return: none
    :raises SystemExit: on a non-numeric page count in the response, or
                        when the user asks for zero images
    """
    page_size = dataload.ONE_PAGE_COMMIT
    # ceiling division in integer arithmetic; at least one request is made
    # even when max_cnt is 0
    require_page_cnt = max(1, -(-self.max_cnt // page_size))

    # build the request url of every batch
    page_url_array = []
    for ix in range(require_page_cnt):
        head = page_size * ix
        tail = min(page_size * (ix + 1), self.max_cnt)
        iid_string_tail = ''.join(
            dataload.IDS_UNIT(iid) for iid in self.pure_idlist[head:tail])
        page_url_array.append(
            dataload.ALLREPOINFO_URL(self.user_input_id, iid_string_tail))

    # gather the data of all batches, request numbers start at 1
    tmp_receive_list = []
    for page_no, page_url in enumerate(page_url_array, start=1):
        tmp_receive_list += self.crawl_onepage_data(page_no, page_url)

    # each received item is used as [image id, title, url, page count]
    repo_target_all_list = []
    for item in tmp_receive_list:
        # transform title '\\uxxx' to unicode, then replace emoji
        item[1] = self.pvmx.replace_emoji(self.pvmx.unicode_escape(item[1]))

        # drop the escaping backslashes and rebuild the url on the original
        # image host; offset 50 is where the kept part of the received url
        # starts -- NOTE(review): confirm against the current response format
        stripped_url = item[2].replace('\\', '')
        item[2] = dataload.ORIGINAL_IMAGE_HEAD + stripped_url[50:] + '.png'
        repo_target_all_list.append(item)

        page_count_str = item[3]
        # isdecimal() matches exactly what int() can parse
        if not page_count_str.isdecimal():
            self.pvmx.logprowork(self.logpath, 'Page count process error!')
            raise SystemExit(-1)

        # the entry above is the first page; for every further page the
        # last five characters of the url (page index digit + '.png') are
        # replaced by the next page index
        for px in range(1, int(page_count_str)):
            repo_target_all_list.append(
                [item[0], item[1], item[2][:-5] + str(px) + '.png', item[3]])

    # collection target count
    alive_targetcnt = len(repo_target_all_list)
    require_img_nbr = 0
    if self.ir_mode == 1:
        require_img_str = dataload.logtime_input(
            'Gather all repo %d, whole target(s): %d, enter you want count: '
            % (self.max_cnt, alive_targetcnt))
        while not require_img_str.isdecimal():
            dataload.logtime_print(
                'Input error, your input content was not a decimal number')
            require_img_str = dataload.logtime_input(
                'Enter again(max is %d): ' % alive_targetcnt)
        require_img_nbr = int(require_img_str)
        if require_img_nbr > alive_targetcnt:
            # more than available was requested, take everything
            require_img_nbr = alive_targetcnt
        elif require_img_nbr <= 0:
            dataload.logtime_print('What the f**k is wrong with you?')
            raise SystemExit(-1)
    elif self.ir_mode == 2:
        # server mode directly catches all of the alive targets
        require_img_nbr = alive_targetcnt
        dataload.logtime_print('Server mode auto crawl all of alive targets')

    selected = repo_target_all_list[:require_img_nbr]
    for item in selected:
        self.target_capture.append(item[2])  # image url
        self.basepages.append(dataload.BASEPAGE_URL + item[0])  # basepage url

    # display author info; %s keeps this working if author_name is still None
    log_context = ('Illustrator: %s id: %s require image(s): %s, target table:'
                   % (self.author_name, self.user_input_id, require_img_nbr))
    self.pvmx.logprowork(self.logpath, log_context)

    # use prettytable to build a table, then save and print the info list
    image_info_table = PrettyTable(
        ["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
    for number, item in enumerate(selected, start=1):
        # item[2][57:-4] is the url with its first 57 characters and the
        # '.png' suffix removed
        image_info_table.add_row([number, item[0], item[1], item[2][57:-4]])

    # save with str format and no time word
    self.pvmx.logprowork(self.logpath, str(image_info_table), 'N')
def _login_preload(aes_file_path):
    """Load the login data from the AES file, or ask for it and store it.

    File layout (binary): the IV, ``b'\\n'``, IV + encrypted username,
    ``b'\\n'``, IV + encrypted password, ``b'\\n'``.  When the file exists
    its content is decrypted and shown to the user for confirmation; if the
    user rejects it, or when no file exists, the login data is entered by
    hand and written to a new file.

    This method uses pycrypto (``AES`` and ``Random``).

    :param aes_file_path: .aes_crypto_login.ini file path
    :return: username, password, url-encoded login data; username and
             password are str when read from the file and bytes when they
             were just typed in
    :raises ValueError: if an existing file does not have the layout above
    """
    block = AES.block_size

    def _ask_and_store_credentials():
        """Prompt for the login data, write the AES file, return both as bytes."""
        username = dataload.logtime_input(
            'Enter your pixiv id(mailbox), must be a R18: ').encode('utf-8')
        passwd = getpass.getpass(
            dataload.realtime_logword(dataload.base_time)
            + 'Enter your account password: ').encode('utf-8')

        # NOTE(review): one IV is shared by both records; kept as is because
        # the reader decrypts both records with the IV from row 1
        iv = Random.new().read(block)
        username_encrypto = iv + AES.new(
            dataload.AES_SECRET_KEY, AES.MODE_CFB, iv).encrypt(username)
        passwd_encrypto = iv + AES.new(
            dataload.AES_SECRET_KEY, AES.MODE_CFB, iv).encrypt(passwd)

        with open(aes_file_path, 'wb') as write_aes_file:
            write_aes_file.write(iv + b'\n')                 # row 1 is iv param
            write_aes_file.write(username_encrypto + b'\n')  # row 2 is username
            write_aes_file.write(passwd_encrypto + b'\n')    # row 3 is password
        return username, passwd

    if os.path.exists(aes_file_path):
        with open(aes_file_path, 'rb') as read_aes_file:
            raw = read_aes_file.read()

        # The IV and the ciphertexts are arbitrary bytes and may contain
        # b'\n', so the file cannot be split with readlines().  The IV has a
        # fixed size and both records start with it, so the records are
        # located by searching for b'\n' followed by the IV.
        iv = raw[:block]
        records = raw[block + 1:]
        split_at = records.find(b'\n' + iv, block)
        layout_ok = (len(iv) == block
                     and raw[block:block + 1] == b'\n'
                     and raw.endswith(b'\n')
                     and records.startswith(iv)
                     and split_at >= 0)
        if not layout_ok:
            raise ValueError(
                'Login file %r is damaged, delete it and run again'
                % (aes_file_path,))
        user_blob = records[:split_at]
        pass_blob = records[split_at + 1:-1]  # drop the final b'\n'

        # each record is the IV followed by the ciphertext
        username = str(
            AES.new(dataload.AES_SECRET_KEY, AES.MODE_CFB, iv).decrypt(
                user_blob[block:]), 'UTF-8')
        passwd = str(
            AES.new(dataload.AES_SECRET_KEY, AES.MODE_CFB, iv).decrypt(
                pass_blob[block:]), 'UTF-8')

        # NOTE(review): this shows the password in clear text on the console
        check = dataload.logtime_input(
            "Read user login information configuration ok, check this: \n"
            "[-> Username] %s\n[-> Password] %s\n"
            "Is that correct? (Y/N): " % (username, passwd))

        # if the user rejects the stored data, delete the old AES file and
        # record new data
        if check in ('N', 'n'):
            os.remove(aes_file_path)
            dataload.logtime_print(
                "Well, you need hand-input your login data: ")
            username, passwd = _ask_and_store_credentials()
    else:
        # no AES file yet: create one holding the encrypted login data
        dataload.logtime_print(
            "Create new AES encrypt file to storage your username and password: "
        )
        username, passwd = _ask_and_store_credentials()

    # build data string
    getway_register = [('user', username), ('pass', passwd)]
    getway_data = urllib.parse.urlencode(getway_register).encode(
        encoding='UTF8')

    return username, passwd, getway_data
def target_confirm(self):
    """Resolve which ranking page to crawl.

    In interactive mode (``self.ir_mode == 1``) the ranking family, the sex
    favor and the ranking period are asked for through
    ``dataload.logtime_input``; in server mode (``self.ir_mode == 2``) they
    are taken from ``self.rtn_r18_arg``, ``self.rtn_mf_word`` and
    ``self.rtn_rank_type``.  The sex favor only applies to the daily
    ranking and is ignored for the other periods.

    :return: tuple ``(req_url, ormode)``; ``req_url`` is None when any of
             the choices was not recognised
    """
    # daily ranking: sex favor -> (request url, word for the log line)
    normal = (dataload.DAILY_RANKING_URL, dataload.DAILY_WORD)
    male = (dataload.DAILY_MALE_RANKING_URL, dataload.MALE_WORD)
    female = (dataload.DAILY_FEMALE_RANKING_URL, dataload.FEMALE_WORD)
    ordinary_daily = {'0': normal, 'n': normal,
                      '1': male, 'm': male,
                      '2': female, 'f': female}
    normal = (dataload.DAILY_RANKING_R18_URL, dataload.DAILY_WORD)
    male = (dataload.DAILY_MALE_RANKING_R18_URL, dataload.MALE_WORD)
    female = (dataload.DAILY_FEMALE_RANKING_R18_URL, dataload.FEMALE_WORD)
    r18_daily = {'0': normal, 'n': normal,
                 '1': male, 'm': male,
                 '2': female, 'f': female}

    # other periods: period choice -> (request url, word for the log line)
    ordinary_periodic = {
        '2': (dataload.WEEKLY_RANKING_URL, dataload.WEEKLY_WORD),
        '3': (dataload.MONTHLY_RANKING_URL, dataload.MONTHLY_WORD),
    }
    # the R18 ranking offers no monthly list
    r18_periodic = {
        '2': (dataload.WEEKLY_RANKING_R18_URL, dataload.WEEKLY_WORD),
    }

    # both stay None for an unsupported ir_mode, which is then reported as
    # an argument error instead of raising UnboundLocalError
    ormode, mf_word = None, None
    if self.ir_mode == 1:
        self.pvmx.logprowork(self.logpath, 'Gather ranking list======>')
        # select rank R18 or not
        ormode = dataload.logtime_input(
            'Select ranking type, ordinary(o|1) or r18(r|2): ')
        mf_word = dataload.logtime_input(
            'Select sex favor, normal(n|0) or male(m|1) or female(f|2): ')
    elif self.ir_mode == 2:
        ormode = self.rtn_r18_arg
        mf_word = self.rtn_mf_word

    if ormode in ('o', '1'):
        daily_table, periodic_table = ordinary_daily, ordinary_periodic
        period_prompt = ('Select daily(1) | weekly(2) | monthly(3) '
                         'ordinary ranking type: ')
        log_template = 'Crawler set target to %s rank top'
    elif ormode in ('r', '2'):
        daily_table, periodic_table = r18_daily, r18_periodic
        period_prompt = 'Select daily(1)/weekly(2) R18 ranking type: '
        log_template = 'Crawler set target to %s r18 rank top'
    else:
        # nothing was selected, so there is nothing meaningful to log
        dataload.logtime_print("Argument(s) error\n")
        return None, ormode

    # ir_mode is guaranteed to be 1 or 2 here, otherwise ormode would be None
    if self.ir_mode == 1:
        dwm = dataload.logtime_input(period_prompt)
    else:
        dwm = self.rtn_rank_type

    if dwm == '1':
        # male/female favor is only available for the daily ranking
        req_url, rank_word = daily_table.get(mf_word, (None, None))
    else:
        req_url, rank_word = periodic_table.get(dwm, (None, None))

    if req_url is None:
        dataload.logtime_print("Argument(s) error\n")
    else:
        # only announce a target that was actually resolved
        self.pvmx.logprowork(self.logpath, log_template % rank_word)

    return req_url, ormode
def main():
    """Program entry point.

    Without command line arguments the crawler runs in interactive mode (1):
    the user confirms, logs in and then selects modes in a loop.  With
    arguments it runs in server mode (2), configured by these options:

        -h/--help   print the help page and exit
        -m/--mode   '1' or 'rtn' ranking top N, '2' or 'ira' illustrator
                    repository, '3' or 'help' help page (default '1')
        -r/--R18    ranking family passed to rtn (default '1')
        -l/--list   ranking period passed to rtn (default '1')
        -s/--sex    sex favor passed to rtn (default '0')
        -i/--id     illustrator id passed to ira

    :return: none
    """
    print(PixivAPILib.__doc__)

    # no external arguments: interactive mode
    if len(sys.argv) == 1:
        mode_interactive_server = 1
        # answers treated as a refusal, compared case-insensitively
        refusals = ('n', 'no')

        # program work continue ask
        ask_res = dataload.logtime_input(
            '%s lanuch, continue? (Y/N): ' % dataload.PROJECT_NAME)
        if ask_res.strip().lower() in refusals:
            dataload.logtime_print("User exit program\n")
            # SystemExit is raised directly: the exit() helper is injected
            # by the site module and is meant for the interactive shell only
            raise SystemExit(0)

        # website id and password require
        ask_res = dataload.logtime_input(
            'Crawler will use your Pixiv-ID and password to login to the website, agree? (Y/N): ')
        if ask_res.strip().lower() in refusals:
            dataload.logtime_print("No ID and password crawler cannot work, exit")
            raise SystemExit(0)

        api_instance = PixivAPILib(mode_interactive_server)
        api_instance.camouflage_login()  # crawler simulated login

        # multiple task cycles
        while True:
            mode = dataload.logtime_input('Login finished, select mode: ')
            if mode in ('rtn', '1'):
                # ranking top N mode
                dataload.logtime_print('Mode: [Ranking Top N]')
                rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH,
                                   dataload.HTML_PATH, api_instance,
                                   mode_interactive_server)
                rtn_instance.start()
            elif mode in ('ira', '2'):
                # illustrator repositories all mode
                dataload.logtime_print('Mode: [Illustrator Repository All]')
                ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME,
                                   dataload.HTML_NAME, api_instance,
                                   mode_interactive_server)
                ira_instance.start()
            elif mode in ('help', '3'):
                # help page
                print(PixivAPILib.__doc__)
            elif mode in ('exit', '4'):
                # user normal exit program
                dataload.logtime_print("User exit program")
                dataload.crawler_logo()  # exit print logo
                raise SystemExit(0)
            else:
                # input parameter error, ask again on the next cycle
                dataload.logtime_print("Argument(s) error")
    else:
        mode_interactive_server = 2
        try:
            # long options that take a value must end with '=', otherwise
            # getopt rejects '--mode=1' and leaves the value of '--mode 1'
            # among the positional arguments
            opts, _args = getopt.getopt(
                sys.argv[1:], "hm:r:l:s:i:",
                ["help", "mode=", "R18=", "list=", "sex=", "id="])
        except getopt.GetoptError as err:
            dataload.logtime_print("Argument(s) error: %s" % err)
            raise SystemExit(-1)

        catch_mode = '1'
        rtn_r18_opt = '1'
        rtn_list_type = '1'
        # '0' selects the normal ranking, so the default daily ranking is a
        # valid request when -s is omitted
        rtn_mf_word = '0'
        ira_illust_id = ''
        for opt, value in opts:
            if opt in ("-m", "--mode"):
                catch_mode = value
            elif opt in ("-r", "--R18"):
                rtn_r18_opt = value
            elif opt in ("-l", "--list"):
                rtn_list_type = value
            elif opt in ("-s", "--sex"):
                rtn_mf_word = value
            elif opt in ("-i", "--id"):
                ira_illust_id = value
            elif opt in ("-h", "--help"):
                print(PixivAPILib.__doc__)
                raise SystemExit(0)

        api_instance = PixivAPILib(mode_interactive_server)
        api_instance.camouflage_login()  # crawler simulated login

        if catch_mode in ('rtn', '1'):
            # ranking top N mode
            dataload.logtime_print('Mode: [Ranking Top N]')
            rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH,
                               dataload.HTML_PATH, api_instance,
                               mode_interactive_server, rtn_r18_opt,
                               rtn_list_type, rtn_mf_word)
            rtn_instance.start()
        elif catch_mode in ('ira', '2'):
            # illustrator repositories all mode
            dataload.logtime_print('Mode: [Illustrator Repository All]')
            ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME,
                               dataload.HTML_NAME, api_instance,
                               mode_interactive_server, ira_illust_id)
            ira_instance.start()
        elif catch_mode in ('help', '3'):
            # help page
            print(PixivAPILib.__doc__)
        else:
            # an unknown mode used to be ignored silently
            dataload.logtime_print("Argument(s) error")