def get_album_info(self):
    """Fetch the album info."""
    url = 'http://download.csdn.net/addalbum/%d' % self.album_id
    page = netx.get(url, cookies=self.cookies, need_print=False)
    soup = BeautifulSoup(page, "html.parser")
    # Form fields to read; an entry is either a field name (an <input>)
    # or a (field name, tag name) pair.
    key_list = [
        'title',
        ('discription', 'textarea'),
        'tag',
        ('categorys', 'select'),
        ('category', 'select'),
        'type',
        'imagesrc',
        'ids',
        'album',
    ]
    params = dict()
    for key in key_list:
        # Resolve the tag name
        if isinstance(key, tuple):
            key, tag_name = key
        else:
            tag_name = 'input'
        # Locate the tag
        if key == 'type':
            # Only the checked radio button counts
            key_tag = soup.find(tag_name, attrs={
                'name': key,
                'checked': 'checked'
            })
        else:
            key_tag = soup.find(tag_name, attrs={'name': key})
        if not key_tag:
            print('Field %s not found' % key)
            continue
        # Extract the value
        if key == 'imagesrc':
            value = None
        elif key == 'ids':
            value = list()
        elif 'value' in key_tag.attrs:
            value = key_tag['value']
        elif 'def' in key_tag.attrs:
            value = key_tag['def']
        else:
            value = key_tag.text
        params[key] = value
    # Read the file entries
    ul = soup.select_one('ul.add_source_list.items-list')
    a_list = ul.select('a.item')
    for a in a_list:
        params['ids'].append(a['id'])
    # Sort in descending order
    params['ids'].sort(reverse=True)
    self.params = params
    print(params)
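# Usage sketch (hypothetical; the enclosing class is not shown in this
# section). Assuming it exposes `album_id`, `cookies`, and this method:
#     downloader = CsdnAlbum(album_id=123, cookies=netx.parse_cookies_from_file('cookie.txt'))
#     downloader.get_album_info()
#     print(downloader.params['title'])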
def run(self):
    """Run."""
    url = 'https://phoenix.ziroom.com/v7/room/detail.json'
    # NOTE: this dict is immediately replaced by the hard-coded block below,
    # so room_id / city_code from the instance are not actually used.
    params = {
        'id': self.room_id,
        'city_code': self.city_code
    }
    params = netx.parse_params("""
    house_id 60163300
    sign 3819b75389894cc18418b81e882d560a
    size 4
    timestamp 1520072313
    os android:7.0
    network WIFI
    sign_open 1
    app_version 5.5.0
    imei 868030026509339
    id 61015398
    ip 192.168.199.128
    uid 0
    city_code 110000
    page 1
    model MI 5
    """)
    print(url)
    print(params)
    # Poll until the API reports success (no delay between retries)
    while True:
        result = netx.get(url, params, result_type='json', need_print=False)
        if result['status'] == 'success':
            data = result['data']
            print(result)
            return
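# `netx.parse_params` is a project helper whose implementation is not shown
# in this section. A minimal sketch of what it plausibly does, assuming one
# whitespace-separated `key value` pair per line (the name
# `parse_params_sketch` is ours, not the library's):
def parse_params_sketch(text):
    """Parse a block of `key value` lines into a dict."""
    params = {}
    for line in text.strip().splitlines():
        # Split on the first space only, so values may contain spaces
        # (e.g. "model MI 5").
        key, _, value = line.strip().partition(' ')
        params[key] = value.strip()
    return params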
def translate(en):
    js = Py4Js()
    tk = js.getTk(en)
    tk2 = get_google_tk(en)
    if tk != tk2:
        print('Computed tk values do not match')
        filex.write('data/error_tk.txt', en + '\n', 'a')
        return en
    url = "http://translate.google.cn/translate_a/single?client=t" \
          "&sl=en&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca" \
          "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1" \
          "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s" % (tk, urllib.parse.quote(en))
    result = netx.get(url, need_print=False)
    # Sample response:
    # [[["测试","test",null,null,2],[null,null,"Cèshì","test"]],...]
    if result:
        result = json.loads(result)
        if result:
            # The first element holds the translated segments
            first_result = result[0]
            # Leading entries are translations; the last one may carry pinyin
            cn = ''
            for translation in first_result:
                if len(translation) == 5:
                    # The first field of a translation segment is the result
                    cn += translation[0]
            cn = GoogleTranslator.process_result(en, cn)
            return cn
    return None
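# Usage sketch (requires network access to translate.google.cn and the
# project's Py4Js / get_google_tk helpers referenced above):
#     print(translate('test'))  # expected to print something like '测试'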
def get_dior_details(source_file, result_file):
    """Fetch lipstick details."""
    lipstick_list = filex.read_lines(source_file, ignore_line_separator=True)
    length = len(lipstick_list)
    for i in range(length):
        lipstick = Lipstick.from_string(lipstick_list[i])
        print('Fetching lipstick %d/%d' % (i + 1, length))
        url = ChooseLipstick.dior_host + urllib.parse.quote(lipstick.url)
        page = netx.get(url, need_print=False)
        soup = BeautifulSoup(page, "html.parser")
        cover_img_tag = soup.select_one('.png-bg.cover-bg')
        # all_image = cover_img['data-zoom-views']
        cover_img = cover_img_tag.select_one('.js-cover-img')['src']
        cover_img = ChooseLipstick.dior_host + cover_img
        # name = soup.select_one('.quickbuy-title').string
        # desc = soup.select_one('.quickbuy-subtitle').string
        price = soup.select_one('.details-price.js-order-value').string.strip()
        color_name = soup.select_one('.swatches-list') \
            .select_one('li.selected').select_one('a')['data-swatch-name']
        # color_span = soup.select_one('.swatch-name.js-swatch-name')
        # color = color_span.select_one('span').string
        # swatches_list = soup.select_one('.swatches-list.js-products-selector')
        # swatches = swatches_list.select_one('li.selected')
        lipstick.url = url
        lipstick.price = price
        lipstick.name = color_name
        lipstick.img = ','.join((lipstick.img, cover_img))
        filex.write_lines(result_file, [str(lipstick)], mode='a', add_line_separator=True)
def get_ysl_list(result_file):
    """Read the lipstick list."""
    # The official site's listing page can't be parsed; there aren't many
    # entries anyway, so add the product URLs manually.
    url_list = [
        'http://www.yslbeautycn.com/product/00030YSL.html',
        'http://www.yslbeautycn.com/product/00031YSL.html',
    ]
    result = list()
    i = 0
    for details_url in url_list:
        page = netx.get(details_url, need_print=False)
        soup = BeautifulSoup(page, "html.parser")
        category = soup.select_one('.pdp_top_content_wrapper').select_one('.product_subtitle').string
        category = category.replace('圣罗兰', '')
        # image = soup.select_one('.primary_image')['src']
        # color_2 = soup.select_one('.product_image.b-product_img')['src']
        color_list = soup.select_one('.swatches.js_swatches.color.contentcarousel_list')
        for color_li in color_list.select('li'):
            for color_div in color_li.select('div'):
                url = color_div.select_one('a')['href']
                color_image = color_div.select_one('img')['src']
                name = color_div.select_one('span').string
                # Normalize full-width parentheses, then split off the extra
                # description that follows the name.
                name = name.replace('(', '(').replace(')', ')')
                split_list = name.split('(', 1)
                if len(split_list) > 1:
                    name = split_list[0].strip()
                    other = '(' + split_list[1].strip()
                else:
                    other = ''
                i += 1
                lipstick = Lipstick('%03d' % i, category, name, url, '', other, color_image)
                result.append(str(lipstick))
    filex.write_lines(result_file, result, add_line_separator=True)
def get_dior_list(result_file):
    """Read the lipstick list."""
    url = 'https://www.dior.cn/beauty/zh_cn/%E9%A6%99%E6%B0%9B%E4%B8%8E%E7%BE%8E%E5%AE%B9/%E5%BD%A9%E5%A6%86/%E5' \
          '%94%87%E9%83%A8/%E5%94%87%E8%86%8F/fr-lipsticks-%E5%94%87%E8%86%8F.html'
    page = netx.get(url, need_print=False)
    # Parse the result
    soup = BeautifulSoup(page, "html.parser")
    result = list()
    i = 0
    for category in soup.select('.category.js-category'):
        # Top-level group
        category_name = category.select_one('.category-title').string.replace('Dior迪奥', '')
        print('\nGroup: %s' % category_name)
        for column in category.select('.column.product'):
            # One product line
            legend_name = column.select_one('.legend-name').string.replace('Dior迪奥', '')
            legend_desc = column.select_one('.legend-description').string.strip()
            print('Line name: ' + legend_name)
            legend_swatches_list = column.select_one('.legend-swatches-list')
            for legend_li in legend_swatches_list.select('li'):
                a = legend_li.find('a')
                url = a['href']
                color = a.find('img')
                image = ChooseLipstick.dior_host + color['src']
                i += 1
                lipstick = Lipstick('%03d' % i, category_name + '-' + legend_name, '', url, '', legend_desc, image)
                result.append(str(lipstick) + '\n')
    filex.write_lines(result_file, result)
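# Pipeline note: the file written here is the `source_file` that
# get_dior_details() above consumes, e.g. (paths illustrative):
#     get_dior_list('data/dior_list.txt')
#     get_dior_details('data/dior_list.txt', 'data/dior_details.txt')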
def test_browse(self):
    """
    Test access.

    From studying the encryption we know that Discuz stores the relevant
    data in the cookie. Decryption carries no expiry time, so it always
    succeeds; the password inside is then used as the check (that password
    is only used for this check and is unrelated to the real password).
    The cookie itself does expire, but if we save it we can keep using it
    forever; it never becomes invalid (unless the password is changed).
    """
    url = 'http://localhost/'
    r = netx.get(url, need_print=False)
    # The page contains '退出' (logout) only when logged in
    if '退出' in r:
        print('Logged in')
    else:
        print('Not logged in')
    r = netx.get(url, cookies=netx.parse_cookies_from_file(self.cookie_file), need_print=False)
    if '退出' in r:
        print('Logged in')
    else:
        print('Not logged in')
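# `netx.parse_cookies_from_file` is a project helper not shown here. A
# minimal sketch, assuming the file stores a browser-style `k1=v1; k2=v2`
# cookie string (the sketch name is ours, not the library's):
def parse_cookies_sketch(path):
    """Read a saved cookie string and return it as a dict."""
    with open(path, encoding='utf-8') as f:
        raw = f.read().strip()
    return dict(pair.split('=', 1) for pair in raw.split('; ') if '=' in pair)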
def get_in_thread(self, element, element_index, thread_id):
    # The signature apparently matches a thread-runner callback; the extra
    # parameters are unused here.
    url = self.url
    url += str(int(time.time() * 1000))  # cache-busting timestamp
    print(url)
    result = netx.get(url, need_print=False)
    # Strip the JSONP wrapper and parse the payload
    result = result[result.index('(') + 1:result.index(')')]
    result = json.loads(result)
    # Keep only 11-digit entries (valid mobile numbers)
    num_list = list(filter(lambda x: len(str(x)) == 11, result['numArray']))
    num_list = [str(x) for x in num_list]
    print(num_list)
    print(f'Fetched {len(num_list)} numbers')
    num_list = list(filter(lambda x: x not in self.numbers, num_list))
    print(f'{len(num_list)} numbers left after filtering')
    self.numbers.extend(num_list)
    print(f'{len(self.numbers)} numbers in total now')
def get_ysl_details(source_file, result_file):
    lines = filex.read_lines(source_file, ignore_line_separator=True)
    length = len(lines)
    for i in range(length):
        print('Fetching %d/%d' % (i + 1, length))
        line = lines[i]
        lipstick = Lipstick.from_string(line)
        # Some entries have no color specified; opening the URL redirects
        # to the default color.
        page = netx.get(lipstick.url, need_print=False)
        soup = BeautifulSoup(page, "html.parser")
        cover_image = soup.select_one('.primary_image')['src']
        color_image2 = soup.select_one('.product_tab_shades_left').select_one('.product_image.b-product_img')['src']
        price = soup.select_one('.product_price.price_sale.b-product_price-sale').text.strip()
        lipstick.img = ','.join((lipstick.img, color_image2, cover_image))
        lipstick.price = price
        filex.write_lines(result_file, [str(lipstick)], mode='a', add_line_separator=True)
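# Same list-then-details flow as the Dior pair: get_ysl_list() above writes
# the file this function reads (paths illustrative):
#     get_ysl_list('data/ysl_list.txt')
#     get_ysl_details('data/ysl_list.txt', 'data/ysl_details.txt')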
def get_num(self):
    url = self.url
    url += str(int(time.time() * 1000))  # cache-busting timestamp
    print(url)
    result = netx.get(url)
    # Strip the JSONP wrapper and parse the payload
    result = result[result.index('(') + 1:result.index(')')]
    result = json.loads(result)
    # Keep only 11-digit entries (valid mobile numbers)
    num_list = list(filter(lambda x: len(str(x)) == 11, result['numArray']))
    num_list = [str(x) for x in num_list]
    print(num_list)
    print(f'Fetched {len(num_list)} numbers')
    old_numbers = filex.read_lines(self.num_file_path, ignore_line_separator=True)
    print(f'{len(old_numbers)} numbers seen before')
    num_list = list(filter(lambda x: x not in old_numbers, num_list))
    print(f'{len(num_list)} numbers left after filtering')
    filex.write_lines(self.num_file_path, num_list, 'a', add_line_separator=True)
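# The response is JSONP (`callback({...})`); both get_num() and
# get_in_thread() above strip the wrapper by hand. A small helper could
# centralize this (a sketch, not part of the original module; it uses
# rindex so a ')' inside the payload cannot truncate it):
import json

def strip_jsonp(text):
    """Extract and parse the JSON payload from a JSONP response."""
    start = text.index('(') + 1
    end = text.rindex(')')
    return json.loads(text[start:end])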
def get_floor_of_page(tid, page, result_file):
    """
    Fetch every floor (post) on one page of a thread.
    :param tid: thread id
    :param page: page number
    :param result_file: result file
    :return:
    """
    url = 'http://tieba.baidu.com/p/%s?pn=%d' % (tid, page)
    html = netx.get(url)
    print('Start parsing')
    # Parse the result
    soup = BeautifulSoup(html, "html.parser")
    floor = soup.select('.l_post.j_l_post.l_post_bright')
    result = []
    for div in floor:
        # Entries whose class contains 'clearfix' are ads
        if 'clearfix' not in div['class']:
            data_field = json.loads(div['data-field'])
            # print(data_field)
            author = data_field['author']
            name = author['user_name']
            level_id = author['level_id']
            content = data_field['content']
            post_no = content['post_no']
            post_id = content['post_id']
            content = div.select_one('#post_content_' + str(post_id))
            if content is not None:
                content = content.text
            else:
                content = ''
            result.append(str(Floor(floor_no=post_no, name=name, level=level_id,
                                    content=content.lstrip())) + '\n')
    filex.write_lines(result_file, result, mode='a')
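# Usage sketch: fetch the first few pages of one thread (the tid and output
# path are illustrative values, not from the original source):
if __name__ == '__main__':
    for page_no in range(1, 4):
        get_floor_of_page('4945048', page_no, 'result/floors.txt')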