def get_all_page_urls(pageKeyDic, page_urls, all_page_number):
    """Step the paging parameters in ``pageKeyDic`` through every page and
    return the URL of each listing page."""
    url_0 = page_urls[0]
    url = page_urls[1]
    url_pageKeyDic = pageKeyDic
    previous_attrs_value_dict = {}
    all_url_list = []
    for i in range(0, all_page_number + 1):
        current_url = url
        for key, value in url_pageKeyDic.items():
            if isinstance(value[1], dict):
                # Compound value such as "1-2-30": blank out the separator,
                # step the numeric segments named in value[1], then join the
                # segments back together with the separator.
                value0_list = list(value[0])
                ch = " "
                for ch_index in range(len(value0_list)):
                    if not value0_list[ch_index].isdigit():
                        ch = value0_list[ch_index]
                        value0_list[ch_index] = " "
                value0_list_splited = "".join(value0_list).split(" ")
                for index in range(len(value0_list_splited)):
                    if index in value[1]:
                        if i == 0:
                            previous_attrs_value_dict[index] = value0_list_splited[index]
                        value0_list_splited[index] = str(
                            int(previous_attrs_value_dict[index]) + int(value[1][index]))
                        previous_attrs_value_dict[index] = value0_list_splited[index]
                res_value = ch.join(value0_list_splited)
                current_url = current_url.replace(
                    "%s=%s" % (key, value[0]), "%s=%s" % (key, res_value))
            else:
                # Plain counter: add the per-page step to the running value.
                if i == 0:
                    previous_attrs_value_dict[key] = int(value[0])
                current_url = current_url.replace(
                    "%s=%s" % (key, value[0]),
                    "%s=%s" % (key, int(value[1]) + previous_attrs_value_dict[key]))
                previous_attrs_value_dict[key] = (
                    int(value[1]) + previous_attrs_value_dict[key])
        if get_url_domain(url) not in current_url:
            # Relative page URL: resolve it against the first page's URL.
            current_url = url_sifter(get_partial_url(url_0), current_url)
        all_url_list.append(current_url)
    return all_url_list
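
# A minimal usage sketch (hypothetical URL). Judging from the loop above,
# pageKeyDic maps a query parameter to (first_value, per_page_step) for a
# plain counter, or to (first_value, {segment_index: step}) when the value
# is a compound string such as "1-2-30":
#
#   page_key_dic = {"page": ("1", "1")}
#   urls = get_all_page_urls(
#       page_key_dic,
#       ["http://example.com/list", "http://example.com/list?page=1"],
#       all_page_number=3)
#   # -> [".../list?page=2", ".../list?page=3", ".../list?page=4", ".../list?page=5"]
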
def get_nav_in_url(soup, url, parser_method):
    # Method 1: look for an "all categories" page.
    allCategory_page_url = get_allCategory_from_Key(soup=soup)
    if allCategory_page_url is not None and "javascript" not in allCategory_page_url:
        allCategory_page_url = url_sifter(url, allCategory_page_url)
        a_url_list = category_page_parser(allCategory_page_url, url, parser_method)
        return 1, a_url_list
    else:
        # Methods 2 and 3: fall back to a nav element found by class name,
        # then to plain <ul> scanning.
        nav = get_nav_by_class_nav(soup, url)
        if nav is None:
            nav = get_nav_by_tag_ul(soup, url)
            way_number = 3
        else:
            way_number = 2
        if nav is None:
            return -1, None
        return way_number, get_aTag_url_integration(nav, url)
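
# Return convention: way_number 1 means the "all categories" page was used,
# 2 a nav found by class name, 3 a nav found by <ul> scanning, -1 nothing
# found. Usage sketch (hypothetical URL; parser_method 1 = requests,
# 2 = selenium, following category_page_parser below):
#
#   way, links = get_nav_in_url(soup, "http://example.com", parser_method=1)
#   if way != -1:
#       for text, href in links:
#           print(text, href)
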
def get_aTag_url_integration(original_data, domain):
    """Collect every <a> in ``original_data`` as a [text, absolute-url] pair."""
    tmp_soup = get_soup_by_html_source(str(original_data))
    a_url_res = []
    for tag in tmp_soup.find_all("a"):
        a_url_res.append([tag.text, url_sifter(domain, tag.get("href"))])
    return a_url_res
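
# A self-contained sketch of the same [text, absolute-url] pairing, assuming
# url_sifter resolves relative hrefs roughly like urllib.parse.urljoin (an
# assumption; the real helper also filters bad URLs):
#
#   from urllib.parse import urljoin
#   from bs4 import BeautifulSoup
#   html = '<ul><li><a href="/shoes">Shoes</a></li></ul>'
#   soup = BeautifulSoup(html, "lxml")
#   pairs = [[a.text, urljoin("http://example.com", a.get("href"))]
#            for a in soup.find_all("a")]
#   # -> [['Shoes', 'http://example.com/shoes']]
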
def get_categoryList_method_in_index_url(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    allCategory_page_url = get_allCategory_from_Key(soup=soup)
    if allCategory_page_url is not None:
        # This actually steps into the next page; normally written as a callback.
        allCategory_page_url = url_sifter(url, allCategory_page_url)
        # category_page_parser also needs the domain and a parser method;
        # 1 selects requests-based parsing.
        url_list = category_page_parser(allCategory_page_url, url, 1)
        return url_list
    return None
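
# The comment above notes this would normally be a callback; a minimal,
# hypothetical sketch of that shape in a Scrapy-style spider (names and
# structure are assumptions, not part of this module):
#
#   def parse_index(self, response):
#       soup = BeautifulSoup(response.text, "lxml")
#       all_category_url = get_allCategory_from_Key(soup=soup)
#       if all_category_url:
#           yield response.follow(all_category_url, callback=self.parse_category)
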
def deep_search_get_searchUrl_and_keyword_in_soup(soup, url):
    """Scan every <a> for a search link and report how the keyword was
    matched: as plain text, as percent-encoded text, or by regex."""
    res_url = None
    res_key = None
    res_method = ""
    for a in soup.find_all('a'):
        try:
            next_url = a.get('href')
            http_code_key = quote(a.text.encode('utf-8'))
            original_key = a.text
            if (next_url is not None and 'javascript' not in next_url
                    and http_code_key not in (None, '')
                    and original_key not in (None, '')):
                if 'search' in next_url:
                    if original_key in next_url:
                        res_url = next_url
                        res_key = original_key
                        res_method = "ORIGINALKEY"
                        break
                    if http_code_key in next_url:
                        res_url = next_url
                        res_key = http_code_key
                        res_method = "HTTPENCODEKEY"
                        break
                    # Fall back to pulling a percent-encoded run out of the URL.
                    re_str = r'(%[\w\d]{2,4}\d*)+'
                    if re.search(re_str, next_url):
                        res_key = re.search(re_str, next_url).group()
                        res_url = next_url
                        res_method = "REGULARHTTP"
                        break
        except Exception:
            pass
    return [url_sifter(parent_url=url, url=res_url), res_key, res_method]
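
# How the three modes fire, sketched on hypothetical anchors:
#   <a href="/search?key=shoes">shoes</a>   -> "ORIGINALKEY"
#     (the visible text appears verbatim in the href)
#   <a href="/search?q=%E8%8C%B6">茶</a>    -> "HTTPENCODEKEY"
#     (quote("茶".encode("utf-8")) == "%E8%8C%B6", which appears in the href)
#   <a href="/search?q=%E9%9E%8B">hot</a>   -> "REGULARHTTP"
#     (neither text form matches, so the percent-encoded run is pulled
#      out of the href by the regex instead)
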
def category_page_parser(url, domain, parser_method):
    if parser_method == 1:
        soup = get_soup_by_request(url)
    else:
        soup = get_soup_by_selenium_with_sleep(url)
    tagPath_to_appearCount = {}
    tagPath_to_allTagInPath = {}
    max_appear_tag_path = ""
    max_appear_tag_number = 0
    for current_tag in soup.find_all("a"):
        # Get the 'tag-path' such as html/body/div/div/ul/li/a.
        tag_path = get_tag_path(current_tag)
        # Count how often this tag-path has appeared.
        if tag_path in tagPath_to_appearCount:
            tagPath_to_appearCount[tag_path] += 1
        else:
            tagPath_to_appearCount[tag_path] = 1
            tagPath_to_allTagInPath[tag_path] = []
        tagPath_to_allTagInPath[tag_path].append(current_tag)
    # The path shared by the most <a> tags is taken as the category menu,
    # since menus repeat one DOM shape many times.
    for key, value in tagPath_to_appearCount.items():
        if max_appear_tag_number < value:
            max_appear_tag_number = value
            max_appear_tag_path = key
    all_category_tag_list = tagPath_to_allTagInPath[max_appear_tag_path]
    a_url_list = []
    for tag in all_category_tag_list:
        a_url_list.append([tag.text, url_sifter(domain, tag.get("href"))])
    return a_url_list
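
# A self-contained sketch of the tag-path ranking idea above. get_tag_path
# is assumed (per the comment in the loop) to return the slash-joined
# ancestor chain such as "html/body/div/ul/li/a"; _demo_tag_path below is a
# hypothetical stand-in, not the real helper.
def _demo_tag_path(tag):
    # Walk up through the parents, root first, and append the tag itself.
    names = [p.name for p in tag.parents if p.name not in (None, "[document]")]
    return "/".join(reversed(names)) + "/" + tag.name

# Usage sketch: count paths over all <a> tags, then take the most frequent.
#   counts = {}
#   for a in soup.find_all("a"):
#       path = _demo_tag_path(a)
#       counts[path] = counts.get(path, 0) + 1
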
def get_pageUrls_and_all_pageNumber(url):
    driver = get_webdriver()
    attempts = 0
    ATTEMPT_TIMES = 3  # retry up to 3 times on failure
    page_url_list = []
    all_page_number = -1
    while attempts < ATTEMPT_TIMES:
        driver.get(url)
        time.sleep(3)
        soup = get_soup_by_html_source(driver.page_source)
        is_find_page3_url = False
        # We are on page 1: find the "2" link in the pager, then use it to
        # locate the "3" link.
        element_2_list = soup.find_all("a", text="2")
        number_to_url_dic = {}
        for elem in element_2_list:
            find_parent_times = 0
            while find_parent_times < 4 and is_find_page3_url is False:
                if find_parent_times == 0:
                    descendants_list = elem.parent.contents
                elif find_parent_times == 1:
                    descendants_list = elem.parent.descendants
                else:
                    # Climb further up and scan the whole subtree.
                    elem_ancestor = elem
                    for up_times in range(0, find_parent_times):
                        elem_ancestor = elem_ancestor.parent
                    descendants_list = elem_ancestor.descendants
                is_find_number = False
                for descendant in descendants_list:
                    if descendant.name == 'a' and descendant.text == '3':
                        number_to_url_dic['2'] = elem.get("href")
                        number_to_url_dic['3'] = descendant.get("href")
                        number_to_url_dic['attrs_dic2'] = elem.attrs
                        number_to_url_dic['attrs_dic3'] = descendant.attrs
                        is_find_page3_url = True
                    if descendant.name == 'a':
                        if descendant.text == '3':
                            is_find_number = True
                        elif is_find_number and descendant.text.isdigit():
                            # Numbered links after "3" are page numbers.
                            all_page_number = max(int(descendant.text), all_page_number)
                    if is_find_number and descendant.name is not None:
                        # Look for a "... N 页" total-page label near the pager.
                        allpage_text = descendant.parent.parent.parent.text
                        try:
                            tmp_number = int(
                                re.search(
                                    r"\d+",
                                    re.search(u"\\d+\\s*页", allpage_text).group()).group())
                            all_page_number = max(tmp_number, all_page_number)
                        except Exception:
                            pass
                find_parent_times += 1
            if is_find_page3_url and all_page_number != -1:
                break
        try:
            url_2 = number_to_url_dic['2']
            url_3 = number_to_url_dic['3']
        except KeyError:
            # Pager links not found on this attempt: retry.
            attempts += 1
            continue
        # Handle fake URLs such as "#" or "javascript:": click through with
        # the driver and read driver.current_url instead.
        if url_2.lower() == url_3.lower():
            url_2 = get_url_by_attrs_dic(driver, number_to_url_dic["attrs_dic2"])
            url_3 = get_url_by_attrs_dic(driver, number_to_url_dic["attrs_dic3"])
            print("debug:%s" % url_2)
            if url_2.lower() == url_3.lower():
                # Still identical after the dynamic jump: skip this URL.
                driver.close()
                return None
        page_url_list = [url, url_sifter(url, url_2), url_sifter(url, url_3)]
        break
    driver.close()
    return all_page_number, page_url_list
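
# A hedged usage sketch (hypothetical site). The function returns the total
# page count plus the URLs of pages 1-3; a caller can presumably diff those
# URLs to recover the paging parameter consumed by get_all_page_urls above:
#
#   all_page_number, page_urls = get_pageUrls_and_all_pageNumber(
#       "http://example.com/list?page=1")
#   # all_page_number -> e.g. 27
#   # page_urls -> ["...?page=1", "...?page=2", "...?page=3"]
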
def get_next_urlList_by_firstpage_url(url):
    driver = webdriver.PhantomJS()
    attempts = 0
    ATTEMPT_TIMES = 3  # retry up to 3 times on failure
    page_url_list = []
    while attempts < ATTEMPT_TIMES:
        driver.get(url)
        time.sleep(3)
        print(driver.current_url)
        soup = get_soup_by_html_source(driver.page_source)
        is_find_page3_url = False
        # We are on page 1: find the "2" link in the pager, then use it to
        # locate the "3" link.
        element_2_list = soup.find_all("a", text="2")
        number_to_url_dic = {}
        for elem in element_2_list:
            find_parent_times = 0
            while find_parent_times < 4 and is_find_page3_url is False:
                if find_parent_times == 0:
                    descendants_list = elem.parent.contents
                elif find_parent_times == 1:
                    descendants_list = elem.parent.descendants
                else:
                    # Climb further up and scan the whole subtree.
                    elem_ancestor = elem
                    for up_times in range(0, find_parent_times):
                        elem_ancestor = elem_ancestor.parent
                    descendants_list = elem_ancestor.descendants
                for descendant in descendants_list:
                    if descendant.name == 'a' and descendant.text == '3':
                        number_to_url_dic['2'] = elem.get("href")
                        number_to_url_dic['3'] = descendant.get("href")
                        number_to_url_dic['attrs_dic2'] = elem.attrs
                        number_to_url_dic['attrs_dic3'] = descendant.attrs
                        is_find_page3_url = True
                find_parent_times += 1
            if is_find_page3_url:
                break
        try:
            url_2 = number_to_url_dic['2']
            url_3 = number_to_url_dic['3']
        except KeyError:
            # Pager links not found on this attempt: retry.
            attempts += 1
            continue
        # Handle fake URLs such as "#" or "javascript:": click through with
        # the driver and read driver.current_url instead.
        if url_2.lower() == url_3.lower():
            url_2 = get_url_by_attrs_dic(driver, number_to_url_dic["attrs_dic2"])
            url_3 = get_url_by_attrs_dic(driver, number_to_url_dic["attrs_dic3"])
            print("debug:%s" % url_2)
            if url_2.lower() == url_3.lower():
                # Still identical after the dynamic jump: skip this URL.
                driver.close()
                return None
        page_url_list = [url, url_sifter(url, url_2), url_sifter(url, url_3)]
        break
    driver.close()
    return page_url_list
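
# Same discovery walk as get_pageUrls_and_all_pageNumber, minus the total
# page count. Usage sketch (hypothetical URL):
#   page_urls = get_next_urlList_by_firstpage_url("http://example.com/list")
#   # -> [first_page_url, page_2_url, page_3_url], or None when the page-2
#   #    and page-3 links stay identical even after the dynamic jump
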
def analysis_by_tag_return_goods_message(goods_list_tag, url):
    pic_size_regular = r'\d{2,}x\d{2,}'

    def largest_dimension(pic_url):
        # Parse a "400x400"-style size token out of the image URL; -1 if absent.
        try:
            re_res = re.search(pic_size_regular, pic_url)
            if re_res:
                width, height = re_res.group().split('x')
                return max(int(width), int(height))
        except Exception:
            pass
        return -1

    res_goods_list = []
    for each_tag in goods_list_tag.contents:
        res_pic_url = ''
        res_price = ''
        res_detail_url = ''
        res_title = ''
        max_title_len = -1
        max_pic_size = -1
        res_goods_dict = {}
        if each_tag.name is None:
            continue
        for inner_tag in each_tag.descendants:
            # The goods-list page itself carries some information; full
            # details are scraped from the goods detail page instead.
            if inner_tag.name is None or not is_single_tag(inner_tag):
                continue
            is_in_some_attri = False
            tag_text = inner_tag.text.replace('\n', "")
            # Detail URL.
            if res_detail_url == '':
                try:
                    detail_url = url_sifter(url=inner_tag['href'], parent_url=url)
                    if ('javascript' not in detail_url
                            and 'list' not in detail_url
                            and 'search' not in detail_url
                            and detail_url and ' ' not in detail_url
                            and 'cart' not in detail_url):
                        res_detail_url = detail_url
                        is_in_some_attri = True
                except Exception:
                    pass
            # Price: the first "12.34"-looking number.
            re_res = re.search(r'\d+\.+\d+', tag_text)
            if re_res and res_price == '':
                res_price = re_res.group()
            # Images: keep the largest picture seen so far.
            if inner_tag.name == 'img':
                for attr_k, attr_v in inner_tag.attrs.items():
                    if not isinstance(attr_k, str):
                        continue
                    if attr_k in ('class', 'height', 'width'):
                        continue
                    pic_url = attr_v
                    if (('jpg' in pic_url or 'jpeg' in pic_url)
                            and 'none' not in pic_url):
                        pic_size = largest_dimension(pic_url)
                        if res_pic_url == '':
                            res_pic_url = pic_url_sifter(url, pic_url)
                            if pic_size != -1:
                                max_pic_size = pic_size
                        elif pic_size > max_pic_size:
                            max_pic_size = pic_size
                            res_pic_url = pic_url_sifter(url, pic_url)
                        is_in_some_attri = True
            # Background images declared in a style attribute.
            tag_style = inner_tag.get('style')
            if tag_style:
                re_res = re.search(r'url\w*\(\S+\)', str(tag_style))
                if re_res:
                    pic_url = re_res.group().split('(')[1].split(')')[0]
                    if 'jpg' in pic_url or 'jpeg' in pic_url:
                        if res_pic_url == '':
                            res_pic_url = pic_url
                        else:
                            pic_size = largest_dimension(pic_url)
                            if pic_size > max_pic_size:
                                max_pic_size = pic_size
                                res_pic_url = pic_url
                    is_in_some_attri = True
            # Title: the longest plain-text run not consumed above.
            if is_in_some_attri is False and inner_tag.name is not None:
                tag_text = inner_tag.text.replace('\n', "").replace(' ', '')
                if re.search(r'<\w+[^>]*>', str(tag_text)):
                    continue
                if len(tag_text) > max_title_len:
                    max_title_len = len(tag_text)
                    res_title = tag_text
        res_goods_dict['title'] = res_title
        res_goods_dict['price'] = res_price
        res_goods_dict['pic_url'] = res_pic_url
        res_goods_dict['detail_url'] = res_detail_url
        res_goods_list.append(res_goods_dict)
    return res_goods_list
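
# Each entry of the returned list has this shape (keys fixed by the code
# above, values illustrative; an empty string means the field was not found
# in that product card):
#   {"title": "Some product name", "price": "59.00",
#    "pic_url": "http://example.com/img/p1_400x400.jpg",
#    "detail_url": "http://example.com/item/123.html"}
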
def analysis_json_data(url, soup):
    rank_dic = {}

    def get_json_path(container, json_path):
        # Rank every path that leads to a list; goods arrays repeat a lot.
        if isinstance(container, dict):
            for key, value in container.items():
                get_json_path(value, json_path + "/" + key)
        elif isinstance(container, list):
            if json_path in rank_dic:
                rank_dic[json_path] += 1
            else:
                rank_dic[json_path] = 1
            for next_container in container:
                get_json_path(next_container, json_path + "/a_list")

    # Take the longest {...} blob found in a <script> tag as the page's JSON.
    shop_json = ""
    maxlen = -1
    for y in soup.find_all("script"):
        for x in re.findall(r"\{.*\}", str(y)):
            if len(x) > maxlen:
                maxlen = len(x)
                shop_json = x
    json_parser = json.loads(shop_json)
    get_json_path(json_parser, "")

    # Among frequently-hit paths that pass through a list, pick the most
    # common prefix: that prefix should address the goods array.
    second_dic = {}
    max_path_str = ""
    max_path_len = -1
    for key, value in rank_dic.items():
        if value > 20 and "a_list" in key:
            tmp_str = key.split('a_list')[0]
            if tmp_str in second_dic:
                second_dic[tmp_str] += 1
            else:
                second_dic[tmp_str] = 1
            if second_dic[tmp_str] > max_path_len:
                max_path_len = second_dic[tmp_str]
                max_path_str = tmp_str

    def not_empty(s):
        return s and s.strip()

    # Walk the winning path down to the goods list.
    json_key_list = list(filter(not_empty, max_path_str.split('/')))
    res_dic = json_parser
    for json_key in json_key_list:
        res_dic = res_dic[json_key]

    res_goods_dic_list = []
    for li in res_dic:
        res_goods_dic = {}
        if not isinstance(li, dict):
            continue
        for key, value in li.items():
            if key is None or value is None:
                continue
            if "price" in key and 'price' not in res_goods_dic:
                res_goods_dic['price'] = value
            elif "title" in key and 'title' not in res_goods_dic:
                # Strip any HTML fragments embedded in the title.
                res_goods_dic['title'] = re.sub('(?is)<.*?>', '', value)
            elif "detail" in key and 'detail_url' not in res_goods_dic:
                res_goods_dic['detail_url'] = url_sifter(url, value)
            elif (("img" in str(key) or 'pic' in str(key)
                   or ".jpg" in str(value) or '.png' in str(value))
                  and 'pic_url' not in res_goods_dic):
                res_goods_dic['pic_url'] = pic_url_sifter(url, value)
        res_goods_dic_list.append(res_goods_dic)
    return res_goods_dic_list
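
# A self-contained sketch of the path-ranking idea in analysis_json_data:
# count how often each JSON path leads to a list, then treat the most
# frequent list path as the goods array. Toy data, for illustration only.
def _sketch_rank_list_paths(data, path="", rank=None):
    rank = {} if rank is None else rank
    if isinstance(data, dict):
        for k, v in data.items():
            _sketch_rank_list_paths(v, path + "/" + k, rank)
    elif isinstance(data, list):
        # Only list-bearing paths are counted, as in get_json_path above.
        rank[path] = rank.get(path, 0) + 1
        for item in data:
            _sketch_rank_list_paths(item, path + "/a_list", rank)
    return rank

# _sketch_rank_list_paths({"page": {"goods": [{"sku": 1}, {"sku": 2}]}})
# -> {"/page/goods": 1}
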