import pprint

import requests
from bs4 import BeautifulSoup


def get_all_page_urls(pageKeyDic, page_urls, all_page_number):
    """Generate the URL of every listing page by repeatedly incrementing
    the paging parameters found in pageKeyDic.

    pageKeyDic maps a query-parameter name to [initial_value, step], where
    step is either a scalar increment or a dict {digit_index: increment}
    for parameter values that embed several numbers.
    """
    url_0 = page_urls[0]
    url = page_urls[1]
    url_pageKeyDic = pageKeyDic
    previous_attrs_value_dict = {}
    all_url_list = []
    for i in range(0, all_page_number + 1):
        current_url = url
        for key, value in url_pageKeyDic.items():
            if isinstance(value[1], dict):
                # The value embeds several numbers separated by a non-digit
                # character: split on it, bump the numbers whose index
                # appears in the step dict, then rejoin.
                value0_list = list(value[0])
                value0_list_len = len(value0_list)
                ch = " "
                for ch_index in range(0, value0_list_len):
                    if not value0_list[ch_index].isdigit():
                        ch = value0_list[ch_index]
                        value0_list[ch_index] = " "
                value0_list_splited = "".join(value0_list).split(" ")
                value0_list_len = len(value0_list_splited)
                for index in range(0, value0_list_len):
                    if index in value[1]:
                        if i == 0:
                            previous_attrs_value_dict[index] = value0_list_splited[index]
                        value0_list_splited[index] = str(
                            int(previous_attrs_value_dict[index]) + int(value[1][index]))
                        previous_attrs_value_dict[index] = value0_list_splited[index]
                res_value = ch.join(value0_list_splited)
                current_url = current_url.replace(
                    "%s=%s" % (key, value[0]), "%s=%s" % (key, res_value))
            else:
                # Scalar step: bump the parameter by value[1] on every page.
                if i == 0:
                    previous_attrs_value_dict[key] = int(value[0])
                current_url = current_url.replace(
                    "%s=%s" % (key, value[0]),
                    "%s=%s" % (key, int(value[1]) + previous_attrs_value_dict[key]))
                previous_attrs_value_dict[key] = int(value[1]) + previous_attrs_value_dict[key]
        if get_url_domain(url) not in current_url:
            # Relative URL: resolve it against the site root.
            current_url = url_sifter(get_partial_url(url_0), current_url)
        all_url_list.append(current_url)
    return all_url_list
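# A hedged usage sketch (not part of the original module): shows the
# pageKeyDic shape that get_all_page_urls expects, as inferred from the
# code above. The URL and parameter name are made up for illustration.
def _demo_get_all_page_urls():
    page_urls = ["http://example.com/list?page=1",
                 "http://example.com/list?page=1"]
    # Scalar step: "page" grows by 1 per listing page. A dict step such as
    # {"p": ["1-20", {0: 1, 1: 20}]} would bump each embedded number instead.
    pageKeyDic = {"page": ["1", 1]}
    # Returns ["...?page=2", "...?page=3", "...?page=4", "...?page=5"].
    return get_all_page_urls(pageKeyDic, page_urls, all_page_number=3)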
def get_store(soup, url):
    """Find the link to a product's seller store page by searching the
    page for store-related keywords, from most to least specific."""
    # The passed-in soup is refreshed from the url before searching.
    soup = get_soup_by_request(url)
    # Keywords meaning "flagship store", "enter the store", "shop",
    # "enter shop" and "store", tried in that order.
    store_keys = [u'旗舰店', u'进入店', u'店铺', u'进店', u'店']
    for keyword in store_keys:
        res_url = _get_store_by_key(soup, keyword)
        if res_url is not None and res_url != []:
            test_url = url_sifter(url, res_url)
            # Debug: print the title of the candidate store page.
            print(get_soup_by_request(test_url).find('title').text)
            return url_sifter(url, res_url)
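# _get_store_by_key is defined elsewhere in this project. A minimal sketch
# of the behavior get_store relies on (an assumption, not the original
# helper): return the href of the first <a> whose text contains the keyword.
def _get_store_by_key_sketch(soup, keyword):
    for a_tag in soup.find_all("a"):
        if a_tag.text and keyword in a_tag.text:
            return a_tag.get("href")
    return None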
def get_nav_in_url(soup, url, parser_method):
    """Extract category navigation links, trying three strategies in
    order: 1) a dedicated all-categories page, 2) an element with a
    nav-like class, 3) a heuristic scan of <ul> tags."""
    # Method 1: look for a link to the all-categories page.
    allCategory_page_url = get_allCategory_from_Key(soup=soup)
    if allCategory_page_url is not None and "javascript" not in allCategory_page_url:
        allCategory_page_url = url_sifter(url, allCategory_page_url)
        a_url_list = category_page_parser(allCategory_page_url, url, parser_method)
        return 1, a_url_list
    else:
        # Method 2: navigation element identified by a nav-like class.
        nav = get_nav_by_class_nav(soup, url)
        if nav is None:
            # Method 3: fall back to scanning <ul> tags.
            nav = get_nav_by_tag_ul(soup, url)
            way_number = 3
        else:
            way_number = 2
        if nav is None:
            return -1, None
        else:
            print("parse method: %d,%d" % (parser_method, way_number))
            return way_number, get_aTag_url_integration(nav, url)
def get_aTag_url_integration(original_data, domain):
    """Collect every <a> tag in the given HTML fragment as a
    [link_text, absolute_url] pair."""
    tmp_soup = get_soup_by_html_source(str(original_data))
    a_list = tmp_soup.find_all("a")
    a_url_res = []
    for tag in a_list:
        a_url_res.append([tag.text, url_sifter(domain, tag.get("href"))])
    return a_url_res
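# A hedged usage sketch: runs a small HTML fragment through
# get_aTag_url_integration. Assumes get_soup_by_html_source parses raw HTML
# (e.g. BeautifulSoup(html, 'lxml')) and url_sifter resolves relative URLs
# against the given base, as their call sites above suggest.
def _demo_get_aTag_url_integration():
    nav_html = ('<ul><li><a href="/phones">Phones</a></li>'
                '<li><a href="/laptops">Laptops</a></li></ul>')
    # Expected: [["Phones", "http://example.com/phones"],
    #            ["Laptops", "http://example.com/laptops"]]
    return get_aTag_url_integration(nav_html, "http://example.com")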
def get_categoryList_method_in_index_url(url):
    """Entry point: fetch the index page and, if it links to an
    all-categories page, parse that page's category URLs."""
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    allCategory_page_url = get_allCategory_from_Key(soup=soup)
    if allCategory_page_url is not None:
        # This actually steps into the next page; in a crawler framework
        # it would normally be written as a callback.
        allCategory_page_url = url_sifter(url, allCategory_page_url)
        print(allCategory_page_url)
        # category_page_parser takes (url, domain, parser_method); pass the
        # index url as the domain and use the plain-requests parser.
        url_list = category_page_parser(allCategory_page_url, url, 1)
        print(len(url_list))
def category_page_parser(url, domain, parser_method):
    """Find the category links on a category page by grouping every <a>
    tag by its tag path (e.g. html/body/div/ul/li/a) and keeping the path
    that occurs most often: category menus repeat the same markup."""
    if parser_method == 1:
        soup = get_soup_by_request(url)
    else:
        soup = get_soup_by_selenium_with_sleep(url)
    tagPath_to_appearCount = {}
    tagPath_to_allTagInPath = {}
    max_appear_tag_path = ""
    max_appear_tag_number = 0
    for current_tag in soup.find_all("a"):
        # Tag path such as html/body/div/div/ul/li/a.
        tag_path = get_tag_path(current_tag)
        # Count how often each tag path appears.
        if tag_path in tagPath_to_appearCount:
            tagPath_to_appearCount[tag_path] += 1
            tagPath_to_allTagInPath[tag_path].append(current_tag)
        else:
            tagPath_to_appearCount[tag_path] = 1
            tagPath_to_allTagInPath[tag_path] = [current_tag]
    # Keep the path shared by the largest number of <a> tags.
    for key, value in tagPath_to_appearCount.items():
        if max_appear_tag_number < value:
            max_appear_tag_number = value
            max_appear_tag_path = key
    all_category_tag_list = tagPath_to_allTagInPath[max_appear_tag_path]
    print(len(all_category_tag_list))
    a_url_list = []
    for tag in all_category_tag_list:
        a_url_list.append([tag.text, url_sifter(domain, tag.get("href"))])
    return a_url_list
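# get_tag_path is defined elsewhere in this project. A minimal sketch of
# the behavior category_page_parser depends on (an assumption, not the
# original helper): walk from the tag up to the document root and join the
# tag names with '/'.
def _get_tag_path_sketch(tag):
    names = []
    while tag is not None and tag.name is not None and tag.name != '[document]':
        names.append(tag.name)
        tag = tag.parent
    return "/".join(reversed(names))  # e.g. html/body/div/ul/li/a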
def analysis_by_tag(goods_list_tag, url):
    """Collect product-detail URLs from a goods-list container.

    The listing page carries some product information itself (price,
    review counts), but we skip it here and scrape it on the product
    detail page instead.
    """
    detail_url_set = set()
    for each_tag in goods_list_tag.contents:
        if each_tag.name is not None:
            current_url_list = []
            for inner_tag in each_tag.descendants:
                if inner_tag.name is not None and inner_tag.name == 'a':
                    try:
                        detail_url = url_sifter(url=inner_tag['href'], parent_url=url)
                        if ('javascript' not in detail_url
                                and 'list' not in detail_url
                                and 'search' not in detail_url
                                and detail_url not in current_url_list):
                            # When one URL is a substring of another, keep
                            # only the shorter of the two.
                            current_url_list_len = len(current_url_list)
                            check_flag = True
                            for i in range(0, current_url_list_len):
                                if detail_url in current_url_list[i]:
                                    current_url_list[i] = detail_url
                                    check_flag = False
                                    break
                                elif current_url_list[i] in detail_url:
                                    check_flag = False
                                    break
                            if check_flag:
                                current_url_list.append(detail_url)
                    except Exception:
                        # Tag without an href, or an unparsable URL.
                        pass
            detail_url_set = detail_url_set | set(current_url_list)
    # Cluster the URLs by pattern and keep the largest cluster: detail
    # links share one URL template, while ads and navigation do not.
    res_detail_urls_list = urls_clustering(list(detail_url_set))
    res_max_len = -1
    res_max_list = []
    for i in res_detail_urls_list:
        i_len = len(i)
        if res_max_len < i_len:
            res_max_len = i_len
            res_max_list = i
    # Debug output.
    pprint.pprint(res_max_list)
    print(len(res_max_list))
    return res_max_list
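# urls_clustering is defined elsewhere in this project. A minimal sketch of
# the idea analysis_by_tag relies on (an assumption, not the original
# helper): group URLs whose digit-free skeleton is identical, so links
# stamped from the same template fall into one cluster.
import re
from collections import defaultdict


def _urls_clustering_sketch(urls):
    clusters = defaultdict(list)
    for u in urls:
        clusters[re.sub(r'\d+', 'N', u)].append(u)
    return list(clusters.values())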