def second_layer_processor(layer_description, exceptions_log_file):
    """Walk every category page of the second layer and collect venue URLs.

    For each (category_url, category_name) pair in
    layer_description['items_urls_and_descriptions'] the category page is
    fetched, item links are extracted, and extra result pages (the
    'Go to Page:' pagination block) are followed via
    second_layer_single_page_processor.

    Returns a dict mapping each item href to
    {'categories': [category_name, ...]} (an href found under several
    categories accumulates all of their names).
    """
    next_layer_url_dict = dict()
    print 'From second_layer_processor, layer_description: ', layer_description
    info_container_xpath = layer_description['info_container_xpath']
    info_tag_xpath = layer_description['info_tag_xpath']
    for category_url, category_name in layer_description['items_urls_and_descriptions'].items():
        #for category_url in layer_description['items_urls_and_descriptions']:
        url = '%s%s' % (root_site_url, category_url)
        print 'second_layer_processor, root_site_url/category_url: ', url
        #info_items_set = set()
        #html = get_html(url, user_agent_header)
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        tree = etree.parse(StringIO(html), parser)
        info_container = tree.xpath(info_container_xpath)
        #tree.write('fundays_second_layer_pretty.html', pretty_print=True)
        if info_container != []:
            print 'second_layer_processor, info_container != []'
            # Every second matched tag is taken ([::2]); presumably the list
            # contains duplicate anchors per item -- TODO confirm.
            info_items = info_container[0].xpath(info_tag_xpath)[::2]
            for info_item in info_items:
                print 'second_layer_processor, info_item.attrib[href]: ', info_item.attrib['href']
                #info_items_set.add(info_item.attrib['href'])
                if next_layer_url_dict.has_key(info_item.attrib['href']):
                    next_layer_url_dict[info_item.attrib['href']]['categories'].append(category_name)
                else:
                    next_layer_url_dict[info_item.attrib['href']] = {'categories' : [category_name]}
        else:
            print 'second_layer_processor, info_container == []'
        print 'second_layer_processor, next_layer_url_dict: ', next_layer_url_dict
        # NOTE(review): this indexes info_container[0] even when the
        # container matched nothing (the empty branch above), which would
        # raise IndexError -- confirm against real pages.
        other_pages_list_container = info_container[0].xpath(layer_description['other_category_pages_xpath'])
        print 'second_layer_processor, other_pages_list_container: ', other_pages_list_container
        for page_container in other_pages_list_container:
            print 'second_layer_processor, page_container: ', page_container
            # Pagination block is recognised by its literal label text.
            if page_container.text != None and page_container.text.strip() == 'Go to Page:':
                print 'second_layer_processor, page_container.text.strip(): ', page_container.text.strip()
                print 'second_layer_processor, len(page_container.getchildren())', len(page_container.getchildren())
                # Page 1 is the page already scraped; extra pages start at 2
                # and are addressed as '<url>-<n>'.
                next_page_index = 2
                print 'second_layer_processor, next_page_index: ', next_page_index
                for child in page_container.getchildren()[1:]:
                    print 'second_layer_processor, next page url: %s-%s' % (url, str(next_page_index))
                    urls_from_next_page = second_layer_single_page_processor('%s-%s' % (url, str(next_page_index)), info_container_xpath, info_tag_xpath, exceptions_log_file)
                    print 'second_layer_processor, urls_from_next_page: ', urls_from_next_page
                    for url_from_next_page in urls_from_next_page:
                        print 'second_layer_processor, url_from_next_page: ', url_from_next_page
                        if next_layer_url_dict.has_key(url_from_next_page):
                            #next_layer_url_dict[url_from_next_page].append(category_name)
                            next_layer_url_dict[url_from_next_page]['categories'].append(category_name)
                        else:
                            #next_layer_url_dict[url_from_next_page] = [category_name]
                            next_layer_url_dict[url_from_next_page] = {'categories' : [category_name]}
                    #next_layer_url_set = next_layer_url_set.union(second_layer_single_page_processor('%s-%s' % (url, str(next_page_index)), info_container_xpath, info_tag_xpath))
                    next_page_index = next_page_index + 1
            else:
                # NOTE(review): message looks copy-pasted; this branch means
                # the container is not the pagination block.
                print 'info_container == []'
    return next_layer_url_dict
def get_topic_list(board_url):
    """Download a board page and parse it into a topic list.

    board_url -- utf-8 encoded byte string with the board URL.
    Returns whatever TopicParser.feed() produces, or None when the page
    could not be downloaded.
    """
    page_source = get_html(board_url.decode('utf8'))
    if page_source is None:
        # BUG FIX: the original called .replace() on page_source *before*
        # this None check, so a failed download raised AttributeError
        # instead of returning None.
        return None
    # Hack quotation marks.
    # NOTE(review): '\"' denotes the same character as '"', so this replace
    # is currently a no-op; if the intent was to backslash-escape quotes the
    # replacement should be '\\"' -- confirm before changing.
    page_source = page_source.replace('"', '\"')
    topic_parser = TopicParser()
    return topic_parser.feed(page_source)
def fourth_layer_media_processor(url, img_xpath, venue_id, exceptions_log_file):
    """Scrape the venue's /media page, download every image found by
    img_xpath, and return the list of image source URLs.

    Downloaded files are named '<venue_id>_<running index>' and stored in
    the module-level img_dir.
    """
    page_html = get_html('%s/media' % url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
    page_tree = etree.parse(StringIO(page_html), parser)
    # Each matched <img> node contributes its src attribute.
    src_urls = [img_node.attrib['src'] for img_node in page_tree.xpath(img_xpath)]
    for index, src_url in enumerate(src_urls):
        img_download(src_url, '%s_%s' % (str(venue_id), str(index)), img_dir, exceptions_log_file)
    return src_urls
def first_layer_processor(layer_description, exceptions_log_file): next_layer_url_list = list() for url in layer_description['url_list']: html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) parser = etree.HTMLParser() tree = etree.parse(StringIO(html), parser) info_container = tree.xpath(layer_description['info_container_xpath']) info_items = info_container[0].xpath(layer_description['info_tag_xpath']) for info_item in info_items: next_layer_url_list.append(info_item.attrib['href']) print 'From first layer processor: next_url_list length %s, next_url_list %s' % (len(next_layer_url_list), next_layer_url_list) return next_layer_url_list
def first_layer_processor(layer_description, exceptions_log_file): next_layer_url_dict = dict() for url in layer_description['items_urls_and_descriptions'].keys(): #html = get_html(url, user_agent_header) html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) parser = etree.HTMLParser() tree = etree.parse(StringIO(html), parser) info_container = tree.xpath(layer_description['info_container_xpath']) #print info_container #print(etree.tostring(info_container[0], pretty_print=True)) info_items = info_container[0].xpath(layer_description['info_tag_xpath']) for info_item in info_items: #next_layer_url_list.append(info_item.attrib['href']) #next_layer_url_list.append(info_item.attrib['href']) next_layer_url_dict[info_item.attrib['href']] = info_item.attrib['title'] print 'From first layer processor: next_url_list length %s, next_url_list %s' % (len(next_layer_url_dict), next_layer_url_dict) return next_layer_url_dict
def second_layer_processor(layer_description, exceptions_log_file): next_layer_url_list = list() for url in layer_description['url_list']: info_items = list() html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) tree = etree.parse(StringIO(html), parser) info_container = tree.xpath(layer_description['info_container_all_xpath']) if info_container != []: info_items = info_container[0].xpath(layer_description['info_tag_xpath']) else: info_container = tree.xpath(layer_description['info_container_xpath']) if info_container != []: info_items = info_container[0].xpath(layer_description['info_tag_xpath']) if info_items != []: for info_item in info_items: next_layer_url_list.append(info_item.attrib['href']) print 'From second layer processor: next_url_list length %s' % len(next_layer_url_list) for url in next_layer_url_list: print url return next_layer_url_list
def fourth_layer_location_processor(url, jscript_xpaths, exceptions_log_file):
    """Extract map coordinates from the venue's /location/ page.

    Looks for a 'LatLng(<lat>,<lng>)' call inside the page's inline
    scripts and returns the comma-split argument list as strings.
    Returns [] when the coordinates cannot be found.
    """
    coordinates = list()
    page_html = get_html('%s/location/' % url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
    page_tree = etree.parse(StringIO(page_html), parser)
    script_texts = [script_node.text for script_node in page_tree.xpath(jscript_xpaths)]
    # The coordinates live in the second of exactly two matched scripts;
    # any other count means the page layout changed and we bail out.
    if len(script_texts) == 2:
        script_body = script_texts[1]
        start = script_body.find('LatLng(')
        if start != -1:
            # Slice off everything up to and including the marker, then cut
            # at the closing parenthesis.
            tail = script_body[start + len('LatLng('):]
            end = tail.find(')')
            if end != -1:
                coordinates = tail[:end].split(',')
    return coordinates
def third_layer_processor(layer_description, exceptions_log_file):
    """Collect item hrefs from every third-layer page, following the
    site's 'next' paging button.

    For each start URL the inner while loop keeps fetching pages as long
    as a next-page element with id 'ctl00_cphMain_btnNext' (an ASP.NET
    control id) is present, accumulating every info tag's href.

    Returns the hrefs as a flat list.
    """
    next_layer_url_list = list()
    i = 0  # outer iteration counter, for logging only
    for url in layer_description['url_list']:
        print 'Third layer outer cycle iteration: %s' % str(i)
        i = i + 1
        current_page_url = url
        j = 0  # inner (per-page) iteration counter, for logging only
        is_next_button = True
        while is_next_button:
            print 'Third layer inner cycle iteration: %s' % str(j)
            j = j + 1
            print 'Current page url: ' + current_page_url
            html = get_html(current_page_url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
            print 'Third layer html', html
            tree = etree.parse(StringIO(html), parser)
            info_container = tree.xpath(layer_description['info_container_xpath'])
            # Only the first matched container is inspected.
            info_items = info_container[0].xpath(layer_description['info_tag_xpath'])
            for info_item in info_items:
                next_layer_url_list.append(info_item.attrib['href'])
                print info_item.attrib['href']
            print 'Third layer, next layer url list len: %s' % str(len(next_layer_url_list))
            # Decide whether another page follows: the next button must
            # exist AND carry the expected control id.
            next_page_item = info_container[0].xpath(layer_description['next_button_xpath'])
            if next_page_item != []:
                print 'Next page item list not empty'
                print 'Next page button id ' + next_page_item[0].attrib['id']
                if next_page_item[0].attrib['id'] == 'ctl00_cphMain_btnNext':
                    is_next_button = True
                    current_page_url = next_page_item[0].attrib['href']
                    print 'Next button url:' + current_page_url
                else:
                    print 'None next button'
                    is_next_button = False
            else:
                is_next_button = False
                print 'Next page item list empty'
    return next_layer_url_list
def second_layer_single_page_processor(url, info_container_xpath, info_tag_xpath, exceptions_log_file): print 'Entry in second_layer_single_page_processor' urls_list = list() print 'second layer single page url: ', url html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) #html = get_saved_content('fundays_second_layer_requests.txt') #print 'second_layer_single_page_processor, html: ', html.encode('utf-8') tree = etree.parse(StringIO(html), parser) #info_container_xpath = '/html/body/div[@class="container"]/div[@class="wrapper"]/table/tr/td[@class="pagebg"]/table/tr/td[@class="mid_pad"]/table[@class="list_detail"]/tr/td/table' #info_tag_xpath = 'tr/td/div/table/tr/td/a' info_container = tree.xpath(info_container_xpath) #print(etree.tostring(info_container[0], pretty_print=True)) print 'second_layer_single_page_processor, info_container: ', info_container if info_container != []: pass info_items = info_container[0].xpath(info_tag_xpath)[::2] #info_items = info_container[0].xpath(info_tag_xpath) for info_item in info_items: print 'second_layer_single_page_processor, info_item.attrib[\'href\']: ', info_item.attrib['href'] #urls_set.add(info_item.attrib['href']) urls_list.append(info_item.attrib['href']) print 'second_layer_single_page_processor, urls_list: ', urls_list return urls_list
def third_layer_data_slice_processor(args):
    """Scrape full venue descriptions for one slice of third-layer URLs.

    args -- tuple of (layer_description, items_urls_and_descriptions,
    urls_list_slice, start_venue_id, exceptions_log_file); packed into one
    argument, presumably for use with a process/thread pool map -- confirm
    against the caller.

    For each URL the page is fetched and name, phone, email, website,
    address, advert text, coordinates and images are extracted; images are
    also downloaded via img_download. Returns a dict keyed by the URL's
    local part with {'categories': ..., 'description': ...} values.
    """
    layer_description, items_urls_and_descriptions, urls_list_slice, start_venue_id, exceptions_log_file = args
    result_dict = dict()
    venue_id = start_venue_id
    for url_local_part in urls_list_slice:
        url = root_site_url + '/' + url_local_part
        venue_description = dict()
        venue_description['id'] = venue_id
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        #html = get_saved_content('fundays_third_layer_requests.txt')
        parser = etree.HTMLParser(remove_blank_text=True, strip_cdata=False)
        tree = etree.parse(StringIO(html), parser)
        #tree.write('fundays_third_layer_pretty.html', pretty_print=True)
        info_container_list = tree.xpath(layer_description['info_container_xpath'])
        #print info_container_list
        #address_node = info_container_list[0].xpath('//div[@itemprop="address"]')[0]
        #print get_xpath_to_root(address_node)
        print 'info_container_list: ', info_container_list
        if info_container_list != []:
            info_container = info_container_list[0]
            # --- advert text: everything between the address node and the
            # next <h3>, including text inside <strong> children.
            address_node_container = info_container.xpath(layer_description['address_node_xpath'])
            advert_text = ''
            if address_node_container != []:
                address_node = address_node_container[0]
                #print address_node
                if address_node.tail != None:
                    advert_text = advert_text + address_node.tail + ' '
                current_advert_node = address_node.getnext()
                while(current_advert_node != None and current_advert_node.tag != 'h3'):
                    if current_advert_node.text != None:
                        advert_text = advert_text + current_advert_node.text + ' '
                    if current_advert_node.tail != None:
                        advert_text = advert_text + current_advert_node.tail + ' '
                    strong_subnode_container = current_advert_node.xpath('strong')
                    if strong_subnode_container != []:
                        for strong_subnode in strong_subnode_container:
                            if strong_subnode.text != None:
                                advert_text = advert_text + strong_subnode.text + ' '
                            if strong_subnode.tail != None:
                                advert_text = advert_text + strong_subnode.tail + ' '
                    current_advert_node = current_advert_node.getnext()
                #print advert_text
            venue_description['advert_text'] = advert_text
            # --- venue name: taken from a 'Contact <name>' header.
            contacts_header_list = info_container.xpath(layer_description['contacts_header_xpath'])
            for contacts_header in contacts_header_list:
                if 'Contact' in contacts_header.text:
                    #print contacts_header.text[len('Contact') + 1:]
                    venue_description['name'] = contacts_header.text[len('Contact') + 1:]
            # --- phone.
            phone_container = info_container.xpath(layer_description['phone_xpath'])
            if phone_container != []:
                phone = phone_container[0].text
                #print phone
                venue_description['phone'] = phone
            # --- labelled header fields (international phone, email, site,
            # location map link with '...=lat,lng' coordinates).
            info_header_tags = info_container.xpath(layer_description['info_header_tag_xpath'])
            for info_header_tag in info_header_tags:
                header_text = info_header_tag.text.strip()
                if header_text == 'International:':
                    international_phone = info_header_tag.tail.strip()
                    #print international_phone
                    venue_description['international_phone'] = international_phone
                if header_text == 'Email:':
                    email = get_next_text(info_header_tag)
                    #print email
                    venue_description['email'] = email
                if header_text == 'Website:':
                    site = get_next_href(info_header_tag)
                    #print site
                    venue_description['site'] = site
                if header_text == 'Location Map:':
                    location_link = get_next_href(info_header_tag)
                    if location_link != None:
                        # Coordinates are the query value after '=', comma
                        # separated.
                        location_coordinates = location_link[location_link.find('=') + 1:].split(',')
                        #print location_coordinates
                        venue_description['location_coordinates'] = location_coordinates
            # --- region and street address.
            region_container = info_container.xpath(layer_description['region_address_xpath'])
            if region_container != []:
                venue_description['region'] = ''
                if region_container[0].text != None:
                    region = region_container[0].text.strip()
                    venue_description['region'] = region
                    #print region
            street_address_container = info_container.xpath(layer_description['street_address_xpath'])
            if street_address_container != []:
                street_address = ''
                if street_address_container[0].text != None:
                    street_address = street_address_container[0].text.strip()
                #print street_address
                venue_description['street_address'] = street_address
            # NOTE(review): list_detail_container is computed but never used.
            list_detail_container = tree.xpath(layer_description['list_detail_xpath'])
            # --- images: collect src urls and download each one.
            img_container_list = tree.xpath(layer_description['img_xpath'])
            print 'img_container_list: ', img_container_list
            img_num = 0
            img_list = list()
            for img_container in img_container_list:
                print 'img_container: ', img_container
                if img_container != None:
                    img_url = img_container.attrib.get('src')
                    #print img_url
                    if img_url != None and img_url != '':
                        img_list.append(img_url)
                        img_file_name = '%s_%s' % (str(venue_id), str(img_num))
                        img_num = img_num + 1
                        img_download(root_site_url + img_url, img_file_name, img_dir, exceptions_log_file)
            venue_description['img_urls'] = img_list
            #items_urls_and_descriptions[url_local_part]['description'] = venue_description
            categories = items_urls_and_descriptions[url_local_part]['categories']
            #categories = {}
            #items_urls_and_descriptions[url_local_part].update({'description' : venue_description})
            #items_urls_and_descriptions[url_local_part] = {'categories' : categories, 'description' : venue_description}
            result_dict[url_local_part] = {'categories' : categories, 'description' : venue_description}
            #print venue_description
            # NOTE(review): the id only advances for successfully parsed
            # pages; pages whose container is missing are skipped entirely
            # -- confirm this matches the caller's id allocation.
            venue_id = venue_id + 1
            print start_venue_id
            print result_dict[url_local_part]
    #return items_urls_and_descriptions
    #queue.put(result_dict)
    return result_dict
def fourth_layer_data_slice_processor(args):
    """Scrape full venue descriptions for one slice of fourth-layer URLs.

    args -- tuple of (layer_description, urls_list_slice, start_venue_id,
    exceptions_log_file); packed into one argument, presumably for a
    pool map -- confirm against the caller.

    Each venue page yields name, cuisines, website, contact address/phone
    (from a separate contacts page), media URLs (downloaded via
    fourth_layer_media_processor), advert text and map coordinates.
    Returns the list of per-venue description dicts.
    """
    layer_description, urls_list_slice, start_venue_id, exceptions_log_file = args
    venue_list = list()
    venue_id = start_venue_id
    for url in urls_list_slice:
        venue_description = dict()
        venue_description['id'] = venue_id
        info_items = list()
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        tree = etree.parse(StringIO(html), parser)
        info_container = tree.xpath(layer_description['info_container_xpath'])
        if info_container != []:
            # Link to the venue's separate contacts page.
            # NOTE(review): contacts_items[0] raises IndexError when the
            # contacts xpath matches nothing -- confirm it is always present.
            contacts_items = info_container[0].xpath(layer_description['contacts_info_xpath'])
            contact_item = contacts_items[0]
            print contact_item.attrib['href']
            name_container = tree.xpath(layer_description['data_venue_name_xpath'])
            print name_container[0].text
            venue_description['name'] = name_container[0].text
            venue_description['category'] = 'food'
            # --- keyword/value data blocks (CUISINES, WEBSITE).
            data_items = info_container[0].xpath(layer_description['data_xpath'])
            for item in data_items:
                data_keyword_list = item.xpath(layer_description['data_keyword_xpath'])
                if data_keyword_list != []:
                    if data_keyword_list[0].text == 'CUISINES':
                        data_text_list = item.xpath(layer_description['data_text_xpath'])
                        print data_keyword_list[0].text
                        print data_text_list[0].tail
                        # Comma-separated list; first entry becomes the main
                        # cuisine, the rest are kept separately.
                        venue_description['cuisines'] = data_text_list[0].tail.strip().encode('utf-8').replace('\n', ' ')
                        cuisines_list = venue_description['cuisines'].split(',')
                        venue_description['cuisines_main'] = cuisines_list[0]
                        venue_description['cuisines_rest'] = cuisines_list[1:]
                    if data_keyword_list[0].text == 'WEBSITE':
                        web_text_list = item.xpath(layer_description['data_web_xpath'])
                        print data_keyword_list[0].text
                        print web_text_list[0].attrib['href']
                        venue_description['website'] = web_text_list[0].attrib['href']
            # --- contacts page: address and telephone.
            contacts_page_xpaths = layer_description['contacts_page_xpaths']
            contacts_page_html = get_html(root_site_url + contact_item.attrib['href'], user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
            contacts_page_tree = etree.parse(StringIO(contacts_page_html), parser)
            contacts_page_info_container_list = contacts_page_tree.xpath(contacts_page_xpaths['info_container_xpath'])
            if contacts_page_info_container_list != []:
                contacts_page_info_subcontainers_list = contacts_page_info_container_list[0].xpath(contacts_page_xpaths['info_subcontainer_xpath'])
                for subcontainer in contacts_page_info_subcontainers_list:
                    subcontainer_name = subcontainer.xpath(contacts_page_xpaths['subcontainer_name_xpath'])[0].text
                    subcontainer_data = subcontainer.xpath(contacts_page_xpaths['subcontainer_data_xpath'])[0].text
                    if subcontainer_name == 'Address':
                        print 'Address: ', subcontainer_data
                        venue_description['address'] = subcontainer_data
                        venue_description['address_components'] = subcontainer_data.split(', ')
                        for address_component in venue_description['address_components']:
                            print address_component
                    if subcontainer_name == 'Telephone':
                        print 'Phone: ', subcontainer_data
                        venue_description['phone'] = subcontainer_data
            # --- media and location sub-pages.
            media_url_list = fourth_layer_media_processor(url, layer_description['img_xpath'], venue_id, exceptions_log_file)
            venue_description['media_url_list'] = media_url_list
            for media_url in venue_description['media_url_list']:
                print media_url
            venue_advert_text_container = tree.xpath(layer_description['venue_advert_text_xpath'])
            if venue_advert_text_container != []:
                venue_description['venue_advert_text'] = venue_advert_text_container[0].text
                print venue_description['venue_advert_text']
            location_coordinates = fourth_layer_location_processor(url, layer_description['location_xpath'], exceptions_log_file)
            venue_description['location_coordinates'] = location_coordinates
            print location_coordinates
            venue_list.append(venue_description)
            if len(venue_list) != 0:
                print venue_list[-1]
            else:
                print 'venue_list empty'
        else:
            print 'Fourth layer info container is empty list', 'url: ' + url
        # The id advances for every URL, including pages that failed to
        # parse.
        venue_id = venue_id + 1
    return venue_list
def delete_topic(self, topic_url): try: page_source = get_html(topic_url, Admin.COOKIE_PATH).encode('utf8') tbs = re.search('tbs\:\"(.+)\"', page_source).groups()[0] kw = re.search('forum_name\:\"(.+?)\"', page_source).groups()[0] fid = re.search('fid\:\'(.+?)\'', page_source).groups()[0] tid = re.search('tid\:\'(.+?)\'', page_source).groups()[0] post_params = [ ('ie', 'utf-8'), ('tbs', tbs), ('kw', kw), ('fid', fid), ('tid', tid), ] curl = pycurl.Curl() curl.setopt(pycurl.URL, settings.TOPIC_DELETE_POINT) # Set referer curl.setopt(pycurl.REFERER, str(topic_url)) # Ignore SSL. curl.setopt(pycurl.SSL_VERIFYPEER, False) # Follow redirection. curl.setopt(pycurl.FOLLOWLOCATION, True) # Set user agent curl.setopt(pycurl.USERAGENT, settings.USER_AGENT) # POST curl.setopt(pycurl.POST, 1) curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post_params)) # Set custom header custom_header = [ 'X-Requested-With: XMLHttpRequest', 'DNT: 1', 'Accept: application/json, text/javascript, */*; q=0.01', 'Accept-Language: en-us,en;q=0.5', 'Accept-Encoding: gzip, deflate', 'Pragma: no-cache', 'Cache-Control: no-cache', 'HeaderEnd: CRLF', 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8', ] curl.setopt(pycurl.HTTPHEADER, custom_header) # Set cookie file curl.setopt(pycurl.COOKIEFILE , Admin.COOKIE_PATH) # Set content buffer content = StringIO.StringIO() curl.setopt(pycurl.WRITEFUNCTION, content.write) # Set header buffer header = StringIO.StringIO() curl.setopt(pycurl.HEADERFUNCTION, header.write) curl.perform() except Exception, e: log(unicode(traceback.format_exc())) return
def third_layer_data_slice_processor(args):
    """Scrape full venue descriptions for one slice of third-layer URLs.

    args -- tuple of (layer_description, items_urls_and_descriptions,
    urls_list_slice, start_venue_id, exceptions_log_file); packed into one
    argument, presumably for a pool map -- confirm against the caller.

    Extracts name, address, phone, email, web link, free text, social
    links, latitude/longitude and images (downloaded via img_download).
    Returns a dict keyed by the URL's local part with
    {'short_description': ..., 'full_description': ...} values.

    NOTE(review): this module contains an earlier def with the same name;
    this later definition shadows it at import time.
    """
    result_dict = dict()
    layer_description, items_urls_and_descriptions, urls_list_slice, start_venue_id, exceptions_log_file = args
    venue_id = start_venue_id
    for local_url_part in urls_list_slice:
        full_description = dict()
        full_description["id"] = venue_id
        url = root_site_url + local_url_part
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)
        contacts_container = tree.xpath(layer_description["contacts_container_xpath"])
        # --- venue name.
        name_container = tree.xpath(layer_description["name_xpath"])
        if name_container != []:
            name_node = name_container[0]
            full_description["name"] = name_node.text
        # --- contact block: address, phone, email, web.
        if contacts_container != []:
            print contacts_container
            contacts_container_node = contacts_container[0]
            address_components = contacts_container_node.xpath(layer_description["address_components_xpath"])
            address = ""
            if address_components != []:
                # Address parts are joined with single spaces.
                for address_component in address_components:
                    address = address + address_component.text + " "
                print address
                full_description["address"] = address
            phone_node_container = contacts_container_node.xpath(layer_description["phone_xpath"])
            full_description["phone"] = ""
            if phone_node_container != []:
                phone_node = phone_node_container[0]
                print phone_node.text
                full_description["phone"] = phone_node.text
            email_node_container = contacts_container_node.xpath(layer_description["email_xpath"])
            full_description["email"] = ""
            if email_node_container != []:
                email_node = email_node_container[0]
                print email_node.text
                full_description["email"] = email_node.text
            web_node_container = contacts_container_node.xpath(layer_description["web_xpath"])
            full_description["web"] = ""
            if web_node_container != []:
                web_node = web_node_container[0]
                print web_node.text
                full_description["web"] = web_node.text
        # --- free text: first node's text plus the tails of its parts.
        text = ""
        # text_node_container = contacts_container_node.xpath(layer_description['text_xpath'])
        text_node_container = tree.xpath(layer_description["text_xpath"])
        if text_node_container != []:
            text_node = text_node_container[0]
            text = text = text_node.text
            for text_part_node in text_node.xpath(layer_description["text_other_parts_xpath"]):
                if text_part_node.tail != None:
                    text = text + " " + text_part_node.tail
            print text
            full_description["text"] = text
        # --- social links.
        social_shares_container = tree.xpath(layer_description["social_shares_container_xpath"])
        if social_shares_container != []:
            social_shares_node = social_shares_container[0]
            facebook_container = social_shares_node.xpath(layer_description["facebook_xpath"])
            if facebook_container != []:
                print facebook_container[0].attrib["href"]
                full_description["facebook_link"] = facebook_container[0].attrib["href"]
            twitter_container = social_shares_node.xpath(layer_description["twitter_xpath"])
            if twitter_container != []:
                print twitter_container[0].attrib["href"]
                full_description["twitter_link"] = twitter_container[0].attrib["href"]
            google_container = social_shares_node.xpath(layer_description["google_xpath"])
            if google_container != []:
                print google_container[0].attrib["href"]
                full_description["gplus_link"] = google_container[0].attrib["href"]
        """
        location_container = tree.xpath(layer_description['location_container_xpath'])
        if location_container != []:
            location_node = location_container[0]
            latitude_container = location_node.xpath(layer_description['latitude_xpath'])
            if latitude_container != []:
                latitude_value = latitude_container[0].get('value')
                if latitude_value != None:
                    print latitude_value
                    full_description['latitude'] = latitude_value
            longitude_container = location_node.xpath(layer_description['longitude_xpath'])
            if longitude_container != []:
                longitude_value = longitude_container[0].get('value')
                if longitude_value != None:
                    print longitude_value
                    full_description['longitude'] = longitude_value
        """
        # --- coordinates from hidden form inputs ('value' attribute).
        latitude_container = tree.xpath(layer_description["latitude_xpath"])
        if latitude_container != []:
            latitude_value = latitude_container[0].get("value")
            if latitude_value != None:
                print latitude_value
                full_description["latitude"] = latitude_value
        longitude_container = tree.xpath(layer_description["longitude_xpath"])
        if longitude_container != []:
            longitude_value = longitude_container[0].get("value")
            if longitude_value != None:
                print longitude_value
                full_description["longitude"] = longitude_value
        # --- images: the 'full' attribute holds the full-size image url.
        img_container_list = tree.xpath(layer_description["img_container_xpath"])
        if img_container_list != []:
            img_container = img_container_list[0]
            img_num = 0
            img_list = list()
            for img_node in img_container.xpath(layer_description["img_node_xpath"]):
                img_url = img_node.attrib.get("full")
                print img_url
                if img_url != None:
                    if img_url != "":
                        img_list.append(img_url)
                        img_file_name = "%s_%s" % (str(venue_id), str(img_num))
                        img_num = img_num + 1
                        img_download(img_url, img_file_name, img_dir, exceptions_log_file)
            full_description["img_urls"] = img_list
        short_description = items_urls_and_descriptions[local_url_part]["short_description"]
        result_dict[local_url_part] = {"short_description": short_description, "full_description": full_description}
        # print items_urls_and_descriptions[local_url_part]
        venue_id = venue_id + 1
    return result_dict
def second_layer_processor(layer_description, exceptions_log_file):
    """Collect item URLs and short descriptions via the site's JSON API.

    For every place page, each configured info container is located and
    the id of its <input> element is dissected into a permanent id prefix
    (the element ids embed two variable components around fixed parts
    from the layer description). That prefix parameterises
    second_layer_info_container_processor, whose values are POSTed to the
    JSON endpoint (url_json / headers_json, module level) for four tabs,
    with page-by-page pagination driven by the returned total item count.

    Returns a dict mapping each item URL to
    {'short_description': [...], 'full_description': {}}.
    """
    next_layer_urls_and_descriptions = dict()
    for place_url in layer_description["items_urls_and_descriptions"]:
        url = root_site_url + place_url
        print "\n" * 3, url
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)
        for container_xpath in layer_description["info_containers_xpaths"]:
            info_container = tree.xpath(container_xpath)
            if info_container != []:
                # Reconstruct the permanent portion of the input field id:
                # <head><var1><permanent_component><var2>_
                id_attrib = info_container[0].xpath("input")[0].attrib["id"]
                variable_id_first_part = id_attrib[
                    len(layer_description["info_tag_id_head"]) : id_attrib.find(
                        layer_description["input_tag_id_permanent_component"]
                    )
                ]
                input_field_id_permanent_part = (
                    layer_description["info_tag_id_head"]
                    + variable_id_first_part
                    + layer_description["input_tag_id_permanent_component"]
                )
                id_attrib_tail = id_attrib[len(input_field_id_permanent_part) :]
                variable_id_second_part = id_attrib_tail[0 : id_attrib_tail.find("_")]
                input_field_id_permanent_part = input_field_id_permanent_part + variable_id_second_part + "_"
                for tab_num in [1, 2, 3, 4]:
                    # for tab_num in [1]:
                    values_dict = second_layer_info_container_processor(
                        info_container[0],
                        input_field_id_permanent_part,
                        layer_description["info_tag_xpathes_dict"],
                        tab_num,
                    )
                    # print values_dict
                    # First JSON request returns page 1 plus the total
                    # number of items for this tab.
                    headers_json["Referer"] = url
                    response_json = second_layer_get_json(url_json, headers_json, values_dict)
                    total_items, urls_and_short_descriptions_dict = second_layer_json_processor(response_json)
                    # print urls_and_short_descriptions_dict
                    for url, short_descriptions_list in urls_and_short_descriptions_dict.items():
                        # print url
                        if next_layer_urls_and_descriptions.has_key(url):
                            next_layer_urls_and_descriptions[url]["short_description"].extend(short_descriptions_list)
                        else:
                            next_layer_urls_and_descriptions[url] = {
                                "short_description": short_descriptions_list,
                                "full_description": {},
                            }
                    # Integer division (Python 2) gives the number of full
                    # pages; the remainder adds one partial page.
                    pages_num = total_items / page_size
                    if total_items % page_size > 0:
                        pages_num = pages_num + 1
                    # Remaining pages are requested by overriding pageIndex.
                    for page_num in range(2, pages_num + 1):
                        values_dict["pageIndex"] = str(page_num)
                        print values_dict
                        response_json = second_layer_get_json(url_json, headers_json, values_dict)
                        _, urls_and_short_descriptions_dict = second_layer_json_processor(response_json)
                        for url, short_descriptions_list in urls_and_short_descriptions_dict.items():
                            print url
                            if next_layer_urls_and_descriptions.has_key(url):
                                next_layer_urls_and_descriptions[url]["short_description"].extend(
                                    short_descriptions_list
                                )
                            else:
                                next_layer_urls_and_descriptions[url] = {
                                    "short_description": short_descriptions_list,
                                    "full_description": {},
                                }
                            for description in short_descriptions_list:
                                print description
                    # print total_items
    for url, descript in next_layer_urls_and_descriptions.items():
        print url, descript
    print len(next_layer_urls_and_descriptions.items())
    return next_layer_urls_and_descriptions