def second_layer_processor(layer_description, exceptions_log_file):
    """Walk every category page of the second layer and collect venue URLs.

    For each (category_url, category_name) pair in
    layer_description['items_urls_and_descriptions'] the category page is
    fetched, item links are extracted, and extra result pages (the
    'Go to Page:' pagination block) are followed via
    second_layer_single_page_processor.

    Returns a dict mapping each item href to
    {'categories': [category_name, ...]} (an href found under several
    categories accumulates all of their names).
    """
    next_layer_url_dict = dict()
    print 'From second_layer_processor, layer_description: ', layer_description
    info_container_xpath = layer_description['info_container_xpath']
    info_tag_xpath = layer_description['info_tag_xpath']
    for category_url, category_name in layer_description['items_urls_and_descriptions'].items():
        #for category_url in layer_description['items_urls_and_descriptions']:
        url = '%s%s' % (root_site_url, category_url)
        print 'second_layer_processor, root_site_url/category_url: ', url
        #info_items_set = set()
        #html = get_html(url, user_agent_header)
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        tree = etree.parse(StringIO(html), parser)
        info_container = tree.xpath(info_container_xpath)
        #tree.write('fundays_second_layer_pretty.html', pretty_print=True)
        if info_container != []:
            print 'second_layer_processor, info_container != []'
            # Every second matched tag is taken ([::2]); presumably the list
            # contains duplicate anchors per item -- TODO confirm.
            info_items = info_container[0].xpath(info_tag_xpath)[::2]
            for info_item in info_items:
                print 'second_layer_processor, info_item.attrib[href]: ', info_item.attrib['href']
                #info_items_set.add(info_item.attrib['href'])
                if next_layer_url_dict.has_key(info_item.attrib['href']):
                    next_layer_url_dict[info_item.attrib['href']]['categories'].append(category_name)
                else:
                    next_layer_url_dict[info_item.attrib['href']] = {'categories' : [category_name]}
        else:
            print 'second_layer_processor, info_container == []'
        print 'second_layer_processor, next_layer_url_dict: ', next_layer_url_dict
        # NOTE(review): this indexes info_container[0] even when the
        # container matched nothing (the empty branch above), which would
        # raise IndexError -- confirm against real pages.
        other_pages_list_container = info_container[0].xpath(layer_description['other_category_pages_xpath'])
        print 'second_layer_processor, other_pages_list_container: ', other_pages_list_container
        for page_container in other_pages_list_container:
            print 'second_layer_processor, page_container: ', page_container
            # Pagination block is recognised by its literal label text.
            if page_container.text != None and page_container.text.strip() == 'Go to Page:':
                print 'second_layer_processor, page_container.text.strip(): ', page_container.text.strip()
                print 'second_layer_processor, len(page_container.getchildren())', len(page_container.getchildren())
                # Page 1 is the page already scraped; extra pages start at 2
                # and are addressed as '<url>-<n>'.
                next_page_index = 2
                print 'second_layer_processor, next_page_index: ', next_page_index
                for child in page_container.getchildren()[1:]:
                    print 'second_layer_processor, next page url: %s-%s' % (url, str(next_page_index))
                    urls_from_next_page = second_layer_single_page_processor('%s-%s' % (url, str(next_page_index)), info_container_xpath, info_tag_xpath, exceptions_log_file)
                    print 'second_layer_processor, urls_from_next_page: ', urls_from_next_page
                    for url_from_next_page in urls_from_next_page:
                        print 'second_layer_processor, url_from_next_page: ', url_from_next_page
                        if next_layer_url_dict.has_key(url_from_next_page):
                            #next_layer_url_dict[url_from_next_page].append(category_name)
                            next_layer_url_dict[url_from_next_page]['categories'].append(category_name)
                        else:
                            #next_layer_url_dict[url_from_next_page] = [category_name]
                            next_layer_url_dict[url_from_next_page] = {'categories' : [category_name]}
                    #next_layer_url_set = next_layer_url_set.union(second_layer_single_page_processor('%s-%s' % (url, str(next_page_index)), info_container_xpath, info_tag_xpath))
                    next_page_index = next_page_index + 1
            else:
                # NOTE(review): message looks copy-pasted; this branch means
                # the container is not the pagination block.
                print 'info_container == []'
    return next_layer_url_dict
def get_topic_list(board_url):
    """Download a board page and parse it into a topic list.

    board_url -- utf-8 encoded byte string with the board URL.
    Returns whatever TopicParser.feed() produces, or None when the page
    could not be downloaded.
    """
    page_source = get_html(board_url.decode('utf8'))
    if page_source is None:
        # BUG FIX: the original called .replace() on page_source *before*
        # this None check, so a failed download raised AttributeError
        # instead of returning None.
        return None
    # Hack quotation marks.
    # NOTE(review): '\"' denotes the same character as '"', so this replace
    # is currently a no-op; if the intent was to backslash-escape quotes the
    # replacement should be '\\"' -- confirm before changing.
    page_source = page_source.replace('"', '\"')
    topic_parser = TopicParser()
    return topic_parser.feed(page_source)
def fourth_layer_media_processor(url, img_xpath, venue_id, exceptions_log_file):
    """Scrape the venue's /media page, download every image found by
    img_xpath, and return the list of image source URLs.

    Downloaded files are named '<venue_id>_<running index>' and stored in
    the module-level img_dir.
    """
    page_html = get_html('%s/media' % url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
    page_tree = etree.parse(StringIO(page_html), parser)
    # Each matched <img> node contributes its src attribute.
    src_urls = [img_node.attrib['src'] for img_node in page_tree.xpath(img_xpath)]
    for index, src_url in enumerate(src_urls):
        img_download(src_url, '%s_%s' % (str(venue_id), str(index)), img_dir, exceptions_log_file)
    return src_urls
def first_layer_processor(layer_description, exceptions_log_file): next_layer_url_list = list() for url in layer_description['url_list']: html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) parser = etree.HTMLParser() tree = etree.parse(StringIO(html), parser) info_container = tree.xpath(layer_description['info_container_xpath']) info_items = info_container[0].xpath(layer_description['info_tag_xpath']) for info_item in info_items: next_layer_url_list.append(info_item.attrib['href']) print 'From first layer processor: next_url_list length %s, next_url_list %s' % (len(next_layer_url_list), next_layer_url_list) return next_layer_url_list
def first_layer_processor(layer_description, exceptions_log_file): next_layer_url_dict = dict() for url in layer_description['items_urls_and_descriptions'].keys(): #html = get_html(url, user_agent_header) html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) parser = etree.HTMLParser() tree = etree.parse(StringIO(html), parser) info_container = tree.xpath(layer_description['info_container_xpath']) #print info_container #print(etree.tostring(info_container[0], pretty_print=True)) info_items = info_container[0].xpath(layer_description['info_tag_xpath']) for info_item in info_items: #next_layer_url_list.append(info_item.attrib['href']) #next_layer_url_list.append(info_item.attrib['href']) next_layer_url_dict[info_item.attrib['href']] = info_item.attrib['title'] print 'From first layer processor: next_url_list length %s, next_url_list %s' % (len(next_layer_url_dict), next_layer_url_dict) return next_layer_url_dict
def second_layer_processor(layer_description, exceptions_log_file): next_layer_url_list = list() for url in layer_description['url_list']: info_items = list() html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) tree = etree.parse(StringIO(html), parser) info_container = tree.xpath(layer_description['info_container_all_xpath']) if info_container != []: info_items = info_container[0].xpath(layer_description['info_tag_xpath']) else: info_container = tree.xpath(layer_description['info_container_xpath']) if info_container != []: info_items = info_container[0].xpath(layer_description['info_tag_xpath']) if info_items != []: for info_item in info_items: next_layer_url_list.append(info_item.attrib['href']) print 'From second layer processor: next_url_list length %s' % len(next_layer_url_list) for url in next_layer_url_list: print url return next_layer_url_list
def fourth_layer_location_processor(url, jscript_xpaths, exceptions_log_file):
    """Extract map coordinates from the venue's /location/ page.

    Looks for a 'LatLng(<lat>,<lng>)' call inside the page's inline
    scripts and returns the comma-split argument list as strings.
    Returns [] when the coordinates cannot be found.
    """
    coordinates = list()
    page_html = get_html('%s/location/' % url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
    page_tree = etree.parse(StringIO(page_html), parser)
    script_texts = [script_node.text for script_node in page_tree.xpath(jscript_xpaths)]
    # The coordinates live in the second of exactly two matched scripts;
    # any other count means the page layout changed and we bail out.
    if len(script_texts) == 2:
        script_body = script_texts[1]
        start = script_body.find('LatLng(')
        if start != -1:
            # Slice off everything up to and including the marker, then cut
            # at the closing parenthesis.
            tail = script_body[start + len('LatLng('):]
            end = tail.find(')')
            if end != -1:
                coordinates = tail[:end].split(',')
    return coordinates
def third_layer_processor(layer_description, exceptions_log_file):
    """Collect item hrefs from every third-layer page, following the
    site's 'next' paging button.

    For each start URL the inner while loop keeps fetching pages as long
    as a next-page element with id 'ctl00_cphMain_btnNext' (an ASP.NET
    control id) is present, accumulating every info tag's href.

    Returns the hrefs as a flat list.
    """
    next_layer_url_list = list()
    i = 0  # outer iteration counter, for logging only
    for url in layer_description['url_list']:
        print 'Third layer outer cycle iteration: %s' % str(i)
        i = i + 1
        current_page_url = url
        j = 0  # inner (per-page) iteration counter, for logging only
        is_next_button = True
        while is_next_button:
            print 'Third layer inner cycle iteration: %s' % str(j)
            j = j + 1
            print 'Current page url: ' + current_page_url
            html = get_html(current_page_url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
            print 'Third layer html', html
            tree = etree.parse(StringIO(html), parser)
            info_container = tree.xpath(layer_description['info_container_xpath'])
            # Only the first matched container is inspected.
            info_items = info_container[0].xpath(layer_description['info_tag_xpath'])
            for info_item in info_items:
                next_layer_url_list.append(info_item.attrib['href'])
                print info_item.attrib['href']
            print 'Third layer, next layer url list len: %s' % str(len(next_layer_url_list))
            # Decide whether another page follows: the next button must
            # exist AND carry the expected control id.
            next_page_item = info_container[0].xpath(layer_description['next_button_xpath'])
            if next_page_item != []:
                print 'Next page item list not empty'
                print 'Next page button id ' + next_page_item[0].attrib['id']
                if next_page_item[0].attrib['id'] == 'ctl00_cphMain_btnNext':
                    is_next_button = True
                    current_page_url = next_page_item[0].attrib['href']
                    print 'Next button url:' + current_page_url
                else:
                    print 'None next button'
                    is_next_button = False
            else:
                is_next_button = False
                print 'Next page item list empty'
    return next_layer_url_list
def second_layer_single_page_processor(url, info_container_xpath, info_tag_xpath, exceptions_log_file): print 'Entry in second_layer_single_page_processor' urls_list = list() print 'second layer single page url: ', url html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file) #html = get_saved_content('fundays_second_layer_requests.txt') #print 'second_layer_single_page_processor, html: ', html.encode('utf-8') tree = etree.parse(StringIO(html), parser) #info_container_xpath = '/html/body/div[@class="container"]/div[@class="wrapper"]/table/tr/td[@class="pagebg"]/table/tr/td[@class="mid_pad"]/table[@class="list_detail"]/tr/td/table' #info_tag_xpath = 'tr/td/div/table/tr/td/a' info_container = tree.xpath(info_container_xpath) #print(etree.tostring(info_container[0], pretty_print=True)) print 'second_layer_single_page_processor, info_container: ', info_container if info_container != []: pass info_items = info_container[0].xpath(info_tag_xpath)[::2] #info_items = info_container[0].xpath(info_tag_xpath) for info_item in info_items: print 'second_layer_single_page_processor, info_item.attrib[\'href\']: ', info_item.attrib['href'] #urls_set.add(info_item.attrib['href']) urls_list.append(info_item.attrib['href']) print 'second_layer_single_page_processor, urls_list: ', urls_list return urls_list
def third_layer_data_slice_processor(args):
    """Scrape full venue descriptions for one slice of third-layer URLs.

    args -- tuple of (layer_description, items_urls_and_descriptions,
    urls_list_slice, start_venue_id, exceptions_log_file); packed into one
    argument, presumably for use with a process/thread pool map -- confirm
    against the caller.

    For each URL the page is fetched and name, phone, email, website,
    address, advert text, coordinates and images are extracted; images are
    also downloaded via img_download. Returns a dict keyed by the URL's
    local part with {'categories': ..., 'description': ...} values.
    """
    layer_description, items_urls_and_descriptions, urls_list_slice, start_venue_id, exceptions_log_file = args
    result_dict = dict()
    venue_id = start_venue_id
    for url_local_part in urls_list_slice:
        url = root_site_url + '/' + url_local_part
        venue_description = dict()
        venue_description['id'] = venue_id
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        #html = get_saved_content('fundays_third_layer_requests.txt')
        parser = etree.HTMLParser(remove_blank_text=True, strip_cdata=False)
        tree = etree.parse(StringIO(html), parser)
        #tree.write('fundays_third_layer_pretty.html', pretty_print=True)
        info_container_list = tree.xpath(layer_description['info_container_xpath'])
        #print info_container_list
        #address_node = info_container_list[0].xpath('//div[@itemprop="address"]')[0]
        #print get_xpath_to_root(address_node)
        print 'info_container_list: ', info_container_list
        if info_container_list != []:
            info_container = info_container_list[0]
            # --- advert text: everything between the address node and the
            # next <h3>, including text inside <strong> children.
            address_node_container = info_container.xpath(layer_description['address_node_xpath'])
            advert_text = ''
            if address_node_container != []:
                address_node = address_node_container[0]
                #print address_node
                if address_node.tail != None:
                    advert_text = advert_text + address_node.tail + ' '
                current_advert_node = address_node.getnext()
                while(current_advert_node != None and current_advert_node.tag != 'h3'):
                    if current_advert_node.text != None:
                        advert_text = advert_text + current_advert_node.text + ' '
                    if current_advert_node.tail != None:
                        advert_text = advert_text + current_advert_node.tail + ' '
                    strong_subnode_container = current_advert_node.xpath('strong')
                    if strong_subnode_container != []:
                        for strong_subnode in strong_subnode_container:
                            if strong_subnode.text != None:
                                advert_text = advert_text + strong_subnode.text + ' '
                            if strong_subnode.tail != None:
                                advert_text = advert_text + strong_subnode.tail + ' '
                    current_advert_node = current_advert_node.getnext()
                #print advert_text
            venue_description['advert_text'] = advert_text
            # --- venue name: taken from a 'Contact <name>' header.
            contacts_header_list = info_container.xpath(layer_description['contacts_header_xpath'])
            for contacts_header in contacts_header_list:
                if 'Contact' in contacts_header.text:
                    #print contacts_header.text[len('Contact') + 1:]
                    venue_description['name'] = contacts_header.text[len('Contact') + 1:]
            # --- phone.
            phone_container = info_container.xpath(layer_description['phone_xpath'])
            if phone_container != []:
                phone = phone_container[0].text
                #print phone
                venue_description['phone'] = phone
            # --- labelled header fields (international phone, email, site,
            # location map link with '...=lat,lng' coordinates).
            info_header_tags = info_container.xpath(layer_description['info_header_tag_xpath'])
            for info_header_tag in info_header_tags:
                header_text = info_header_tag.text.strip()
                if header_text == 'International:':
                    international_phone = info_header_tag.tail.strip()
                    #print international_phone
                    venue_description['international_phone'] = international_phone
                if header_text == 'Email:':
                    email = get_next_text(info_header_tag)
                    #print email
                    venue_description['email'] = email
                if header_text == 'Website:':
                    site = get_next_href(info_header_tag)
                    #print site
                    venue_description['site'] = site
                if header_text == 'Location Map:':
                    location_link = get_next_href(info_header_tag)
                    if location_link != None:
                        # Coordinates are the query value after '=', comma
                        # separated.
                        location_coordinates = location_link[location_link.find('=') + 1:].split(',')
                        #print location_coordinates
                        venue_description['location_coordinates'] = location_coordinates
            # --- region and street address.
            region_container = info_container.xpath(layer_description['region_address_xpath'])
            if region_container != []:
                venue_description['region'] = ''
                if region_container[0].text != None:
                    region = region_container[0].text.strip()
                    venue_description['region'] = region
                    #print region
            street_address_container = info_container.xpath(layer_description['street_address_xpath'])
            if street_address_container != []:
                street_address = ''
                if street_address_container[0].text != None:
                    street_address = street_address_container[0].text.strip()
                #print street_address
                venue_description['street_address'] = street_address
            # NOTE(review): list_detail_container is computed but never used.
            list_detail_container = tree.xpath(layer_description['list_detail_xpath'])
            # --- images: collect src urls and download each one.
            img_container_list = tree.xpath(layer_description['img_xpath'])
            print 'img_container_list: ', img_container_list
            img_num = 0
            img_list = list()
            for img_container in img_container_list:
                print 'img_container: ', img_container
                if img_container != None:
                    img_url = img_container.attrib.get('src')
                    #print img_url
                    if img_url != None and img_url != '':
                        img_list.append(img_url)
                        img_file_name = '%s_%s' % (str(venue_id), str(img_num))
                        img_num = img_num + 1
                        img_download(root_site_url + img_url, img_file_name, img_dir, exceptions_log_file)
            venue_description['img_urls'] = img_list
            #items_urls_and_descriptions[url_local_part]['description'] = venue_description
            categories = items_urls_and_descriptions[url_local_part]['categories']
            #categories = {}
            #items_urls_and_descriptions[url_local_part].update({'description' : venue_description})
            #items_urls_and_descriptions[url_local_part] = {'categories' : categories, 'description' : venue_description}
            result_dict[url_local_part] = {'categories' : categories, 'description' : venue_description}
            #print venue_description
            # NOTE(review): the id only advances for successfully parsed
            # pages; pages whose container is missing are skipped entirely
            # -- confirm this matches the caller's id allocation.
            venue_id = venue_id + 1
            print start_venue_id
            print result_dict[url_local_part]
    #return items_urls_and_descriptions
    #queue.put(result_dict)
    return result_dict
def fourth_layer_data_slice_processor(args):
    """Scrape full venue descriptions for one slice of fourth-layer URLs.

    args -- tuple of (layer_description, urls_list_slice, start_venue_id,
    exceptions_log_file); packed into one argument, presumably for a
    pool map -- confirm against the caller.

    Each venue page yields name, cuisines, website, contact address/phone
    (from a separate contacts page), media URLs (downloaded via
    fourth_layer_media_processor), advert text and map coordinates.
    Returns the list of per-venue description dicts.
    """
    layer_description, urls_list_slice, start_venue_id, exceptions_log_file = args
    venue_list = list()
    venue_id = start_venue_id
    for url in urls_list_slice:
        venue_description = dict()
        venue_description['id'] = venue_id
        info_items = list()
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        tree = etree.parse(StringIO(html), parser)
        info_container = tree.xpath(layer_description['info_container_xpath'])
        if info_container != []:
            # Link to the venue's separate contacts page.
            # NOTE(review): contacts_items[0] raises IndexError when the
            # contacts xpath matches nothing -- confirm it is always present.
            contacts_items = info_container[0].xpath(layer_description['contacts_info_xpath'])
            contact_item = contacts_items[0]
            print contact_item.attrib['href']
            name_container = tree.xpath(layer_description['data_venue_name_xpath'])
            print name_container[0].text
            venue_description['name'] = name_container[0].text
            venue_description['category'] = 'food'
            # --- keyword/value data blocks (CUISINES, WEBSITE).
            data_items = info_container[0].xpath(layer_description['data_xpath'])
            for item in data_items:
                data_keyword_list = item.xpath(layer_description['data_keyword_xpath'])
                if data_keyword_list != []:
                    if data_keyword_list[0].text == 'CUISINES':
                        data_text_list = item.xpath(layer_description['data_text_xpath'])
                        print data_keyword_list[0].text
                        print data_text_list[0].tail
                        # Comma-separated list; first entry becomes the main
                        # cuisine, the rest are kept separately.
                        venue_description['cuisines'] = data_text_list[0].tail.strip().encode('utf-8').replace('\n', ' ')
                        cuisines_list = venue_description['cuisines'].split(',')
                        venue_description['cuisines_main'] = cuisines_list[0]
                        venue_description['cuisines_rest'] = cuisines_list[1:]
                    if data_keyword_list[0].text == 'WEBSITE':
                        web_text_list = item.xpath(layer_description['data_web_xpath'])
                        print data_keyword_list[0].text
                        print web_text_list[0].attrib['href']
                        venue_description['website'] = web_text_list[0].attrib['href']
            # --- contacts page: address and telephone.
            contacts_page_xpaths = layer_description['contacts_page_xpaths']
            contacts_page_html = get_html(root_site_url + contact_item.attrib['href'], user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
            contacts_page_tree = etree.parse(StringIO(contacts_page_html), parser)
            contacts_page_info_container_list = contacts_page_tree.xpath(contacts_page_xpaths['info_container_xpath'])
            if contacts_page_info_container_list != []:
                contacts_page_info_subcontainers_list = contacts_page_info_container_list[0].xpath(contacts_page_xpaths['info_subcontainer_xpath'])
                for subcontainer in contacts_page_info_subcontainers_list:
                    subcontainer_name = subcontainer.xpath(contacts_page_xpaths['subcontainer_name_xpath'])[0].text
                    subcontainer_data = subcontainer.xpath(contacts_page_xpaths['subcontainer_data_xpath'])[0].text
                    if subcontainer_name == 'Address':
                        print 'Address: ', subcontainer_data
                        venue_description['address'] = subcontainer_data
                        venue_description['address_components'] = subcontainer_data.split(', ')
                        for address_component in venue_description['address_components']:
                            print address_component
                    if subcontainer_name == 'Telephone':
                        print 'Phone: ', subcontainer_data
                        venue_description['phone'] = subcontainer_data
            # --- media and location sub-pages.
            media_url_list = fourth_layer_media_processor(url, layer_description['img_xpath'], venue_id, exceptions_log_file)
            venue_description['media_url_list'] = media_url_list
            for media_url in venue_description['media_url_list']:
                print media_url
            venue_advert_text_container = tree.xpath(layer_description['venue_advert_text_xpath'])
            if venue_advert_text_container != []:
                venue_description['venue_advert_text'] = venue_advert_text_container[0].text
                print venue_description['venue_advert_text']
            location_coordinates = fourth_layer_location_processor(url, layer_description['location_xpath'], exceptions_log_file)
            venue_description['location_coordinates'] = location_coordinates
            print location_coordinates
            venue_list.append(venue_description)
            if len(venue_list) != 0:
                print venue_list[-1]
            else:
                print 'venue_list empty'
        else:
            print 'Fourth layer info container is empty list', 'url: ' + url
        # The id advances for every URL, including pages that failed to
        # parse.
        venue_id = venue_id + 1
    return venue_list
def delete_topic(self, topic_url): try: page_source = get_html(topic_url, Admin.COOKIE_PATH).encode('utf8') tbs = re.search('tbs\:\"(.+)\"', page_source).groups()[0] kw = re.search('forum_name\:\"(.+?)\"', page_source).groups()[0] fid = re.search('fid\:\'(.+?)\'', page_source).groups()[0] tid = re.search('tid\:\'(.+?)\'', page_source).groups()[0] post_params = [ ('ie', 'utf-8'), ('tbs', tbs), ('kw', kw), ('fid', fid), ('tid', tid), ] curl = pycurl.Curl() curl.setopt(pycurl.URL, settings.TOPIC_DELETE_POINT) # Set referer curl.setopt(pycurl.REFERER, str(topic_url)) # Ignore SSL. curl.setopt(pycurl.SSL_VERIFYPEER, False) # Follow redirection. curl.setopt(pycurl.FOLLOWLOCATION, True) # Set user agent curl.setopt(pycurl.USERAGENT, settings.USER_AGENT) # POST curl.setopt(pycurl.POST, 1) curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post_params)) # Set custom header custom_header = [ 'X-Requested-With: XMLHttpRequest', 'DNT: 1', 'Accept: application/json, text/javascript, */*; q=0.01', 'Accept-Language: en-us,en;q=0.5', 'Accept-Encoding: gzip, deflate', 'Pragma: no-cache', 'Cache-Control: no-cache', 'HeaderEnd: CRLF', 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8', ] curl.setopt(pycurl.HTTPHEADER, custom_header) # Set cookie file curl.setopt(pycurl.COOKIEFILE , Admin.COOKIE_PATH) # Set content buffer content = StringIO.StringIO() curl.setopt(pycurl.WRITEFUNCTION, content.write) # Set header buffer header = StringIO.StringIO() curl.setopt(pycurl.HEADERFUNCTION, header.write) curl.perform() except Exception, e: log(unicode(traceback.format_exc())) return
def third_layer_data_slice_processor(args):
    """Scrape full venue descriptions for one slice of third-layer URLs.

    args -- tuple of (layer_description, items_urls_and_descriptions,
    urls_list_slice, start_venue_id, exceptions_log_file); packed into one
    argument, presumably for a pool map -- confirm against the caller.

    Extracts name, address, phone, email, web link, free text, social
    links, latitude/longitude and images (downloaded via img_download).
    Returns a dict keyed by the URL's local part with
    {'short_description': ..., 'full_description': ...} values.

    NOTE(review): this module contains an earlier def with the same name;
    this later definition shadows it at import time.
    """
    result_dict = dict()
    layer_description, items_urls_and_descriptions, urls_list_slice, start_venue_id, exceptions_log_file = args
    venue_id = start_venue_id
    for local_url_part in urls_list_slice:
        full_description = dict()
        full_description["id"] = venue_id
        url = root_site_url + local_url_part
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)
        contacts_container = tree.xpath(layer_description["contacts_container_xpath"])
        # --- venue name.
        name_container = tree.xpath(layer_description["name_xpath"])
        if name_container != []:
            name_node = name_container[0]
            full_description["name"] = name_node.text
        # --- contact block: address, phone, email, web.
        if contacts_container != []:
            print contacts_container
            contacts_container_node = contacts_container[0]
            address_components = contacts_container_node.xpath(layer_description["address_components_xpath"])
            address = ""
            if address_components != []:
                # Address parts are joined with single spaces.
                for address_component in address_components:
                    address = address + address_component.text + " "
                print address
                full_description["address"] = address
            phone_node_container = contacts_container_node.xpath(layer_description["phone_xpath"])
            full_description["phone"] = ""
            if phone_node_container != []:
                phone_node = phone_node_container[0]
                print phone_node.text
                full_description["phone"] = phone_node.text
            email_node_container = contacts_container_node.xpath(layer_description["email_xpath"])
            full_description["email"] = ""
            if email_node_container != []:
                email_node = email_node_container[0]
                print email_node.text
                full_description["email"] = email_node.text
            web_node_container = contacts_container_node.xpath(layer_description["web_xpath"])
            full_description["web"] = ""
            if web_node_container != []:
                web_node = web_node_container[0]
                print web_node.text
                full_description["web"] = web_node.text
        # --- free text: first node's text plus the tails of its parts.
        text = ""
        # text_node_container = contacts_container_node.xpath(layer_description['text_xpath'])
        text_node_container = tree.xpath(layer_description["text_xpath"])
        if text_node_container != []:
            text_node = text_node_container[0]
            text = text = text_node.text
            for text_part_node in text_node.xpath(layer_description["text_other_parts_xpath"]):
                if text_part_node.tail != None:
                    text = text + " " + text_part_node.tail
            print text
            full_description["text"] = text
        # --- social links.
        social_shares_container = tree.xpath(layer_description["social_shares_container_xpath"])
        if social_shares_container != []:
            social_shares_node = social_shares_container[0]
            facebook_container = social_shares_node.xpath(layer_description["facebook_xpath"])
            if facebook_container != []:
                print facebook_container[0].attrib["href"]
                full_description["facebook_link"] = facebook_container[0].attrib["href"]
            twitter_container = social_shares_node.xpath(layer_description["twitter_xpath"])
            if twitter_container != []:
                print twitter_container[0].attrib["href"]
                full_description["twitter_link"] = twitter_container[0].attrib["href"]
            google_container = social_shares_node.xpath(layer_description["google_xpath"])
            if google_container != []:
                print google_container[0].attrib["href"]
                full_description["gplus_link"] = google_container[0].attrib["href"]
        """
        location_container = tree.xpath(layer_description['location_container_xpath'])
        if location_container != []:
            location_node = location_container[0]
            latitude_container = location_node.xpath(layer_description['latitude_xpath'])
            if latitude_container != []:
                latitude_value = latitude_container[0].get('value')
                if latitude_value != None:
                    print latitude_value
                    full_description['latitude'] = latitude_value
            longitude_container = location_node.xpath(layer_description['longitude_xpath'])
            if longitude_container != []:
                longitude_value = longitude_container[0].get('value')
                if longitude_value != None:
                    print longitude_value
                    full_description['longitude'] = longitude_value
        """
        # --- coordinates from hidden form inputs ('value' attribute).
        latitude_container = tree.xpath(layer_description["latitude_xpath"])
        if latitude_container != []:
            latitude_value = latitude_container[0].get("value")
            if latitude_value != None:
                print latitude_value
                full_description["latitude"] = latitude_value
        longitude_container = tree.xpath(layer_description["longitude_xpath"])
        if longitude_container != []:
            longitude_value = longitude_container[0].get("value")
            if longitude_value != None:
                print longitude_value
                full_description["longitude"] = longitude_value
        # --- images: the 'full' attribute holds the full-size image url.
        img_container_list = tree.xpath(layer_description["img_container_xpath"])
        if img_container_list != []:
            img_container = img_container_list[0]
            img_num = 0
            img_list = list()
            for img_node in img_container.xpath(layer_description["img_node_xpath"]):
                img_url = img_node.attrib.get("full")
                print img_url
                if img_url != None:
                    if img_url != "":
                        img_list.append(img_url)
                        img_file_name = "%s_%s" % (str(venue_id), str(img_num))
                        img_num = img_num + 1
                        img_download(img_url, img_file_name, img_dir, exceptions_log_file)
            full_description["img_urls"] = img_list
        short_description = items_urls_and_descriptions[local_url_part]["short_description"]
        result_dict[local_url_part] = {"short_description": short_description, "full_description": full_description}
        # print items_urls_and_descriptions[local_url_part]
        venue_id = venue_id + 1
    return result_dict
def second_layer_processor(layer_description, exceptions_log_file):
    """Collect item URLs and short descriptions via the site's JSON API.

    For every place page, each configured info container is located and
    the id of its <input> element is dissected into a permanent id prefix
    (the element ids embed two variable components around fixed parts
    from the layer description). That prefix parameterises
    second_layer_info_container_processor, whose values are POSTed to the
    JSON endpoint (url_json / headers_json, module level) for four tabs,
    with page-by-page pagination driven by the returned total item count.

    Returns a dict mapping each item URL to
    {'short_description': [...], 'full_description': {}}.
    """
    next_layer_urls_and_descriptions = dict()
    for place_url in layer_description["items_urls_and_descriptions"]:
        url = root_site_url + place_url
        print "\n" * 3, url
        html = get_html(url, user_agent_header, post_data={}, exceptions_log_file=exceptions_log_file)
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(html), parser)
        for container_xpath in layer_description["info_containers_xpaths"]:
            info_container = tree.xpath(container_xpath)
            if info_container != []:
                # Reconstruct the permanent portion of the input field id:
                # <head><var1><permanent_component><var2>_
                id_attrib = info_container[0].xpath("input")[0].attrib["id"]
                variable_id_first_part = id_attrib[
                    len(layer_description["info_tag_id_head"]) : id_attrib.find(
                        layer_description["input_tag_id_permanent_component"]
                    )
                ]
                input_field_id_permanent_part = (
                    layer_description["info_tag_id_head"]
                    + variable_id_first_part
                    + layer_description["input_tag_id_permanent_component"]
                )
                id_attrib_tail = id_attrib[len(input_field_id_permanent_part) :]
                variable_id_second_part = id_attrib_tail[0 : id_attrib_tail.find("_")]
                input_field_id_permanent_part = input_field_id_permanent_part + variable_id_second_part + "_"
                for tab_num in [1, 2, 3, 4]:
                    # for tab_num in [1]:
                    values_dict = second_layer_info_container_processor(
                        info_container[0],
                        input_field_id_permanent_part,
                        layer_description["info_tag_xpathes_dict"],
                        tab_num,
                    )
                    # print values_dict
                    # First JSON request returns page 1 plus the total
                    # number of items for this tab.
                    headers_json["Referer"] = url
                    response_json = second_layer_get_json(url_json, headers_json, values_dict)
                    total_items, urls_and_short_descriptions_dict = second_layer_json_processor(response_json)
                    # print urls_and_short_descriptions_dict
                    for url, short_descriptions_list in urls_and_short_descriptions_dict.items():
                        # print url
                        if next_layer_urls_and_descriptions.has_key(url):
                            next_layer_urls_and_descriptions[url]["short_description"].extend(short_descriptions_list)
                        else:
                            next_layer_urls_and_descriptions[url] = {
                                "short_description": short_descriptions_list,
                                "full_description": {},
                            }
                    # Integer division (Python 2) gives the number of full
                    # pages; the remainder adds one partial page.
                    pages_num = total_items / page_size
                    if total_items % page_size > 0:
                        pages_num = pages_num + 1
                    # Remaining pages are requested by overriding pageIndex.
                    for page_num in range(2, pages_num + 1):
                        values_dict["pageIndex"] = str(page_num)
                        print values_dict
                        response_json = second_layer_get_json(url_json, headers_json, values_dict)
                        _, urls_and_short_descriptions_dict = second_layer_json_processor(response_json)
                        for url, short_descriptions_list in urls_and_short_descriptions_dict.items():
                            print url
                            if next_layer_urls_and_descriptions.has_key(url):
                                next_layer_urls_and_descriptions[url]["short_description"].extend(
                                    short_descriptions_list
                                )
                            else:
                                next_layer_urls_and_descriptions[url] = {
                                    "short_description": short_descriptions_list,
                                    "full_description": {},
                                }
                            for description in short_descriptions_list:
                                print description
                    # print total_items
    for url, descript in next_layer_urls_and_descriptions.items():
        print url, descript
    print len(next_layer_urls_and_descriptions.items())
    return next_layer_urls_and_descriptions