Example #1
    def parse(self, response):
        sel = Selector(response)
        locations = Locations()
        locations["restaurantIDs"] = sel.xpath('//a/@data-id').extract()
        locations["coordinates"] = {}
        locations["coordinates"]["longitude"] = self.coordinatesURLTranslator.getLongitude(response.url)
        locations["coordinates"]["latitude"] = self.coordinatesURLTranslator.getLatitude(response.url)
        return locations
Example #2
    def parse_item(self, response):
        index = response.meta['index']
        if index == 1:
            index_count = response.selector.xpath('//*[@id="m-page"]/span/text()').extract()
            index_count = [x.strip() for x in index_count if x.strip()]
            index, count = [int(x) for x in index_count[0].split('/')]
            for i in range(index + 1, count + 1):
                yield Request(url=self.get_gn_url(i), headers=TONGHUASHUN_GN_HEADER,
                              meta={'index': i},
                              callback=self.parse_item)

        trs = response.xpath('/html/body/table/tbody//tr').extract()

        try:
            for tr in trs:
                # Re-wrap each extracted <tr> fragment in its own Selector so the
                # root-level XPaths below match within this row only.
                row = Selector(text=tr)
                start_date = row.xpath('//td[1]/text()').extract_first()
                name = row.xpath('//td[2]/a/text()').extract_first()
                link = row.xpath('//td[2]/a/@href').extract_first()
                news_title = row.xpath('//td[3]/a/text()').extract_first()
                news_link = row.xpath('//td[3]/a/@href').extract_first()
                # Use the current row here, not trs[0], so leadings vary per row.
                leadings = [x.rsplit('/')[-2] for x in row.xpath('//td[4]/a/@href').extract()]
                count = row.xpath('//td[5]/text()').extract()
                yield SectorItem(id='{}_{}_{}'.format('10jqka', 'gn', name), start_date=start_date,
                                 name=name, link=link, news_title=news_title, news_link=news_link,
                                 leadings=leadings, count=count, producer='10jqka', type='gn')
        except Exception as e:
            self.logger.error('error parse 10jqka gainian sector url:{} {}'.format(response.url, e))
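A minimal sketch of the per-row pattern above, using invented sample HTML: each extracted <tr> fragment is re-wrapped in its own Selector, so root-level XPaths like //td[1] match within that row only.

from parsel import Selector

html = """
<table><tbody>
  <tr><td>2020-01-01</td><td><a href="/gn/a/">Sector A</a></td></tr>
  <tr><td>2020-01-02</td><td><a href="/gn/b/">Sector B</a></td></tr>
</tbody></table>
"""

rows = Selector(text=html).xpath('//table/tbody//tr').extract()
for row in rows:
    # A fresh Selector scoped to this fragment; '//td[1]' cannot leak into other rows.
    row_sel = Selector(text=row)
    print(row_sel.xpath('//td[1]/text()').get(),
          row_sel.xpath('//td[2]/a/text()').get())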
Example #3
    def parse_XML(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes)
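For reference, a minimal standalone sketch of the 'xml' iterator branch above, with an invented feed: register_namespace() stands in for _register_namespaces(), and '//item' for '//%s' % self.itertag.

from parsel import Selector

xml = '''<?xml version="1.0"?>
<rss xmlns:dc="http://purl.org/dc/elements/1.1/">
  <item><dc:creator>alice</dc:creator></item>
  <item><dc:creator>bob</dc:creator></item>
</rss>'''

selector = Selector(text=xml, type='xml')
selector.register_namespace('dc', 'http://purl.org/dc/elements/1.1/')
nodes = selector.xpath('//item')
print(len(nodes))                                           # 2
print(selector.xpath('//item/dc:creator/text()').getall())  # ['alice', 'bob']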
Example #4
    def detail_parse(self, response):
        page = response.meta['page']
        token = json.loads(requests.post(self.token_url,
                                         headers=self.header).text,
                           strict=False).get('d', '')
        data = copy.deepcopy(self.data)
        data.update({'Token': token, 'PageIndex': str(page)})
        list_content = json.loads(requests.post(self.list_url,
                                                headers=self.header,
                                                json=data).text,
                                  strict=False).get('d', '')
        cont_list = json.loads(list_content).get('Table', [])
        for cont in cont_list:
            result_dict = {}
            info_id = cont.get('InfoID', '')
            post_data = {
                'Token': json.loads(requests.post(self.token_url,
                                                  headers=self.header).text,
                                    strict=False).get('d', ''),
                'PageIndex': '1',
                'PageSize': '1',
                'InfoID': info_id,
            }
            detail_content = json.loads(requests.post(self.detail_url,
                                                      headers=self.header,
                                                      json=post_data).text,
                                        strict=False).get('d', '')
            detail = json.loads(detail_content, strict=False).get('Table',
                                                                  [])[0]

            result_dict['punish_code'] = detail.get('name1', '')
            result_dict['case_name'] = detail.get('name2', '')
            result_dict['punish_category_one'] = detail.get('name3', '')
            result_dict['punish_category_two'] = detail.get('name4', '')
            result_dict['punish_type'] = detail.get('name5', '')
            result_dict['punish_basis'] = detail.get('name6', '')
            result_dict['company_name'] = detail.get('name7', '')
            result_dict['credit_code'] = detail.get('name8', '')
            result_dict['organization_code'] = detail.get('name9', '')
            result_dict['regno'] = detail.get('name10', '')
            result_dict['tax_code'] = detail.get('name11', '')
            result_dict['id_number'] = detail.get('name12', '')
            result_dict['frname'] = detail.get('name13', '')
            result_dict['punish_content'] = detail.get('name14', '')
            result_dict['public_date'] = detail.get('name15', '')
            result_dict['punish_org'] = detail.get('name16', '')
            result_dict['update'] = detail.get('infodate', '')
            # Some field values arrive as HTML paragraphs; strip the markup
            # from those and keep plain values as-is.
            for key, value in result_dict.items():
                if '<p style' in value:
                    result_dict[key] = ''.join(
                        Selector(text=value).xpath('//p//text()').extract()).strip()
            yield self.handle_result(response, result_dict, info_id)
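The nested json.loads() calls above reflect an API that returns a JSON document serialized as a string under the 'd' key; a minimal sketch with an invented payload:

import json

raw = '{"d": "{\\"Table\\": [{\\"InfoID\\": \\"42\\"}]}"}'
inner = json.loads(raw, strict=False).get('d', '')
table = json.loads(inner).get('Table', [])
print(table[0]['InfoID'])  # '42'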
Example #5
    def parse_store(self, response, js):
        props = {}
        props["addr_full"] = Selector(text=js["address"]).xpath("//p/text()").get()
        props["ref"] = js["url_title"]
        props["lat"] = js["coordinates"][0]
        props["lon"] = js["coordinates"][1]
        props["city"] = js["city"]
        props["state"] = js["state"]
        props["postcode"] = js["zip"]
        props["phone"] = js["phone_number"]
        hours = response.css(".hours p:not(:empty)").xpath("text()").get()
        props["opening_hours"] = hours
        return GeojsonPointItem(**props)
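The ".hours p:not(:empty)" chain above mixes a CSS selector with a relative XPath; a minimal sketch on an invented fragment:

from parsel import Selector

html = '<div class="hours"><p></p><p>Mon-Fri 9am-5pm</p></div>'
sel = Selector(text=html)
# :not(:empty) filters out the empty <p>; .get() returns the first match's text.
print(sel.css('.hours p:not(:empty)').xpath('text()').get())  # 'Mon-Fri 9am-5pm'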
Example #6
    def parse(self, response):
        sel = Selector(response)
        restaurants = sel.xpath('//a[contains(@id, "establecimiento")]')
        for restaurant in restaurants:
            locationCsv = LocationCsv()
            locationCsv["id_restaurante"] = restaurant.css("a::attr(data-id)").extract()
            locationCsv["nombre_restaurante"] = restaurant.css("a .result-info h4::text").extract()
            locationCsv["latitud"] = self.coordinatesURLTranslator.getLatitude(response.url)
            locationCsv["longitud"] = self.coordinatesURLTranslator.getLongitude(response.url)
            yield locationCsv
Example #7
    def _validate_response(self, response: Union[Response, str]) -> bool:
        """
        Check whether the page may be indexed by the spider.

        :param response: a Response object or a raw HTML string
        :type response: Union[Response, str]
        :return: True if the page has no <meta name="ROBOTS"> tag
        :rtype: bool
        """
        if isinstance(response, str):
            response: Selector = Selector(text=response)

        response: Union[Response, Selector]
        names_in_meta: List[str] = response.xpath("/html/head/meta").xpath(
            "@name").extract()

        return "ROBOTS" not in names_in_meta
Example #8
    def parse_quick_facts(self, selector: Selector, quest: Quest):
        """
        parses the quick facts section on a wowhead quest page

        :param selector: selector of the quick facts section
        :param quest: quest item to store gathered info in
        :return:
        """
        result = selector.re(r"Start:\s(.*</a>)")
        if result:
            element = Selector(text=result[0])
            quest["npc"] = element.xpath("//a/text()").get()
            quest["npc_link"] = self.base_url + element.xpath(
                "//a/@href").get()
        else:
            quest["npc"] = "Unknown"
            quest["npc_link"] = "Unknown"
Example #9
    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="mainleft"]')
        itemlist = []
        
        for site in sites:
            item = CnkispiderItem()
            
            title = site.xpath('//*[@id="chTitle"]/text()').extract()
            # Fill the extracted values into the corresponding item fields.
            item['title'] = [t.encode('utf-8') for t in title]
            author = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[1]/a/text()').extract()
            # extract() returns a list (never None), so test for emptiness instead.
            if not author:
                author = site.xpath('//*[@id="content"]/div[1]/div[2]/p[1]/a/text()').extract()
            item['author'] = [a.encode('utf-8') for a in author]
            institution = site.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/p[3]/a/text()').extract()
            item['institution'] = [i.encode('utf-8') for i in institution]
            abstract = site.xpath('//*[@id="ChDivSummary"]/text()').extract()
            item['abstract'] = [a.encode('utf-8') for a in abstract]
            keyWord = site.xpath('//*[@id="ChDivKeyWord"]/a/text()').extract()
            item['keyWord'] = [k.encode('utf-8') for k in keyWord]
            downloadFreq = site.xpath('//*[@id="content"]/div[1]/div[5]/ul/li/text()').re(u'\s*【下载频次】(.*)')
            item['downloadFreq'] = [d.encode('utf-8') for d in downloadFreq]
            quoteFreq = site.xpath('//*[@id="rc3"]/text()').re('\W(\d+)\W')
            item['quoteFreq'] = [q.encode('utf-8') for q in quoteFreq]
            
            itemlist.append(item)
            
            # Log each appended item at INFO level.
            log.msg("Appending item...", level=log.INFO)
        # Log completion.
        log.msg("Append done.", level=log.INFO)
        return itemlist
Example #10

schema = FakeContainer(descriptors['#default'])
validate = schema._validate_and_adapt_item
_names_map = {'daft_ie': 'daft', 'patchofland': 'pol'}
ibl_extractors = {}
ibl_pages = {}
selector_pages = {}
for template_name in ('daft_ie', 'hn', 'patchofland'):
    with open('%s/data/templates/%s.html' % (_PATH, template_name)) as f:
        html_page = HtmlPage(body=f.read().decode('utf-8'))
        name = _names_map.get(template_name, template_name)
        ibl_pages[name] = html_page
        ibl_extractors[name] = SlybotIBLExtractor([(html_page, descriptors,
                                                    '0.13.0')])
        selector_pages[name] = Selector(text=html_page.body)


class TestExtractionSpeed(TestCase):
    def test_parsel_parse_and_extract(self):
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                s = Selector(text=page.body)
                extract(parsel_extractors[name], s)

    def test_slybot_parse_and_extract(self):
        for i in range(ITERATIONS):
            for name, page in ibl_pages.items():
                extraction_page = HtmlPage(body=page.body)
                ibl_extractors[name].extract(extraction_page)
Example #11
    def extractData(self, body, xpath):
        # Accept either a raw HTML string or a Response object.
        if isinstance(body, str):
            return Selector(text=body).xpath(xpath).extract()
        return Selector(response=body).xpath(xpath).extract()
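A standalone sketch of this helper for illustration (extract_data is just a renamed copy): the string branch works with any HTML text, while the response branch assumes a Scrapy Response, since Scrapy's Selector accepts response= but parsel's does not.

from scrapy.selector import Selector

def extract_data(body, xpath):
    # Raw HTML strings go through text=; Response objects through response=.
    if isinstance(body, str):
        return Selector(text=body).xpath(xpath).extract()
    return Selector(response=body).xpath(xpath).extract()

print(extract_data('<p>hello</p><p>world</p>', '//p/text()'))  # ['hello', 'world']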
Example #12
    def __init__(self, response):
        self.sel = Selector(response)