Example No. 1
    def parse(self, response):
        # Render the page with Selenium so JS-populated counters are present.
        self.driver.get(response.url)
        # self.driver.implicitly_wait(30)
        selector = Selector(text=self.driver.page_source)

        for sel in selector.xpath('//*[@id="J_Counter"]'):
            item = TbItem()
            # Swap ASCII commas for full-width ones and newlines for '。' so
            # the fields stay safe for comma-delimited output. Passing '' as
            # the default to extract_first() avoids an AttributeError when an
            # XPath matches nothing.
            #item['item_name'] = sel.xpath('//*[@id="J_Title"]/h3/text()').extract_first()
            item['item_name'] = sel.xpath(
                '//*[@id="J_Title"]/h3/@data-title').extract_first(
                    '').replace(",", ",").replace("\n", "。")
            # The item id comes from the 'id' query parameter of the URL.
            item['item_id'] = parse_qs(urlparse(response.url).query,
                                       True)['id'][0].replace(
                                           ",", ",").replace("\n", "。")
            item['comments'] = sel.xpath(
                '//*[@id="J_RateCounter"]/text()').extract_first(
                    '').replace(",", ",").replace("\n", "。")
            item['trade'] = sel.xpath(
                '//*[@id="J_SellCounter"]/text()').extract_first(
                    '').replace(",", ",").replace("\n", "。")
            item['price'] = sel.xpath(
                '//*[@id="J_StrPrice"]/em[2]/text()').extract_first(
                    '').replace(",", ",").replace("\n", "。")
            yield item
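This parse() is a method on a Scrapy spider that shares state with a Selenium driver. A minimal sketch of the surrounding class, assuming headless Chrome; the spider name and start URL are placeholders invented for illustration, while the TbItem fields mirror the keys populated above:

import scrapy
from scrapy import Field, Item, Selector
from urllib.parse import parse_qs, urlparse
from selenium import webdriver


class TbItem(Item):
    # Field names match the keys populated in parse() above.
    item_name = Field()
    item_id = Field()
    comments = Field()
    trade = Field()
    price = Field()


class TbSpider(scrapy.Spider):
    name = 'tb'  # hypothetical spider name
    start_urls = ['https://item.taobao.com/item.htm?id=1']  # placeholder URL

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Headless Chrome renders the JS-driven counters before parsing.
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(options=options)

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; release the browser.
        self.driver.quit()

    # parse() from the example above goes here.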
Example No. 2
import requests
import pymysql
from scrapy import Selector
from tqdm import tqdm


def crawl_ips():
    headers = {
        "user-agent":
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1"
    }
    for i in range(1, 2):  # number of pages to crawl; only page 1 of the Xici proxy list here
        response = requests.get("http://www.xicidaili.com/nn/{0}".format(i),
                                headers=headers)
        selector = Selector(text=response.text)
        # position()>1 skips the header row of the proxy table.
        all_trs = selector.xpath('//table[@id="ip_list"]//tr[position()>1]')
        ip_list = []
        for tr in all_trs:
            # The speed column holds its value in a title attribute like "0.5秒".
            speed = tr.xpath(
                "./td[@class='country'][3]//@title").extract()[0].split('秒')[0]
            ip = tr.xpath("./td[2]/text()").extract()[0]
            port = tr.xpath("./td[3]/text()").extract()[0]
            proxy_type = tr.xpath("./td[6]/text()").extract()[0]
            ip_list.append((ip, port, speed, proxy_type))

            # Skip per-IP validity checks for now; validate in bulk after
            # storing (better for throughput).
            # if judge2_ip(ip, port):
            #     ip_list.append((ip, port, speed, proxy_type))
            # else:
            #     pass

        # Option 1: write ip and port to a txt file ('w' truncates any old file).
        with open('ips.txt', 'w', encoding='utf-8') as f:
            for ip_info in tqdm(ip_list, desc='saving to txt', leave=True):
                f.write(ip_info[0] + ':' + ip_info[1] + '\n')
        # Option 2: store ip and port in the target table of a MySQL database.
        conn = pymysql.connect(host='localhost',
                               port=3306,
                               user='******',
                               passwd='123wangchao',
                               charset='utf8',
                               db='proxy_pool')
        cursor = conn.cursor()
        # Clear the existing table first.
        cursor.execute('truncate table proxy_ip')
        conn.commit()
        # A parameterized query avoids the SQL injection risk of string
        # formatting, and one commit covers the whole batch.
        insert_sql = '''
                insert into proxy_ip(ip, port, speed, proxy_type)
                values(%s, %s, %s, %s)'''
        for ip_info in tqdm(ip_list, desc='saving to mysql', leave=True):
            cursor.execute(insert_sql, ip_info)
        conn.commit()
        cursor.close()
        conn.close()
        print('**********************data collection complete**********************')
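The comments above postpone proxy validation until after storage. A bulk check in that spirit could look like the sketch below; the helper name, test URL, and timeout are assumptions, and this is not the original judge2_ip:

import requests


def judge_ip_alive(ip, port, timeout=5):
    """Hypothetical check: does the proxy answer a simple HTTP request?"""
    proxy = 'http://{0}:{1}'.format(ip, port)
    proxies = {'http': proxy, 'https': proxy}
    try:
        resp = requests.get('http://httpbin.org/ip',
                            proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False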
Example No. 3
    def _parse_description(self):
        """
        Parse the full description from the bug detail page

        :returns: bug description string
        """
        # NOTE: Using the combination of text/type makes for better testing.
        # Selector(text=...) expects a str, so use response.text rather than
        # the raw bytes of response.body.
        selector = scrapy.selector.Selector(
            text=self.response.text, type='html')

        # Collect every text node under the description <pre>, which yields
        # the full item description with all HTML tags discarded
        xpath = ('//div[contains(@class, "issuedescription")]/'
                 'pre/descendant-or-self::*/text()')
        desc = ''.join(selector.xpath(xpath).extract())

        # Trim stray leading/trailing newlines
        return desc.strip('\n')
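The descendant-or-self::*/text() step is what flattens the markup: it gathers every text node inside the <pre>, so tags vanish but their text survives. A quick standalone check, with the sample HTML invented for illustration:

import scrapy.selector

html = ('<div class="issuedescription"><pre>Crash on save\n'
        'see the <a href="#">attached log</a> for details</pre></div>')
selector = scrapy.selector.Selector(text=html, type='html')
xpath = ('//div[contains(@class, "issuedescription")]/'
         'pre/descendant-or-self::*/text()')
print(''.join(selector.xpath(xpath).extract()))
# Crash on save
# see the attached log for details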
Example No. 4
    def parse_animal_search_criteria(self, response):
        # Map each label in the search-criteria dropdown to its numeric value.
        selector = scrapy.selector.Selector(response=response)
        criteria_xpath = '//select[@id="cphSearchArea_ctrlAnimal_ctrlAnimalSearch_ddlCriteria"]/option'
        options = selector.xpath(criteria_xpath)
        return {
            option.xpath('text()').extract_first():
            int(option.xpath('@value').extract_first())
            for option in options
        }
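Because the method only needs a response object, a scrapy HtmlResponse built from sample markup is enough to exercise it in isolation; the markup, URL, and the `spider` instance below are assumptions for illustration:

from scrapy.http import HtmlResponse

body = b'''
<select id="cphSearchArea_ctrlAnimal_ctrlAnimalSearch_ddlCriteria">
    <option value="1">Breed</option>
    <option value="2">Colour</option>
</select>'''
response = HtmlResponse(url='http://example.com/search', body=body)
# 'spider' stands in for an instance of the class defining the method.
print(spider.parse_animal_search_criteria(response))
# {'Breed': 1, 'Colour': 2}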