Example #1
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@id='vmMainPage']/table//tr/td/div/h2",
    'price': "//span[@class='productPrice']/b",
    'category': "",
    'description': "",
    'images':
    "//div[@class='flexible-zoom-additionalImages']/a/@href | //div[@id='ja-current-content']/div[@id='vmMainPage']//a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'format.vn'
allowed_domains = ['format.vn']
start_urls = ['http://format.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+-\d+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z-]+\.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
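
The auto-generated module above only declares configuration; the generator's base spider (not shown) is expected to pick these values up. A minimal sketch of how such a spider might consume them follows; the GenericSpider name and the field handling are assumptions, not part of the generated file.

from scrapy.spiders import CrawlSpider

class GenericSpider(CrawlSpider):
    # Reuse the module-level settings declared above as class attributes.
    name = name
    allowed_domains = allowed_domains
    start_urls = start_urls
    rules = rules

    def parse_item(self, response):
        # Apply every non-empty XPath from the XPATH dict to a product page.
        yield {field: response.xpath(xp).getall()
               for field, xp in XPATH.items() if xp}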
Example #2
class WeiboRedisSpider(RedisCrawlSpider):
    name = 'weibo_redis'
    allowed_domains = ['weibo.com', 'sina.com.cn']
    redis_key = 'WeiboRedisSpider:start_urls'
    page_links = LinkExtractor(
        restrict_xpaths='//div[@class="m-page"]//ul[@class="s-scroll"]/li/a')
    rules = (Rule(page_links, callback='parse_subjects', follow=True), )

    # Spider-specific settings
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'weibo_spider.pipelines.MyMongoPipeline': 302,
            'scrapy_redis.pipelines.RedisPipeline': 400
        },
        'DOWNLOADER_MIDDLEWARES': {
            'weibo_spider.middlewares.CookiesMiddleware': 544,
            'weibo_spider.middlewares.ProxyMiddleware': 545
        },
        'REDIS_HOST': '127.0.0.1',
        'REDIS_PORT': 6379,
        'DUPEFILTER_CLASS': 'scrapy_redis.dupefilter.RFPDupeFilter',
        'SCHEDULER': 'scrapy_redis.scheduler.Scheduler',
        'SCHEDULER_PERSIST': True,
        # Use the cookie pool
        'COOKIES_URL': 'http://192.168.199.233:5000/weibo/random'
    }

    def __init__(self, subject=None, *args, **kwargs):
        super(WeiboRedisSpider, self).__init__(*args, **kwargs)
        self.subject = subject

    def parse_subjects(self, response):
        div_list = response.xpath(
            '//div[@class="m-con-l"]/div/div[@class="card-wrap"]')
        for odiv in div_list:
            item = WeiboSpiderItem()
            item['title'] = odiv.xpath(
                './div[@class="card-top"]//a/text()').extract_first()
            if item['title']:
                item['title'] = item['title'].strip()
            item['avatar'] = odiv.xpath(
                './div[@class="card"]//div[@class="avator"]/a/img/@src'
            ).extract_first()
            item['nickname'] = odiv.xpath(
                './div[@class="card"]//div[@class="content"]/div[@class="info"]/div[2]/a[1]/text()'
            ).extract_first()
            item['icon'] = odiv.xpath(
                './div[@class="card"]//div[@class="content"]/div[@class="info"]/div[2]/a[2]/@title'
            ).extract_first()
            news = odiv.xpath(
                './div[@class="card"]//div[@class="content"]/p[@class="txt"]')
            if len(news) == 1:
                item['news'] = news[0].xpath(
                    'string(.)').extract_first().replace('\n', '').replace(
                        ' ', '').replace('\u200b', '').replace('收起全文d', '')
            else:
                item['news'] = news[1].xpath(
                    'string(.)').extract_first().replace('\n', '').replace(
                        ' ', '').replace('\u200b', '').replace('收起全文d', '')
            time = odiv.xpath(
                './div[@class="card"]//div[@class="content"]/p[@class="from"]/a[1]/text()'
            ).extract_first()
            now = datetime.now()
            if '秒' in time:
                time = datetime.strftime(
                    now - timedelta(seconds=int(time.split('秒')[0])),
                    '%Y-%m-%d %H:%M:%S')
            elif '分钟' in time:
                time = datetime.strftime(
                    now - timedelta(minutes=int(time.split('分钟')[0])),
                    '%Y-%m-%d %H:%M:%S')
            elif '今天' in time:
                today = re.findall(r'\d+', time)
                time = str(
                    datetime(now.year,
                             now.month,
                             now.day,
                             hour=int(today[0]),
                             minute=int(today[1]),
                             second=0))
            else:
                date = re.findall(r'\d+', time)
                time = str(
                    datetime(now.year,
                             month=int(date[0]),
                             day=int(date[1]),
                             hour=int(date[2]),
                             minute=int(date[3]),
                             second=0))
            item['time'] = time
            item['origin'] = odiv.xpath(
                './div[@class="card"]//div[@class="content"]/p[@class="from"]/a[2]/text()'
            ).extract_first()
            item['collect'] = card_act_int(
                odiv.xpath(
                    './div[@class="card"]/div[@class="card-act"]/ul/li[1]/a/text()'
                ).extract_first().strip().split(' '))
            item['forward'] = card_act_int(
                odiv.xpath(
                    './div[@class="card"]/div[@class="card-act"]/ul/li[2]/a/text()'
                ).extract_first().strip().split(' '))
            item['comment'] = card_act_int(
                odiv.xpath(
                    './div[@class="card"]/div[@class="card-act"]/ul/li[3]/a/text()'
                ).extract_first().strip().split(' '))
            item['like'] = card_act_int(
                odiv.xpath(
                    './div[@class="card"]/div[@class="card-act"]/ul/li[4]/a/em/text()'
                ).extract_first())
            yield item
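
card_act_int is used above but never defined in the snippet; it apparently normalises the repost/comment/like counters, which arrive either as a split list such as ['转发', '128'] or as a bare string. A minimal sketch of such a helper follows; it is an assumption, not the original implementation.

def card_act_int(value):
    # Hypothetical helper: take the trailing token of a card-act cell and
    # return it as an int, treating missing or non-numeric counts as 0.
    if isinstance(value, (list, tuple)):
        value = value[-1] if value else ''
    value = (value or '').strip()
    return int(value) if value.isdigit() else 0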
Example #3
class TeamS01pider(CrawlSpider):
    name = 'Team01'
    allowed_domains = ["soccer.hupu.com"]
    start_urls = ['https://soccer.hupu.com/teams/']

    rules = (
        Rule(LinkExtractor(allow=(r'https://soccer.hupu.com/teams/121',)), callback='parse_team01'),
        #Rule(LinkExtractor(allow=(r'https://soccer.hupu.com/g/players/[a-z]-\d.html"',)),callback='parse_player'),
    )

    def parse_team01(self,response):
        tempItems = []

        item = items.TeamItem()

        # Collect player detail page URLs
        playerUrls = response.selector.xpath('//table[@class="team_player"]/tr/td/a/@href').extract()
        next_links = []
        for url in playerUrls:
            next_links.append(url)

        parent = response.selector.xpath('//div[@class="team_info left"]')[0]

        name = parent.xpath('h3/span/text()').extract()[0]
        if len(name) > 0:
            cnname = re.findall(r'[\u4e00-\u9fa5]+', name)
            enname = re.findall(r'[^\u4e00-\u9fa5]+', name)
            item['TeamENName'] = enname[0].strip() if len(enname) > 0 else ''
            item['TeamCNName'] = cnname[0].strip() if len(cnname) > 0 else ''

        Coach = parent.xpath('//dl[@class="clearfix"]/dd[1]/text()').extract()[0].split(':')
        item['CoachName'] = Coach[1].strip() if len(Coach) > 1 else ''
        City = parent.xpath('//dl[@class="clearfix"]/dd[3]/text()').extract()[0].split(':')
        item['City'] = City[1].strip() if len(City) > 1 else ''
        Court = parent.xpath('ul[2]/li[1]/text()').extract()[0].split(':')
        item['CourtName'] = Court[1].strip() if len(Court) > 1 else ''

        image = parent.xpath('ul[1]/li[@class=" left pic_logo"]/img/@src').extract()
        if len(image) > 0:
            item['ImageUrl'] = 'https:' + image[0]
        item['id'] = 0
        item['Remark'] = ''
        yield item
        for url in next_links:
            yield Request(url, callback=lambda response,teamitem=item:self.parse_player(response,teamitem))

    def parse_player(self,response,teamItem):

        item = items.PlayerItem()
        parent=response.selector.xpath('//ul[@class="player_detail"]')
        CNName=parent.xpath('li[@class="center"]/b[1]/text()').extract()
        item['CNName']=self.checkFirstStr(CNName)

        ENName=parent.xpath('li[@class="center"]/b[2]/text()').extract()
        item['ENName'] = self.checkFirstStr(ENName)

        CountryName=parent.xpath('li[@class="center"]/span[1]/text()').extract()
        item['CountryName'] = self.checkFirstStr(CountryName)

        Birthday = parent.xpath('li[@class="center"]/span[2]/text()').extract()
        item['Birthday'] = self.checkFirstStr(Birthday,'\d{4}-\d{2}-\d{2}')

        BodyWeight=parent.xpath('li[@class="center"]/span[3]/text()').extract()
        item['BodyWeight'] = self.checkFirstStr(BodyWeight,'\d+')

        Height=parent.xpath('li[3]/span[1]/text()').extract()
        item['Height'] = self.checkFirstStr(Height,'\d+')

        TeamName = parent.xpath('li[3]/span[2]/a/text()').extract()
        item['TeamName'] = self.checkFirstStr(TeamName)

        Position = parent.xpath('li[3]/span[3]/text()').extract()
        item['Position'] = self.checkFirstStr(Position)

        Number=parent.xpath('li[3]/span[4]/text()').extract()
        item['Number'] = Number[0] if len(Number) > 0 else ''

        item['TeamId']=teamItem['id']
        item['TeamName']=teamItem['TeamCNName']

        image = parent.xpath('li[1]/img/@src').extract()
        if len(image) > 0:
            item['ImageUrl'] = 'https:' + image[0]
        yield item

    def checkFirstStr(self, values, reg=None):
        if len(values) <= 0:
            return ''
        if reg is None:
            return values[0]
        matches = re.findall(reg, values[0])
        return matches[0].strip() if len(matches) > 0 else ''
Example #4

    def parse(self, response):

        matched_domain = [x for x in self.allowed_domains if x in response.url]
        if len(matched_domain) > 0:
            domain = matched_domain[0].split('.')[0]

            folder_name = 'Crawled_Data/' + domain.capitalize(
            ) + '_University_Files'

            self.record[domain] = self.record.get(domain, 0) + 1

            if self.record[domain] % 50 == 0:
                print('\n Crawled {} Bio-pages of {} University ...'.format(
                    self.record[domain], domain.capitalize()))
                self.tree.save2file(folder_name + "/00__" +
                                    str(self.record[domain]) + "_tree.txt")

            isBio = self.bio_identifier.is_bio_html_content(
                response.xpath('//*').get())

            if isBio:
                text = BeautifulSoup(response.xpath('//*').get(),
                                     features="html.parser").get_text()
                tokens = nltk.word_tokenize(text)
                normalized_text = ' '.join(
                    [word for word in tokens if word.isalnum()])
                normalized_text += '\n' + response.url

                hash_text = hashlib.md5(response.url.encode())
                file_name = hash_text.hexdigest()

                with open(folder_name + "/" + file_name + ".txt",
                          "w",
                          encoding="utf-8") as file:
                    file.write(normalized_text)

            AllLinks = LinkExtractor(allow_domains=domain + '.edu',
                                     unique=True).extract_links(response)

            for n, link in enumerate(AllLinks):
                if not any([x in link.url for x in self.exclude_words]):
                    if self.tree.get_node(link.url) == None:
                        referer = response.request.headers.get('Referer', None)

                        if referer == None:
                            self.tree.create_node(link.url,
                                                  link.url,
                                                  parent='root')
                        else:
                            referer = referer.decode("utf-8")
                            if self.tree.contains(referer):

                                self.tree.create_node(link.url,
                                                      link.url,
                                                      parent=referer)
                            else:
                                self.tree.create_node(link.url,
                                                      link.url,
                                                      parent='unknown')

                        yield scrapy.Request(url=link.url, callback=self.parse)
Example #5
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='product_info']/h1",
    'price' : "//div[@class='price-box']/span[@class='regular-price']/span[@class='price']",
    'category' : "//div[@class='breadcrumb']/a",
    'description' : "//div[@class='tab-content']//div[@class='info']",
    'images' : "//img[@id='image']/@src",
    'canonical' : "//link[@rel='canonical']/@href",
    'base_url' : "",
    'brand' : ""
}
name = 'cafe.net.vn'
allowed_domains = ['cafe.net.vn']
start_urls = ['http://www.cafe.net.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    #Rule(LinkExtractor(), 'parse_item'),
    #Rule(LinkExtractor(), 'parse'),
    Rule(LinkExtractor(allow=['/en/[a-zA-Z0-9-]+\.html']), 'parse_item_and_links'),
]
Example #6
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='box_menu_top itemHeader']/h2[@class='itemTitle']",
    'price': "//div[@class='itemImageBlock']/form/span",
    'category': "",
    'description': "//div[@class='itemBody']/div[@class='itemFullText']/ul[2]",
    'images':
    "//div[@class='itemImageBlock']/span[@class='itemImage']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'goquymynghe.com'
allowed_domains = ['goquymynghe.com']
start_urls = ['http://goquymynghe.com']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/item/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/.*']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #7
class DoctorlistSpider(RedisCrawlSpider):
    name = 'doctorList'
    allowed_domains = ['ask.39.net']

    # scrapy-redis
    redis_key = 'myspider:start_urls'
    # Crawl rules
    rules = (
        Rule(LinkExtractor(allow=r'question/(\d+).html'),
             callback='parse_item',
             follow=True),
        # Rule(LinkExtractor(allow=r'http://my.39.net/wulinqing'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print('response.url===========', response.url)
        item = DoctorItem()
        div = response.xpath(
            '//div[@class="sele_all marg_top"] | //div[@class="sele_all"]')
        for dd in div:
            # Doctor's name
            name = dd.xpath(
                './div[1]/div[2]/p[@class="doc_xinx"]/span[1]/text()').extract(
                )
            item['name'] = ''.join(name)
            # Doctor's level
            level = dd.xpath(
                './div[1]/div[2]/p[@class="doc_xinx"]/span[2]/text()').extract(
                )
            item['level'] = ''.join(level)
            # Workplace
            company = dd.xpath(
                './div[1]/div[2]/p[@class="doc_xinx"]/span[3]/text()').extract(
                )
            # Areas of expertise
            good = dd.xpath(
                './div[1]/div[2]/p[@class="doc_sc"]/span/text()').extract()
            if len(div.xpath(".//span[@class='doc_yshi']/text()")) > 1:
                # Hospital
                item['company'] = ''.join(company)
            else:
                item['company'] = ''
            if len(div.xpath(".//p[@class='doc_sc']/span/text()")) > 0:
                # Specialties
                item['good'] = ''.join(good)
            else:
                item['good'] = ''
            # Answer content
            detail = dd.xpath('./p/text()').extract()
            item['detail'] = ''.join(detail)
            # Answer time
            time = dd.xpath(
                './div[@class="doc_t_strip"]/div[1]/p/text()').extract()
            item['time'] = ''.join(time[0])
            # Number of people helped
            pid = dd.xpath('.//div[@class="doctor_all"]/@mid').extract()[0]
            json_url = 'http://askdatafmy.39.net/home/askapi.ashx?callback=jQuery172033868943235912363_1539677691886&action=doctorTopicCount&pid=' + pid
            item['helpNum'] = json.loads(
                requests.get(json_url).text.split("()")[0].split("(")[1].split(
                    ")")[0])["data"]["all"]
            item['link'] = response.url
            yield item
Example #8
class uyghurcongress(RedisCrawlSpider):
    name = 'uyghurcongress'
    start_urls = ['http://www.uyghurcongress.org/']
    # redis_key='middleway:urls'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    }

    rules = (Rule(LinkExtractor(
        allow='http\:\/\/www\.uyghurcongress\.org\/en\/\?p\=\d*', ),
                  callback='parse_content',
                  follow=True),
             Rule(LinkExtractor(
                 allow='http\:\/\/www\.uyghurcongress\.org\/.*', ),
                  follow=True))

    def parse_content(self, response):
        print('in parseMore')

        def deal_publish_time(publish_time_list=[]):
            if not publish_time_list:
                print('time is None')
                return None
            mouthstr = publish_time_list[0]

            mouth_eng_to_num = {
                'January': '01',
                'February': '02',
                'March': '03',
                'April': '04',
                'May': '05',
                'June': '06',
                'July': '07',
                'August': '08',
                'September': '09',
                'October': '10',
                'November': '11',
                'December': '12'
            }

            mouth_num = mouth_eng_to_num[str(mouthstr)]
            day_str_raw = str(publish_time_list[1])
            if len(day_str_raw) < 2:
                day_str_raw = '0' + day_str_raw

            publish_time = str(
                publish_time_list[2]
            ) + '-' + mouth_num + '-' + day_str_raw + ' 00:00:00'
            return publish_time

        def deal_reply_nodes(reply_nodes=None):
            reply_nodes_list = []

            def deal_publishtime_inside(publishtime):
                publish_time = publishtime.replace('年', '-').replace(
                    '月', '-').replace('日', '')
                time_split_2 = publish_time.split(' ')

                data_str = time_split_2[0]
                data_str_list = data_str.split('-')
                mounth = data_str_list[1]
                day = data_str_list[2]
                if len(mounth) < 2:
                    mounth = '0' + mounth
                if len(day) < 2:
                    day = '0' + day
                data_str = data_str_list[0] + '-' + mounth + '-' + day

                time_split_2_part2 = time_split_2[1]
                if '下午' in time_split_2_part2:
                    time_part2_h_m = time_split_2_part2.replace('下午',
                                                                '').split(':')
                    time_split_2_h = int(time_part2_h_m[0])
                    time_split_2_m = time_part2_h_m[1]
                    time_split_2_h_add = 12 + time_split_2_h

                    time_pm_finally = str(
                        time_split_2_h_add) + ':' + time_split_2_m + ':00'
                    return data_str + ' ' + time_pm_finally
                elif '上午' in time_split_2_part2:
                    time_part2_h_m = time_split_2_part2.replace('上午',
                                                                '').split(':')
                    time_split_2_h = int(time_part2_h_m[0])
                    time_split_2_m = time_part2_h_m[1]
                    time_split_2_h_add = time_split_2_h

                    if time_split_2_h_add < 10:
                        time_split_2_h_add = '0' + str(time_split_2_h)

                    time_am_finally = str(
                        time_split_2_h_add) + ':' + time_split_2_m + ':00'
                    return data_str + ' ' + time_am_finally

            if reply_nodes:
                reply_nodes_list_eval = eval(reply_nodes[0])
                for one_reply_nodes in reply_nodes_list_eval:
                    content = one_reply_nodes['body']
                    publish_time_raw = one_reply_nodes['displayTime']
                    publish_time = deal_publishtime_inside(publish_time_raw)

                    id = one_reply_nodes['id']
                    publish_user_photo = one_reply_nodes['author']['avatarUrl']
                    publish_user = one_reply_nodes['author']['name']

                    child_reply_node = {
                        'content':
                        content,
                        'publish_time':
                        publish_time,
                        'id':
                        id,
                        'publish_user_href':
                        'http:' + publish_user_photo if 'http'
                        not in publish_user_photo else publish_user_photo,
                        'publish_user':
                        publish_user
                    }
                    reply_nodes_list.append(child_reply_node)

                return reply_nodes_list
            else:
                return None

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title',
            '//div[@class="post-content"]//h3[@class="post-title"]/text()',
            TakeFirst(), lambda x: x.strip())
        loader1.add_xpath(
            'content',
            '//div[@class="post-content"]//p/text()|//div[@class="post-content"]//h1/text()|//div[@class="post-content"]//h2/text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id',
                          response.url.split('=')[1].split('.')[0].strip())
        loader1.add_xpath('img_urls', '//div[@class="post-content"]//img/@src')
        loader1.add_value(
            'publish_time',
            response.xpath(
                '//div[@class="post-content"]//p[@class="post-meta"]').re(
                    '(\S*) (\d{1,2})\, (\d{4})'), deal_publish_time)
        # loader1.add_value('publish_user','degewa')
        # loader1.add_value('reply_count',response.selector.xpath('//*[@id="comments"]/h4/text()').re(ur'(\d{1,2}).*条评论'),lambda x:x[0] if x else 0)
        # loader1.add_value('reply_nodes',response.selector.re(ur'var items \= (\[.*?\])\;'),deal_reply_nodes)

        item = loader1.load_item()
        print(item)
        return item
Example #9
def parse(self, response):
    xlink = LinkExtractor()
    itemre = re.compile(self.itemurl_re)
    for link in xlink.extract_links(response):
        if itemre.search(link.url):
            yield Request(url=link.url, callback=self.parse_item)
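
The snippet above assumes the spider also defines itemurl_re and a parse_item callback. Below is a minimal self-contained sketch of how those pieces could fit together; the spider name, start URL and item-URL pattern are assumptions.

import re

import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor


class ItemLinkSpider(scrapy.Spider):
    name = 'item_links'
    start_urls = ['http://example.com/']  # assumed start page
    itemurl_re = r'/item/\d+'             # assumed pattern for item pages

    def parse(self, response):
        # Same logic as the snippet: extract all links, keep only item URLs.
        xlink = LinkExtractor()
        itemre = re.compile(self.itemurl_re)
        for link in xlink.extract_links(response):
            if itemre.search(link.url):
                yield Request(url=link.url, callback=self.parse_item)

    def parse_item(self, response):
        # Placeholder item callback.
        yield {'url': response.url}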
Example #10
XPATH = {
    'name': "//h1[@class='mrpH']",
    'price': "//span[@class='mcrp']",
    'category': "//div[@class='mrbn']/a",
    'description': "//div[@class='des1 cl']",
    'images': "//div[@id='p_inner']/div/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'giadinhmart.vn'
allowed_domains = ['giadinhmart.vn']
start_urls = ['http://giadinhmart.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(
        LinkExtractor(allow=['/giadinhmart.vn/[a-z0-9-]+-\d+.aspx'],
                      deny=[
                          '/huong-dan-mua-hang', '/phuong-thuc-thanh-toan',
                          '/chinh-sach-hau-mai', '/gioi-thieu-giadinhmartvn',
                          'danh-muc'
                      ]), 'parse_item'),
    Rule(LinkExtractor(allow=['giadinhmart.vn/ca/[a-z0-9-]+-\d+.aspx']),
         'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #11
class CarsSpider(CrawlSpider):
    name = "cars"
    allowed_domains = ["www.olx.ro"]
    start_urls = open('links.txt').readlines()

    rules = (Rule(LinkExtractor(allow=(), restrict_css=('.pageNextPrev', )),
                  callback="parse_item",
                  follow=True), )

    def parse_item(self, response):
        item_links = response.css(
            'table .wrap .offer h3 a::attr(href)').extract()
        for a in item_links:
            yield scrapy.Request(a, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        id = response.css('.offer-bottombar__item > strong::text').get()
        title = response.css('h1::text').get().strip()
        price = response.css('.pricelabel > strong::text').get()
        user = response.css('.offer-user__actions h4 a::attr(href)').get()
        location = response.css('address p::text').get()

        # properties
        properties = response.css('.offer-details__name *::text').getall()
        values = response.css('.offer-details__value *::text').getall()
        for i, p in enumerate(properties):
            if p == 'Oferit de':
                owner_type = values[i]
            elif p == 'Marca':
                brand = values[i]
            elif p == 'Model':
                model = values[i]
            elif p == 'Culoare':
                color = values[i]
            elif p == 'Combustibil':
                fuel_type = values[i]
            elif p == 'Cutie de viteze':
                gearbox_type = values[i]
            elif p == 'An de fabricatie':
                year = values[i]
            elif p == 'Rulaj':
                mileage = values[i]
            elif p == 'Caroserie':
                body_type = values[i]
            elif p == 'Capacitate motor':
                engine_displacement = values[i]
            elif p == 'Stare':
                condition = values[i]

        description = response.xpath(
            './/div[@id="textContent"]/text()').getall()
        # description = [i.strip() + '\n' for i in list(response.xpath('.//div[@id="textContent"]/text()').getall())]
        post_date = response.css('em > strong::text').get()[3:]
        views = response.css('.offer-bottombar__counter > strong::text').get()

        item = OlxItem()
        item['id'] = id
        item['title'] = title
        item['price'] = price
        item['user'] = user
        item['url'] = response.url
        item['location'] = location
        item['description'] = description

        item['owner_type'] = owner_type
        item['brand'] = brand
        item['model'] = model
        item['color'] = color
        item['fuel_type'] = fuel_type
        item['gearbox_type'] = gearbox_type
        item['year'] = year
        item['mileage'] = mileage
        item['body_type'] = body_type
        item['engine_displacement'] = engine_displacement
        item['condition'] = condition

        item['post_date'] = post_date
        item['views'] = views
        yield item
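
The long elif chain above maps two parallel lists (property labels and values); if a label is missing from a page, the corresponding variable is never assigned and the later item[...] assignments raise a NameError. A more defensive alternative, sketched standalone with illustrative values, is to zip the lists into a dict and look fields up by label:

# Alternative sketch: zip the parallel label/value lists into a dict and
# look fields up by label, defaulting to '' when a label is absent.
properties = ['Marca', 'Model', 'An de fabricatie']  # illustrative scraped labels
values = ['Dacia', 'Logan', '2015']                  # illustrative scraped values
details = dict(zip(properties, values))
brand = details.get('Marca', '')
model = details.get('Model', '')
year = details.get('An de fabricatie', '')
condition = details.get('Stare', '')  # a missing label falls back to ''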
Example #12
class NySpider(CrawlSpider):
    name = 'NY'
    allowed_domains = ['www.in-en.com','www.china-nengyuan.com','www.chinaoilonline.com']
    start_urls = [
                  # 'http://www.china-nengyuan.com/news/news_list_1.html',
                  # 'https://www.in-en.com/article/news/intl/',
                  # 'https://www.in-en.com/article/news/china/',
                  # 'https://www.in-en.com/article/news/china/',
                  'http://www.chinaoilonline.com/article.do?method=toArticleListByType2ByTypeidList&p1=15&p1=16&p1=17&p1=18&pageNo=1&pageSize=22&pid=2&subLanmuId=2&rightshow=&typeid=23&titleLength=30'
                  ]#https://www.in-en.com/article/news/intl/','https://www.in-en.com/article/news/china/',
    custom_settings = {
        # Concurrent requests
        'CONCURRENT_REQUESTS': 10,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 1000000,
        'CONCURRENT_REQUESTS_PER_IP': 0,
        # Download delay
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            # Asynchronous (Twisted) MySQL insert pipeline
            'HY_NEWS.pipelines.MysqlTwistedPipeline': 600,
            # Deduplication logic
            # 'HY_NEWS.pipelines.DuplicatesPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            # Enable these when using scrapy_splash
            # 'scrapy_splash.SplashCookiesMiddleware': 723,
            # 'scrapy_splash.SplashMiddleware': 725,

            # Default proxy middleware
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 700,
            # Proxy server middleware
            # 'HY_NEWS.util_custom.middleware.middlewares.ProxyMiddleWare': 100,
            # Disable Scrapy's built-in user-agent middleware
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            # Custom random user-agent middleware
            'HY_NEWS.util_custom.middleware.middlewares.MyUserAgentMiddleware': 120,
            # Default retry middleware (disabled)
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # Custom retry middleware
            'HY_NEWS.util_custom.middleware.middlewares.MyRetryMiddleware': 90,
        },
        # Enable this section when using scrapy_splash
        # 'SPIDER_MIDDLEWARES': {
        #     'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        # },
        # Dedup filter / Splash API endpoint
        # 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        # # 'SPLASH_URL': "http://10.8.32.122:8050/"
        # 'SPLASH_URL': "http://127.0.0.1:8050/"
    }
    rules = (
        Rule(LinkExtractor(restrict_css='div.bd > ul:nth-child(1) > div > a:nth-child(14) '), follow=True),
        Rule(LinkExtractor(restrict_css='.imgBox a '), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_css='.member_tr_row a '), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_css='.zxwk_li a'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = HyNewsItem()
        resp = response.text
        extractor = GeneralNewsExtractor()
        result = extractor.extract(resp, with_body_html=False)
        title = result['title']
        txt = result['content']
        p_time = result['publish_time']
        lyurl = response.url
        lyname = '能源'
        content_css = [
            '#content',
            '.news_link',
            '#zxwk_left_1'

        ]
        for content in content_css:
            content = ''.join(response.css(content).extract())
            if content:
                break
            if not content:
                logging.warning(f'{response.url}: no content CSS selector matched, content not extracted')
        classify, codes, region = get_category(txt)
        item['title'] = title
        item['txt'] = txt
        item['p_time'] = get_times(p_time)
        item['content'] = content
        item['spider_name'] = 'NY'
        item['module_name'] = '行业新闻'
        item['cate'] = classify
        item['region'] = region
        item['code'] = codes
        item['link'] = lyurl
        item['website'] = lyname
        if content:
            yield item
Example #13
def __init__(self, place='amsterdam'):
    self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number) for page_number in range(1, 2)]
    self.base_url = "http://www.funda.nl/koop/%s/" % place
    self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)
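
The extractor stored in self.le1 is not used in the snippet itself; in the full spider it would typically drive a parse callback. A sketch of the surrounding class is below; the FundaSpider class name and the parse_listing callback are assumptions, and only the __init__ body is taken from the snippet.

import scrapy
from scrapy.linkextractors import LinkExtractor


class FundaSpider(scrapy.Spider):
    name = 'funda'

    def __init__(self, place='amsterdam'):
        self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page_number)
                           for page_number in range(1, 2)]
        self.base_url = "http://www.funda.nl/koop/%s/" % place
        self.le1 = LinkExtractor(allow=r'%s+(huis|appartement)-\d{8}' % self.base_url)

    def parse(self, response):
        # Follow every listing link matched by the extractor built in __init__.
        for link in self.le1.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_listing)

    def parse_listing(self, response):
        # Placeholder callback for a single listing page.
        yield {'url': response.url, 'title': response.css('title::text').get()}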
Example #14
class ChengduSpider(CrawlSpider):
    name = 'chengdu'
    allowed_domains = ['chengdu.gov.cn']
    start_urls = ['http://www.chengdu.gov.cn/chengdu/zfxx/zfxx.shtml']

    rules = (Rule(LinkExtractor(allow=r'.*chengdu.gov.cn/chengdu/c131029.*'),
                  callback='parse_page',
                  follow=False), )

    cont_dict = {}

    def parse_item(self, response):
        print("5. parse_item(): " +
              datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
              " -> " + response.url)
        title = response.xpath(
            "//*[@id='bar']/div[1]/div[1]/table/tbody/tr[2]/td/text()").get()
        cont = response.xpath("//*[@class='text_content']").get()
        index_id = str("_NULL")
        pub_org = response.xpath(
            "//*[@id='zc']/div[3]/table/tbody/tr[1]/td[2]/text()").get()

        pub_time = response.xpath(
            "//*[@id='zc']/div[3]/table/tbody/tr[2]/td[1]/text()").get()
        doc_id = response.xpath(
            "//*[@id='zc']/div[3]/table/tbody/tr[1]/td[1]/text()").get()
        region = str('成都')
        update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00")

        if not title:
            return

        print("\t>>> " + title)
        for key in keys:
            if key in title:
                self.dict_add_one(re.sub('[\s+]', ' ', title), response.url,
                                  re.sub('[\s+]', ' ', cont),
                                  re.sub('[\s+]', ' ', pub_time),
                                  re.sub('[\s+]', ' ', pub_org), index_id,
                                  doc_id, region, update_time, key)

        item = YqcChengduSpiderItem(cont_dict=self.cont_dict)

        yield item

    def dict_add_one(self, title, url, cont, pub_time, pub_org, index_id,
                     doc_id, region, update_time, doc_key):
        time.sleep(0.3)
        if title in self.cont_dict:
            self.cont_dict[title]['key_cnt'] += 1
            self.cont_dict[title][
                'doc_key'] = self.cont_dict[title]['doc_key'] + ',' + doc_key
        else:
            cnt_dict = {
                'key_cnt': 1,
                'title': title,
                'url': url,
                'cont': cont,
                'pub_time': pub_time,
                'pub_org': pub_org,
                'index_id': index_id,
                'doc_id': doc_id,
                'region': region,
                'update_time': update_time,
                'doc_key': doc_key
            }

            self.cont_dict[title] = cnt_dict

    def parse_page(self, response):
        url = response.url

        print("4. parse_page(): " +
              datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
              " -> " + url)

        url_prefix = 'http://www.nanning.gov.cn/xxgk/xxgkml/jcxxgk/zcwj/zfwj'

        if str('REPORT_NDOC_006051') in url or str(
                'REPORT_NDOC_006010') in url:
            print("\t>>> debug: " + url)

        if str('currentPage') in url:
            print('currentPage exist')
            tr_list = response.xpath(
                "//*[@id='main']/div[1]/div/div[2]/table/tbody//tr")

            for tr in tr_list:
                # print(tr)
                url = tr.xpath("./td[1]/a/@href").get()
                full_url = url_prefix + url

                yield scrapy.Request(full_url, callback=self.parse_item)

        else:
            print('no currentPage exist')
            if str('REPORT_NDOC_006051') in url or str(
                    'REPORT_NDOC_006010') in url:
                print('\t>>> no currentPage')

            title = response.xpath("//*[@class='detai_title']/text()").get()
            cont = response.xpath("//*[@class='text_content']").get()
            index_id = str("_NULL")
            pub_org = response.xpath(
                "//*[@id='zc']/div[3]/table/tbody/tr[1]/td[2]/text()").get()

            pub_time = response.xpath(
                "//*[@id='zc']/div[3]/table/tbody/tr[2]/td[1]/text()").get()
            doc_id = response.xpath(
                "//*[@id='zc']/div[3]/table/tbody/tr[1]/td[1]/text()").get()
            region = str('成都')
            update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00")

            if not title:
                return
            print("\t>>> " + title)
            for key in keys:
                if key in title:
                    # print("\t>>> included")
                    self.dict_add_one(re.sub('[\s+]', ' ',
                                             title), response.url,
                                      re.sub('[\s+]', ' ', cont),
                                      re.sub('[\s+]', ' ',
                                             pub_time), pub_org, index_id,
                                      doc_id, region, update_time, key)

            item = YqcChengduSpiderItem(cont_dict=self.cont_dict)

            print("6. parse_page(): " +
                  datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
                  " -> " + url)
            # print("\n")
            # print(item)

            yield item
Example #15
class CompetitivecyclistSpider(CrawlSpider):
    name = u'competitivecyclist.com'
    allowed_domains = ['competitivecyclist.com']
    start_urls = [
        'http://www.competitivecyclist.com/Store/sitemaps/categoriesIndex.jsp',
    ]
    
    rules = (
        Rule(LinkExtractor(allow='page=')),
        Rule(LinkExtractor(), callback='parse_category')
        )

    def parse_category(self, response):
        base_url = get_base_url(response)

        products = response.xpath('//div[@id="products"]//a/@href').extract()
        for url in products:
            yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)

        next_page = response.xpath('//li[@class="pag-next"]/a/@href').extract()
        if products:
            # This is to prevent some strange issues with website where it shows next page but there are no products
            for url in next_page:
                yield Request(urljoin_rfc(base_url, url), callback=self.parse_category)

    def parse_product(self, response):
        base_url = get_base_url(response)

        product_links = response.xpath('//div[@id="products"]//a[contains(@class,"qa-product-link")]/@href').extract()
        if product_links:
            for link in product_links:
                yield Request(url_query_cleaner(response.urljoin(link)), callback=self.parse_product)
            return

        product_name = response.xpath('//h1[@itemprop="name"]/text()').extract()
        if not product_name:
            return
        product_name = product_name[-1].strip()
        category = re.findall("name:'Category', value:'([^']+)'", response.body.replace("\\'", "&quote;"))
        if category:
            category = category.pop().replace("&quote;", "'")
        else:
            category = ""
        brand = response.xpath('//h1[@itemprop="name"]/span/text()').extract()
        brand = brand[0].strip() if brand else ''

        rrp_by_sku = {}

        sku_data = re.search(r'BC.product.skusCollection = \$.parseJSON\((.*)\);', response.body)
        if sku_data:
            sku_data = json.loads(demjson.decode(sku_data.group(1), encoding='utf8' ))
            rrp_by_sku = {sku.upper():str(opt['price']['high']) for sku, opt in sku_data.iteritems() if opt['price']['high']>opt['price']['low']}


        options = response.xpath('//li[contains(@class,"qa-variant-item-")]')
        for option in options:
            product_loader = ProductLoader(item=Product(), selector=option)
            sku = option.xpath('./@sku-value').extract()
            sku = sku[0]
            product_loader.add_value('sku', sku)
            product_loader.add_value('identifier', sku)
            option_name = option.xpath('./@title').extract()[0].strip()
            option_name = option_name.replace('One Color, One Size', '').replace(', One Size', '').replace('One Color, ', '').strip()
            if option_name != '':
                product_loader.add_value('name', product_name + ', ' + option_name)
            else:
                product_loader.add_value('name', product_name)
            image_url = option.xpath('./@data-img-large').extract()
            if image_url:
                product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            price = extract_price(option.xpath('./@data-price').extract()[0])
            product_loader.add_value('price', price)
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product = product_loader.load_item()
            metadata = CRCMeta()
            metadata['rrp'] = rrp_by_sku.get(sku.upper(), '')
            product['metadata'] = metadata
            yield product
Example #16
class YahoomovieSpider(CrawlSpider):
    name = "yahoomovie"
    allowed_domains = ["yahoo.com.tw"]
    start_urls = ["https://movies.yahoo.com.tw/movie_intheaters.html?page=1"]

    IMAGE_DIR = MEDIA_ROOT
    # IMAGE_DIR = "D:\\Users\\Administrator\\gb5566\\yahoo_ptt\\media\\movie\\images\\yahoo"

    custom_settings = {
        "IMAGES_STORE": IMAGE_DIR,
        "DOWNLOAD_DELAY": 3,
        "ITEM_PIPELINES": {
            "crawlmovie.pipelines.CustomImagePipeline": 1,
            "crawlmovie.pipelines.YahooPipeline": 100,
            "crawlmovie.pipelines.DeleteNullTitlePipeline": 200,
            "crawlmovie.pipelines.DuplicatesTitlePipeline": 200,
            "crawlmovie.pipelines.CsvExportPipeline": 300,
        },
        "AUTOTHROTTLE_ENABLED": True,
        # The initial download delay
        "AUTOTHROTTLE_START_DELAY": 5,
        # The maximum download delay to be set in case of high latencies
        "AUTOTHROTTLE_MAX_DELAY": 60,
        # The average number of requests Scrapy should be sending in parallel to
        # each remote server
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,
        # "CLOSESPIDER_ITEMCOUNT": 150,
    }

    rules = (
        Rule(
            LinkExtractor(
                restrict_xpaths="//div[@class='release_movie_name']/a"),
            callback="parse_item",
            follow=True,
        ),
        Rule(LinkExtractor(restrict_xpaths="//li[@class='nexttxt']/a")),
    )

    def parse_item(self, response):
        item = YahooCloudItem()
        title = response.xpath(
            "normalize-space(//div[@class='movie_intro_info_r']/h1/text())"
        ).extract()
        item["title"] = "".join(title)

        critics_consensus = response.xpath(
            "normalize-space(//span[@id='story']/text())").extract()
        item["critics_consensus"] = "".join(
            [i.replace(u"\xa0", u"") for i in critics_consensus])
        item["release_date"] = response.xpath(
            "(//div[@class='movie_intro_info_r']/span[1]/text())").extract()[0]

        duration = response.xpath(
            "//div[@class='movie_intro_info_r']/span[2]/text()").extract()
        item["duration"] = "".join(
            [i.replace(u"\\u3000\\", u"") for i in duration])

        item["genre"] = response.xpath(
            "normalize-space((//div[@class='level_name'])[2]/a/text())"
        ).extract()
        # i['rating'] = response.css(
        #     '.ratingValue ::text').extract()[1]
        item["rating"] = response.xpath(
            "//div[@class='score_num count']/text()").extract()
        item["amount_reviews"] = response.xpath(
            "//div[@class='circlenum']/div[@class='num']/span/text()").extract(
            )
        url = response.xpath(
            "//div[@class='movie_intro_foto']/img/@src").extract()
        link = "".join(url)
        item["images"] = {item["title"]: link}

        yield item
Example #17
class TouzishijianSpider(CrawlSpider):
	name = 'huang114_all'
	allowed_domains = ['114chn.com']
	start_urls = ['http://www.114chn.com/']
	custom_settings = {
		'DEFAULT_REQUEST_HEADERS': {
			'upgrade-insecure-requests': "1",
			# 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
			'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
			'referer': "http://search.114chn.com/searchresult.aspx?type=1&areaid=31&pattern=2&page=100",
			'accept-encoding': "gzip, deflate",
			'accept-language': "zh-CN,zh;q=0.8",
			# 'cookie': "bdshare_firstime=1509612560767; UM_distinctid=15f7bed65c32e0-01af79ead3a85f-31637e01-13c680-15f7bed65c4735; Hm_lvt_40b8d9bb56b7b9b3fee170b6b9b4bc8e=1509612561; Hm_lpvt_40b8d9bb56b7b9b3fee170b6b9b4bc8e=1509613182; CNZZDATA30067493=cnzz_eid%3D1102648662-1510104203-http%253A%252F%252Fsearch.114chn.com%252F%26ntime%3D1510104203",
			'cache-control': "no-cache",
			'postman-token': "b710e80f-5152-1b73-ed8c-b5342bd0c5a9"
		}
	}

	# def start_requests(self):
	# 	burl = 'http://search.114chn.com/searchresult.aspx?type=1&key={k}&pattern=2&page=1'
	# 	x = 0
	# 	while True:
	# 		comp_name = rc.spop('zhuce_names')
	# 		if not comp_name:
	# 			x += 1
	# 			if x > 5:
	# 				raise CloseSpider('no datas')
	# 			time.sleep(60)
	# 			continue
	# 		url = burl.format(k=comp_name)
	# 		yield scrapy.Request(url, meta={'dont_redirect': True})

		# start_url = "http://search.114chn.com/searchresult.aspx?type=1&areaid={area}&pattern=2&page=100"
		# for i in range(100):
		# 	yield scrapy.Request(start_url.format(area=str(i)), dont_filter=True)

	rules = (
		# Rule(LinkExtractor(allow=('searchresult',))),
		Rule(LinkExtractor(allow=('.*',), deny=('s\.114chn', 'Error\.htm'))),
		Rule(
			LinkExtractor(
				allow=(
					'TradeDetail\.aspx',
					'Free\.aspx'
				)
			),
			callback='parse_item'
		),
	)

	def parse_item(self, response):
		item = huang114AllItem()
		if '很抱歉!页面在您访问时发生了错误' in response.text or '对不起 !' in response.text:
			return
		select = Selector(text=response.text)
		if 'freeindex' in response.url:
			comp_name = select.xpath('//*[@id="lblCompanyName"]//text()').extract_first()
			link_man = select.xpath('//*[@id="lblLinkMan"]//text()').extract_first()
			tel = select.xpath('//*[@id="lblTel"]/text()').extract_first()
			email = select.xpath('//*[@id="lblEmail"]/text()').extract_first()
			addr = select.xpath('//*[@id="lblAddress"]/text()').extract_first()
			t = select.xpath('//*[@id="lblContent"]//text()').extract()
			intro = ''.join(t) if t else ''
		else:
			comp_name = select.xpath('//div[@class="zitifree"]/text()').extract_first()
			link_man = select.xpath('//*[@class="lblLinkMan"]//text()').extract_first()
			tel = select.xpath('//*[@class="lbltel"]/text()').extract_first()
			email = select.xpath('//*[@class="lblemail"]/text()').extract_first()
			addr = select.xpath('//*[@class="lblweb"]/text()').extract_first()
			t = select.xpath('//div[@class="xinxi"]//text()').extract()
			intro = ''.join(t) if t else ''
		item['comp_url'] = response.url
		item['comp_name'] = comp_name.strip() if comp_name else ''
		item['link_man'] = link_man
		item['tel'] = tel
		item['email'] = email
		item['addr'] = addr
		item['intro'] = intro
		yield item
Example #18
class VacanciesSpider(scrapy.spiders.CrawlSpider):
    """
    This class defines a crawl spider for the vacancies listed on the
    Airbus job portal.
    """
    # Define module logger
    logger = logger_setup.setup_module_logger(__name__)

    # Spider properties
    name = 'vacancies'
    allowed_domains = ['airbus.com']
    start_urls = [
        'https://www.airbus.com/careers/search-and-apply/'
        'search-for-vacancies.html/?page=1'
    ]

    # Scraped data
    scraped_data = []

    @staticmethod
    def get_next_listing_page(this_page: str) -> str:
        """
        This static method generates the link for the next vacancies listing
        page.
        :param this_page: String with the url of the current page
        :return: String with the url of the next page to index
        """
        this_page_number = int(
            re.search('(?<=(\\?page\\=))(\\d+)', this_page).group())
        next_page = re.sub('(?<=(\\?page\\=))(\\d+)',
                           str(this_page_number + 1), this_page)
        return next_page

    def parse_vacancies_links(self, response: scrapy.http.Response) -> \
            scrapy.http.Request:
        """
        This method gets the links of the vacancies listed in the response,
        requests each of them and calls the ``parse_vacancies_contents``
        method to parse its data.
        :param response: Scraped response of the listing page
        :return: Request of parsing the contents of each listed vacancy
        """
        # self.logger.info('Processing listing page: %s', response.url)
        for href in response.xpath(
                "//section[@class='c-jobsearchpage__content']"
                "//div[@class='c-jobcarousel__slider--title']"
                "//a/@href").getall():
            yield scrapy.Request(response.urljoin(href),
                                 self.parse_vacancies_contents)

    def parse_vacancies_contents(self, response: scrapy.http.Response) -> None:
        """
        This method parses the contents of the vacancy from the scraped web
        page and stores them in the fields of a Scrapy Item.
        :param response:
        :return:
        """
        # Parse vacancy fields
        vacancy = vacancy_item.Vacancy()
        # Url
        vacancy['url'] = response.url
        # Title
        vacancy['title'] = response.xpath(
            "//div[@class='c-jobdetails']"
            "//div[@class='header-job']"
            "//h2[@class='c-banner__title col-xs-12 col-sm-12']/text()").get()
        # Posting Date
        vacancy['published_date'] = response.xpath(
            "//div[@class='c-jobdetails']"
            "//div[@class='header-job']"
            "//div[@class='c-banner__jobinfo-ligne'][1]"
            "/span[2]/text()").get()
        # Division
        vacancy['division'] = response.xpath(
            "//div[@class='c-jobdetails']"
            "//div[@class='header-job']"
            "//div[@class='c-banner__jobinfo-ligne'][2]"
            "/span[2]/text()").get()
        # Location
        vacancy['location'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='header-job']"
                           "//div[@class='c-banner__jobinfo-ligne'][3]"
                           "/span[2]/text()").get()).strip()
        # Reference Code
        vacancy['reference_code'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='External code']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Functional Area
        vacancy['job_family'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Job family']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Contract Type
        vacancy['contract_type'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Contract type']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Work Experience
        vacancy['work_experience'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Experience level']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Working Time
        vacancy['working_time'] = str(
            response.xpath("//div[@class='c-jobdetails']"
                           "//div[@class='c-banner__jobinfo-botton-ligne']"
                           "//span[text()='Working Time']"
                           "/parent::*/span[2]/text()").get()).strip()
        # Description
        vacancy['description'] = html2text.html2text(
            response.xpath("//div[@class='has-padding c-contentjob']"
                           "//h2[text()='Job Description']"
                           "/parent::*/div[2]").get())

        self.logger.info('Processing vacancy: %s --> %s', vacancy.get('title'),
                         vacancy.get('url'))
        self.scraped_data.append(vacancy)

    def parse_start_url(self, response: scrapy.http.Response):
        """
        This override ensures that the initial listing page itself is also
        parsed for vacancy links instead of being skipped.
        :param response:
        :return:
        """
        return self.parse_vacancies_links(response)

    def closed(self, reason: str) -> None:
        """
        This method is called on spider closing and prints the parsed vacancies
        for debugging purposes. It will be removed in the future.
        :param reason: The reason of the spider closing
        :return:
        """
        print("Spider will close, reason: {}".format(reason))
        print("The scraped data is as follows:")
        print(self.scraped_data)

    rules = (scrapy.spiders.Rule(LinkExtractor(
        allow=(),
        restrict_css=('a.c-pagination--item.link.'
                      'c-jobsearchpage_searchlink.current', ),
        tags=('a', ),
        attrs=('href', ),
        process_value=get_next_listing_page.__func__),
                                 callback="parse_vacancies_links",
                                 follow=True), )
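
The rule above relies on process_value=get_next_listing_page.__func__ to rewrite the "current page" link into the next listing page before it is followed. The increment logic can be checked standalone, without crawling; the page number used here is purely illustrative.

# Quick sanity check of the page-increment helper used as process_value.
next_url = VacanciesSpider.get_next_listing_page(
    'https://www.airbus.com/careers/search-and-apply/'
    'search-for-vacancies.html/?page=3')
assert next_url.endswith('?page=4')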
Example #19
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='product-description']/div[@class='pd-name']/h1",
    'price' : "//div[@class='pctemp chudam']/div/span[@class='sub1 txt_20 chudam']",
    'category' : "//div[@id='location_hunv']/div[1]/a",
    'description' : "//div[@class='pcdetails']/div/p",
    'images' : "//div[@class='framehb imgd11 pn_img']//a/@href | //div[@class='thumb-view']//a/@href",
    'canonical' : "//link[@rel='canonical']/@href",
    'base_url' : "",
    'brand' : ""
}
name = 'shopbevame.com'
allowed_domains = ['shopbevame.com']
start_urls = ['http://shopbevame.com/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(), 'parse_item'),
    Rule(LinkExtractor(), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #20
XPATH = {
    'name': "//div[@id='left_']/div[3]/h2",
    'price':
    "//div[@class='specifications']/div[5]/ul/li/form/label/p[2]/span",
    'category': "//div[@id='left_']/div[1]/a",
    'description':
    "//div[@class='tabcontentstyle']/div[@id='tab2']/div[@id='stcpDiv']",
    'images': "//div[@class='clearfix chitietsp']/img/@data-large",
    'canonical': "",
    'base_url': "",
    'brand': "",
    'in_stock': "",
    'guarantee': "",
    'promotion': ""
}
name = 'puritanpride.vn'
allowed_domains = ['puritanpride.vn']
start_urls = ['http://www.puritanpride.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = ['']
rules = [
    Rule(LinkExtractor(allow=['/detail-\d+-[a-zA-Z0-9-]+\.html$']),
         'parse_item'),
    Rule(LinkExtractor(allow=['/catergories3-\d+-[a-zA-Z0-9-]+\.html']),
         'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #21

class FacultyPagesFilteredSpider(scrapy.Spider):
    name = 'faculty_pages_filtered'
    allowed_domains = [
        'cmu.edu', 'cornell.edu', 'washington.edu', 'gatech.edu',
        'princeton.edu', 'utexas.edu', 'illinois.edu', 'berkeley.edu',
        'mit.edu', 'stanford.edu'
    ]
    count = 0
    record = {}
    start_urls = [
        'https://www.cmu.edu/', 'https://www.cornell.edu/',
        'https://www.washington.edu/', 'https://www.gatech.edu/',
        'https://www.princeton.edu/', 'https://www.utexas.edu/',
        'https://illinois.edu/', 'https://www.berkeley.edu/',
        'https://www.mit.edu/', 'https://www.stanford.edu/'
    ]

    exclude_words = [
        'news', 'events', 'publications', 'pub', 'gallery', 'category',
        'courses', 'students', 'references', 'reference', 'software',
        'softwares', 'tags', 'tutorials', 'workshop', 'festival', 'admissions',
        'exhibitions', 'alumni', 'lectures', 'undergraduate', 'about',
        'history', 'awards', 'ranking', 'enrollment', 'graduate', 'archive',
        'stories', 'post', 'pages', 'magazine', 'curriculum', '404', 'faqs',
        'engage', 'campaign', 'career', 'resources', 'services', 'network',
        'security', 'donate', 'giving', 'finance', 'forms', 'policies',
        'policy', 'alphabetical', 'summer', 'winter', 'spring', 'autumn',
        'fall', 'health', 'facilities', 'facility', 'wp', 'information',
        'general', 'catalog', 'guides', 'library', 'publish', 'blog',
        'collection', 'share', 'search', 'periodicals', 'bookstore', 'store',
        'product', 'organisation', 'webstore', 'funding', 'pdf'
    ]

    rules = [Rule(LinkExtractor(unique=True), callback='parse', follow=True)]
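    # Note: 'rules' only takes effect on CrawlSpider subclasses; this plain
    # scrapy.Spider ignores it and follows links manually in parse() below.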

    #count_limits = {"page_count": 200, "item_count": 200}

    def __init__(self):

        self.tree = Tree()
        self.tree.create_node("root", "root")
        self.tree.create_node("unknown", "unknown", parent="root")

        self.bio_identifier = BioIdentifier(model="bio-model")

        for dom in self.allowed_domains:
            domain = dom.split('.')[0]
            if not os.path.exists('Crawled_Data'):
                os.makedirs('Crawled_Data')

            folder_name = 'Crawled_Data/' + domain.capitalize(
            ) + '_University_Files'
            self.record[domain] = 0
            if not os.path.exists(folder_name):
                os.makedirs(folder_name)

    def parse(self, response):

        matched_domain = [x for x in self.allowed_domains if x in response.url]
        if len(matched_domain) > 0:
            domain = matched_domain[0].split('.')[0]

            folder_name = 'Crawled_Data/' + domain.capitalize(
            ) + '_University_Files'

            self.record[domain] = self.record.get(domain, 0) + 1

            if self.record[domain] % 50 == 0:
                print('\n Crawled {} pages of {} University ...'.format(
                    self.record[domain], domain.capitalize()))
                self.tree.save2file(folder_name + "/00__" +
                                    str(self.record[domain]) + "_tree.txt")

            isBio = self.bio_identifier.is_bio_html_content(
                response.xpath('//*').get())

            if isBio:
                text = BeautifulSoup(response.xpath('//*').get(),
                                     features="html.parser").get_text()
                tokens = nltk.word_tokenize(text)
                normalized_text = ' '.join(
                    [word for word in tokens if word.isalnum()])
                normalized_text += '\n' + response.url

                hash_text = hashlib.md5(response.url.encode())
                file_name = hash_text.hexdigest()

                with open(folder_name + "/" + file_name + ".txt",
                          "w",
                          encoding="utf-8") as file:
                    file.write(normalized_text)

            AllLinks = LinkExtractor(allow_domains=domain + '.edu',
                                     unique=True).extract_links(response)

            for n, link in enumerate(AllLinks):
                if not any([x in link.url for x in self.exclude_words]):
                    if self.tree.get_node(link.url) is None:
                        referer = response.request.headers.get('Referer', None)

                        if referer is None:
                            self.tree.create_node(link.url,
                                                  link.url,
                                                  parent='root')
                        else:
                            referer = referer.decode("utf-8")
                            if self.tree.contains(referer):

                                self.tree.create_node(link.url,
                                                      link.url,
                                                      parent=referer)
                            else:
                                self.tree.create_node(link.url,
                                                      link.url,
                                                      parent='unknown')

                        yield scrapy.Request(url=link.url, callback=self.parse)
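
As a standalone illustration of the treelib bookkeeping used in parse() above, the sketch below attaches each discovered URL under its referring page, falling back to the "root"/"unknown" nodes when no referer is known. It assumes the treelib package is installed; the URLs are purely illustrative.

from treelib import Tree

tree = Tree()
tree.create_node("root", "root")
tree.create_node("unknown", "unknown", parent="root")

def record_link(link_url, referer=None):
    # Skip URLs that were already recorded as nodes.
    if tree.get_node(link_url) is not None:
        return
    if referer is None:
        tree.create_node(link_url, link_url, parent="root")
    elif tree.contains(referer):
        tree.create_node(link_url, link_url, parent=referer)
    else:
        tree.create_node(link_url, link_url, parent="unknown")

record_link("https://www.cmu.edu/engineering/")
record_link("https://www.cmu.edu/engineering/people/",
            referer="https://www.cmu.edu/engineering/")
tree.show()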
Example #22
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='infomation_detail infomation']/h1/strong/span",
    'price':
    "//td[@class='voucher_td']/p[@class='price_room']/span[@class='price_show']",
    'category': "//ul[@class='main_header_ul']/li/a/span",
    'description': "//div[@class='cont_toggle']/div[@id='detail_description']",
    'images':
    "//td[@id='td_show_img']/img/@src | //div[@class='sliderkit-nav-clip']/ul/li/a/img/@data-large",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'mytour.vn'
allowed_domains = ['mytour.vn']
start_urls = ['http://mytour.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/\d+-[a-zA-Z0-9-]+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/c+\d+/.+\.html($|\?page=\d+$)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #23
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='content-info clearfix']//h1",
    'price' : "//div[@class='bigprice']",
    'category' : "//div[@class='dieuhuong fl']/a",
    'description' : "//div[@class='sp-tab clearfix']/div[@class='content']",
    'images' : "//div[@class='main-img']/a[@class='fancy']/img/@src",
    'canonical' : "//link[@rel='canonical']/@href",
    'base_url' : "",
    'brand' : "",
    'in_stock' : "",
    'guarantee' : "",
    'promotion' : ""
}
name = 'dieuhoabonmua.vn'
allowed_domains = ['dieuhoabonmua.vn']
start_urls = ['http://dieuhoabonmua.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = ['']
rules = [
    Rule(LinkExtractor(allow=['/[\w-]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[\w-]+/($|page/\d+/$)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #24
0
class GoogleSpider(CrawlSpider):
    name = "google"
    allowed_domains = ["play.google.com"]
    start_urls = [
        'https://play.google.com/store',
        #'https://play.google.com/store/apps/category/GAME/collection/topselling_free',
        'https://play.google.com/store/apps/details?id=com.viber.voip'
    ]

    rules = [
        Rule(LinkExtractor(
            allow=("https://play\.google\.com/store/apps/details", )),
             callback='parse_app',
             follow=True),
    ]  # CrawlSpider crawls pages matching the rules above and passes them to the callback

    def parse_app(self, response):
        # Only grab the page URL, app id and rating count here
        item = GoogleplayItem()
        item['url'] = response.url

        app_id = response.url.split('=')[-1]
        if app_id:
            item['app_id'] = app_id
        else:
            item['app_id'] = ''

        rate_count = response.xpath('//span[@class="rating-count"]/text()')
        if rate_count:
            rate_count = rate_count.extract()[0].strip().replace(',', '')
            item['rating_count'] = rate_count

        # app_name_div = response.xpath('//div[@class="id-app-title"]/text()')
        # if not app_name_div:
        #     logging.error(msg='not find the app name')
        #     return
        # item['app_name'] = app_name_div.extract()[0].strip()

        # mail_a = response.xpath(
        #     '//div[@class="content contains-text-link"]/a[2]/@href')
        # if not mail_a:
        #     return
        #
        # mail_text = mail_a.extract()[0]
        # if 'mailto:' in mail_text:
        #     mail_text = mail_text.replace('mailto:', '')
        # item['mail'] = mail_text.strip()
        #
        # company_name_span = response.xpath('//span[@itemprop="name"]/text()')
        # if not company_name_span:
        #     return
        #
        # company_name = company_name_span.extract()[0].strip()
        # item['company_name'] = company_name
        #
        # download_count = response.xpath(
        #     '//div[@itemprop="numDownloads"]/text()')
        # if download_count:
        #     item['download_count'] = download_count.extract()[0].strip()
        # else:
        #     item['download_count'] = '0'

        yield item
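
One fragile spot above is app_id = response.url.split('=')[-1], which breaks when the URL carries extra query parameters (e.g. &hl=en). A more robust alternative using only the standard library is sketched below; extract_app_id is a hypothetical helper, not part of the original spider.

from urllib.parse import urlparse, parse_qs

def extract_app_id(url):
    # Return the ?id= value of a Play Store details URL, or '' if absent.
    query = parse_qs(urlparse(url).query)
    return query.get('id', [''])[0]

print(extract_app_id(
    'https://play.google.com/store/apps/details?id=com.viber.voip&hl=en'))
# -> com.viber.voip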
Example #25
0
class NewZimukuCrawler(scrapy.Spider):
    name = "new_zimuku_crawler"
    allowed_domains = [
        "zimuku.la",
        "subku.net",
    ]

    # List to hold all search result pages
    url_list = [
        "https://www.zimuku.la/search?q=&p=1",
        "https://www.zimuku.la/search?q=&p=2",
    ]

    # Generate search result urls automatically
    counter = 3
    while counter <= 2369:
        url_list.append("https://www.zimuku.la/search?q=&p=" + str(counter))
        counter += 1

    print("[INFO]\tURL list generated.")

    start_urls = url_list

    rules = (Rule(LinkExtractor(allow=('\.htm'))), )

    def parse(self, response):
        # Find containers for download page link and file name
        containers = response.selector.xpath(
            '//div[contains(@class, "item prel")]/div[contains(@class, "title")]/div/table/tbody/tr/td[contains(@class, "first")]'
        )

        # Go through all containers
        for container in containers:
            # Get file name for that specific file
            file_name = container.xpath('a/@title')[0].extract()

            # Assign file name to new item
            item = NewZimukuCrawlerItem()
            item['file_name'] = file_name

            # Get link to download page
            href = container.xpath('a/@href')[0].extract()

            # Go to download page
            url = response.urljoin(href)
            request = scrapy.Request(url, callback=self.parse_detail)
            request.meta['item'] = item

            yield request

    # Download page for a specific subtitle
    def parse_detail(self, response):
        # Get link to provider selection page
        url = response.selector.xpath(
            '//li[contains(@class, "dlsub")]/div/a[contains(@id, "down1")]/@href'
        ).extract()[0]

        # Go to provider selection page
        request = scrapy.Request(url, callback=self.parse_download)
        request.meta['item'] = response.meta['item']

        yield request

    # Webpage that opens to select provider
    def parse_download(self, response):
        # Get url to actual file download
        url = response.selector.xpath(
            '//div[contains(@class, "down")]/ul/li/a/@href').extract()[4]

        # Download file
        request = scrapy.Request(url, callback=self.parse_file)
        request.meta['item'] = response.meta['item']

        yield request

    def parse_file(self, response):
        body = response.body
        item = response.meta['item']
        item['body'] = body

        return item
Example #26
0
class CaijingSpider(CrawlSpider):
    name = 'caijing'
    source = "财经网"
    allowed_domains = ["caijing.com.cn"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
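    # reg holds yesterday's date stamp (YYYYMMDD); the first rule below only
    # follows article URLs that contain it, i.e. yesterday's news.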
    reg = yesterday
    start_urls = ['http://industry.caijing.com.cn/industrianews/']
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        Rule(LinkExtractor(allow='industrianews/[2-4].shtml')),
        # Rule(LinkExtractor(allow='industrianews/[0-9].shtml')),
    )

    def printcn(self, uni):
        for i in uni:
            print i.encode('utf-8')

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//*[@id="cont_title"]/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        date = response.xpath('//span[@id="pubtime_baidu"]/text()').extract()
        if date:
            item['date'] = ''.join(date).replace(u'-', u'').replace(
                u':', u'').replace(u' ', u'').strip()
        else:
            date = response.xpath('//span[@id="cont_riqi"]/text()').extract()
            if date:
                item['date'] = ''.join(''.join(date).replace(
                    u'年', u'').replace(u'月', u'').replace(u'日', u'').replace(
                        u':', u'').replace(u' ', u'').strip()[-12:]) + '00'

    def get_body(self, response, item):
        paras = response.xpath('//div[@id="the_content"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    #   print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
Example #27
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1",
    'price': "//ul[@class='list-unstyled']/li/h2",
    'category': "//ul[@class='breadcrumb']/li/a",
    'description': "//div[@class='tab-content']/div[@class='tab-pane active']",
    'images': "//ul[@class='thumbnails']/li/a/@href",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "//base/@href",
    'brand': ""
}
name = 'hethongtongdai.vn'
allowed_domains = ['hethongtongdai.vn']
start_urls = ['http://hethongtongdai.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/shop/[a-zA-Z0-9-/]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['/shop/[a-zA-Z0-9-]+($|\?page=\d+$)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #28
0
class P5wSpider(CrawlSpider):
    name = 'p5w'
    source = "全景网"
    allowed_domains = ["p5w.net"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = [
        'http://www.p5w.net/news/gncj',
        'http://www.p5w.net/news/gjcj',
        'http://www.p5w.net/news/cjxw',
        'http://www.p5w.net/news/xwpl',
        'http://www.p5w.net/news/biz',
        'http://www.p5w.net/news/cjxw/fdcy',
        'http://www.p5w.net/news/cjxw/zzyjxsbyb',
        'http://www.p5w.net/news/tech',
        'http://www.p5w.net/news/travel',
        'http://www.p5w.net/news/pgt',
        'http://www.p5w.net/news/sjqx',
        'http://www.p5w.net/news/gncj/index_2.htm',
        'http://www.p5w.net/news/gjcj/index_2.htm',
        'http://www.p5w.net/news/cjxw/index_2.htm',
        'http://www.p5w.net/news/xwpl/index_2.htm',
        'http://www.p5w.net/news/biz/index_2.htm',
        'http://www.p5w.net/news/cjxw/fdcy/index_2.htm',
        'http://www.p5w.net/news/cjxw/zzyjxsbyb/index_2.htm',
        'http://www.p5w.net/news/tech/index_2.htm',
        'http://www.p5w.net/news/travel/index_2.htm',
        'http://www.p5w.net/news/pgt/index_2.htm',
        'http://www.p5w.net/news/sjqx/index_2.htm',
        'http://www.p5w.net/news/gncj/index_3.htm',
        'http://www.p5w.net/news/gjcj/index_3.htm',
        'http://www.p5w.net/news/cjxw/index_3.htm',
        'http://www.p5w.net/news/xwpl/index_3.htm',
        'http://www.p5w.net/news/biz/index_3.htm',
        'http://www.p5w.net/news/cjxw/fdcy/index_3.htm',
        'http://www.p5w.net/news/cjxw/zzyjxsbyb/index_3.htm',
        'http://www.p5w.net/news/tech/index_3.htm',
        'http://www.p5w.net/news/travel/index_3.htm',
        'http://www.p5w.net/news/pgt/index_3.htm',
        'http://www.p5w.net/news/sjqx/index_3.htm',
        'http://www.p5w.net/news/gncj/index_4.htm',
        'http://www.p5w.net/news/gjcj/index_4.htm',
        'http://www.p5w.net/news/cjxw/index_4.htm',
        'http://www.p5w.net/news/xwpl/index_4.htm',
        'http://www.p5w.net/news/biz/index_4.htm',
        'http://www.p5w.net/news/cjxw/fdcy/index_4.htm',
        'http://www.p5w.net/news/cjxw/zzyjxsbyb/index_4.htm',
        'http://www.p5w.net/news/tech/index_4.htm',
        'http://www.p5w.net/news/travel/index_4.htm',
        'http://www.p5w.net/news/pgt/index_4.htm',
        'http://www.p5w.net/news/sjqx/index_4.htm',
        'http://www.p5w.net/news/gncj/index_5.htm',
        'http://www.p5w.net/news/gjcj/index_5.htm',
        'http://www.p5w.net/news/cjxw/index_5.htm',
        'http://www.p5w.net/news/xwpl/index_5.htm',
        'http://www.p5w.net/news/biz/index_5.htm',
        'http://www.p5w.net/news/cjxw/fdcy/index_5.htm',
        'http://www.p5w.net/news/cjxw/zzyjxsbyb/index_5.htm',
        'http://www.p5w.net/news/tech/index_5.htm',
        'http://www.p5w.net/news/travel/index_5.htm',
        'http://www.p5w.net/news/pgt/index_5.htm',
        'http://www.p5w.net/news/sjqx/index_5.htm'
    ]
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
    )

    def printcn(self, uni):
        for i in uni:
            print i.encode('utf-8')

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath(
            '//div[@class="newscontent_right2"]/h1/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        date = response.xpath(
            '//div[@class="content_info clearfix"]/span[1]/time/text()'
        ).extract()
        if not date:
            date = response.xpath('//span[@id="dTime"]/text()').extract()
        if date:
            item['date'] = datetime.date.today().strftime('%Y') + ''.join(
                date).replace(u'月', u'').replace(u'日', u'').replace(
                    u':', u'').replace(u' ', u'').strip() + '00'

    def get_body(self, response, item):
        paras = response.xpath('//div[@class="article_content2"]/div/p')
        if not paras:
            paras = response.xpath('//div[@class="Custom_UnionStyle"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    #   print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
Example #29
0
class lagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']
    agent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/" \
            "537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"

    # without these settings, requests get redirected to the login page
    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.8',
            'Connection':
            'keep-alive',
            'Cookie':
            'JSESSIONID=ABAAABAAAFCAAEGBC99154D1A744BD8AD12BA0DEE80F320; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; _ga=GA1.2.1111395267.1516570248; _gid=GA1.2.1409769975.1516570248; user_trace_token=20180122053048-58e2991f-fef2-11e7-b2dc-525400f775ce; PRE_UTM=; LGUID=20180122053048-58e29cd9-fef2-11e7-b2dc-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; X_HTTP_TOKEN=7e9c503b9a29e06e6d130f153c562827; _gat=1; LGSID=20180122055709-0762fae6-fef6-11e7-b2e0-525400f775ce; PRE_HOST=github.com; PRE_SITE=https%3A%2F%2Fgithub.com%2Fconghuaicai%2Fscrapy-spider-templetes; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2F4060662.html; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1516569758,1516570249,1516570359,1516571830; _putrc=88264D20130653A0; login=true; unick=%E7%94%B0%E5%B2%A9; gate_login_token=3426bce7c3aa91eec701c73101f84e2c7ca7b33483e39ba5; LGRID=20180122060053-8c9fb52e-fef6-11e7-a59f-5254005c3644; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1516572053; TG-TRACK-CODE=index_navigation; SEARCH_ID=a39c9c98259643d085e917c740303cc7',
            'Host':
            'www.lagou.com',
            'Origin':
            'https://www.lagou.com',
            'Referer':
            'https://www.lagou.com/',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
    }

    rules = (
        # Rule(LinkExtractor(allow=('zhaopin/.*',)), follow=True),
        # Rule(LinkExtractor(allow=('gongsi/j\d+.html',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'),
             callback='parse_job',
             follow=True), )

    def parse_job(self, response):
        """
        解析拉勾网的职位
        :param response:
        :return:
        """
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary_min", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years_min",
                              "//*[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']/p/span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()

        return job_item
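
get_md5 above is imported from elsewhere in the project; a typical implementation (an assumption, shown only so the example is self-contained) hashes the URL into a stable fixed-length key suitable for url_object_id:

import hashlib

def get_md5(url):
    # Hex MD5 digest of a URL string, used as a deduplication key.
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()

print(get_md5('https://www.lagou.com/jobs/4060662.html'))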
Example #30
0
class ScrutinSpider(BaseSpider):
    name = "scrutinspider"

    rules = [
        Rule(LinkExtractor(allow=['/scrutins/liste/.*']),
             'parse_an_scrutins', follow=True),
        Rule(LinkExtractor(allow=['/scrutin-public/scr\d+.html']),
             'parse_senat_session', follow=True)
    ]

    start_urls = [
        'http://www2.assemblee-nationale.fr/scrutins/liste/',
        'http://www.senat.fr/seancepub.html'
    ]

    def parse_an_scrutins(self, response):
        for scrutin in response.xpath('//table[@class="scrutins"]/tbody/tr'):
            item = ScrutinItem()
            item['chambre'] = 'AN'
            item['numero'] = self.get_text(scrutin, 'td[1]').rstrip('*')
            item['objet'] = self.get_text(scrutin, 'td[3]').strip(
                ' [').capitalize()
            item['url'] = self.make_url(response, scrutin.select(
                'td/a[contains(text(), "analyse")]/@href')[0].extract())

            matches = re.search('(\d{1,2})/(\d{1,2})/(\d{1,4})',
                                self.get_text(scrutin, 'td[2]'))
            item['date'] = '-'.join((matches.group(3), matches.group(2),
                                     matches.group(1)))

            try:
                item['dossier_url'] = self.make_url(response, scrutin.select(
                    'td/a[contains(text(), "dossier")]/@href')[0].extract())
            except IndexError:
                pass

            yield item

    def parse_senat_session(self, response):
        for bloc in response.xpath('//div[@class="blocscr"]'):
            href = bloc.xpath('span[@class="blocscrnr"]/a/@href')[0].extract()
            dlink = bloc.xpath(
                '//a[contains(@href, "/dossier-legislatif/")]/@href')

            req = Request(url=self.make_url(response, href),
                          callback=self.parse_senat_scrutin)
            if len(dlink):
                req.meta['dlink'] = dlink[0].extract()

            yield req

    def parse_senat_scrutin(self, response):
        item = ScrutinItem()
        item['chambre'] = 'SEN'

        titlediv = response.xpath('//div[@class="title"]')[0]
        title = self.get_text(titlediv, 'h1')

        matches = re.search(ur'scrutin-public/(\d+)/scr.*\.html', response.url)
        session = matches.group(1)

        matches = re.search(ur'^Scrutin n° (\d+) - séance du (.*)$', title)
        item['numero'] = '%s-%s' % (session, matches.group(1))

        objet = self.get_text(response, '//div[@id="wysiwyg"]/p/i')
        item['objet'] = objet

        item['url'] = response.url

        dmatches = re.search(r'^(\d+) (\D+) (\d+)$', matches.group(2))
        item['date'] = '%04d-%02d-%02d' % (int(dmatches.group(3)),
                                           _months[dmatches.group(2)],
                                           int(dmatches.group(1)))

        if 'dlink' in response.meta:
            item['dossier_url'] = response.meta['dlink']
        else:
            dlink = response.xpath(
                '//a[contains(@href, "/dossier-legislatif/")]/@href')

            if len(dlink):
                item['dossier_url'] = self.make_url(response,
                    dlink[0].extract())

        yield item
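
The _months lookup used in parse_senat_scrutin is defined elsewhere in the original module; a plausible version (an assumption, not the project's code) maps French month names to their numbers so the '%04d-%02d-%02d' date can be built:

# Hypothetical _months mapping (the real one lives elsewhere in the module).
_months = {
    u'janvier': 1, u'février': 2, u'mars': 3, u'avril': 4,
    u'mai': 5, u'juin': 6, u'juillet': 7, u'août': 8,
    u'septembre': 9, u'octobre': 10, u'novembre': 11, u'décembre': 12,
}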