def parse(self, response):
    """Parse a movie list page: extract name/type/time/grade for each
    <dd> entry and follow the movie's detail link.

    Locals renamed from `type`/`time` so the builtins are not shadowed;
    dead commented-out code removed.
    """
    self.items = []
    html = etree.HTML(response.text)
    dls = html.xpath('//*/dd')
    # Skip the first 10 <dd> nodes (non-movie entries on this page).
    for dl in dls[10:]:
        name = dl.xpath('./div[1]/div[2]/a/div/div[1]/span/text()')[0]
        movie_type = dl.xpath('./div[1]/div[2]/a/div/div[2]/text()')[1].strip()
        release_time = dl.xpath('./div[1]/div[2]/a/div/div[4]/text()')[1].strip()
        # The score is rendered as two <i> elements: integer part + fraction.
        integer = dl.xpath(
            './div[1]/div[2]/a/div/div[1]/span[2]/i[1]/text()')[0]
        fraction = dl.xpath(
            './div[1]/div[2]/a/div/div[1]/span[2]/i[2]/text()')[0]
        grade = f'{integer}{fraction}'
        href = dl.xpath("./div[1]/a/@href")[0]
        item = SpidersItem()
        item["name"] = name
        item["type"] = movie_type
        item["grade"] = grade
        item["time"] = release_time
        url = f'https://maoyan.com{href}'
        yield scrapy.Request(url=url,
                             meta={"item": item},
                             callback=self.parse_detail)
示例#2
0
 def parse2(self, response):
     """Extract title/type/showtime for the first ten movies and yield
     one SpidersItem per movie; any failure is printed, not raised."""
     try:
         print('parse2')
         hover_blocks = Selector(
             response=response).xpath('//div[@class="movie-hover-info"]')
         # Limit the crawl to the top ten entries.
         for block in hover_blocks[:10]:
             item = SpidersItem()
             item['mtitle'] = block.xpath('./div[1]/@title').extract_first()
             item['mtype'] = block.xpath(
                 './div[2]/text()[2]').extract_first().strip()
             item['mshowtime'] = block.xpath(
                 './div[4]/text()[2]').extract_first().strip()
             yield item
     except Exception as e:
         print(e)
示例#3
0
 def parse(self, response):
     """Parse one listing page, yielding at most the number of movies
     still owed for this page (movie_num minus the earlier pages' 30)."""
     current_page = response.meta['page']
     remaining = self.movie_num - (current_page - 1) * 30
     page_sel = Selector(response=response)
     try:
         for idx, block in enumerate(
                 page_sel.xpath('//div[@class="movie-hover-info"]')):
             if idx == remaining:
                 break
             item = SpidersItem()
             name = None
             category = None
             show_time = None
             for row in block.xpath('./div'):
                 name = row.xpath('./@title').extract_first()
                 texts = row.xpath('./text()').extract()
                 label = row.xpath('./span/text()').extract_first()
                 # The label span identifies which field this row carries.
                 if label == '类型:':
                     category = texts[1].strip()
                 elif label == '上映时间:':
                     show_time = texts[1].strip()
             item['movie_name'] = name
             item['movie_type'] = category
             item['movie_time'] = show_time
             yield item
     except Exception as e:
         print(e)
示例#4
0
    def parse(self, response):
        """Yield name/type/time items for up to the first ten movies.

        Iterates a slice instead of indexing with `range(10)`, so a page
        with fewer than ten movies no longer raises IndexError; leftover
        per-field debug prints removed.
        """
        print(response.url)

        movies = Selector(
            response=response).xpath("//div[@class='movie-hover-info']")
        for movie in movies[:10]:
            rows = movie.xpath("./div[contains(@class,'movie-hover-title')]")
            res_name = rows[0].xpath("./span[1]/text()").get()
            # Text node [1] skips the leading label span's whitespace.
            res_type = rows[1].xpath("./text()").extract()[1].strip()
            res_time = rows[3].xpath("./text()").extract()[1].strip()

            item = SpidersItem()
            item['name'] = res_name
            item['m_type'] = res_type
            item['m_time'] = res_time

            yield item
示例#5
0
    def parse(self, response):
        """Scrape title/content/score/info for each movie on the list
        page, then follow the pagination until start > 225.

        A fresh SpidersItem is created per movie: the original reused a
        single instance for every yield, so all yielded items aliased
        one object and later movies overwrote earlier ones.
        """
        movies = response.xpath("//div[@class='info']")

        for each in movies:
            item = SpidersItem()
            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()'
                               ).extract_first()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract_first()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()'
                              ).extract_first()

            item['title'] = title
            item['content'] = ';'.join(content)
            item['score'] = score
            item['info'] = info

            yield item

        # Follow the next page in steps of 25 up to offset 225.
        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end,
                                 callback=self.parse)
示例#6
0
    def parse(self, response):
        """Yield name/movietype/time for the first ten hover-info blocks.

        The large commented-out BeautifulSoup implementation was removed.
        Text-node indices are positional: [4] is the type line and [-1]
        the release date (page-layout dependent).
        """
        select = Selector(response=response)
        movies = select.xpath('//div[@class="movie-hover-info"]')
        for movie in movies[:10]:
            item = SpidersItem()
            name = movie.xpath('./div/@title')[0]
            movietype = movie.xpath('./div/text()')[4]
            time = movie.xpath('./div/text()')[-1]
            item['name'] = name.extract()
            item['movietype'] = movietype.extract().strip()
            item['time'] = time.extract().strip()
            yield item
示例#7
0
 def parse(self, Response):
     """Collect name/type/show-time items for the top-10 movies and
     return them as a list.

     NOTE(review): the parameter is conventionally spelled `response`;
     kept as `Response` so the callback signature is unchanged. The bare
     `except:` was narrowed to `except Exception` so SystemExit and
     KeyboardInterrupt still propagate, and the error is now reported.
     """
     print('scrapy is parsing')
     print(Response.url)
     items = []
     movies = Selector(
         response=Response).xpath('//div[@class="movie-hover-info"]')
     try:
         for movie in movies[:10]:
             name = movie.xpath(
                 './div[1]/span[@class="name "]/text()').get()
             movie_type = movie.xpath('./div[2]/text()')[1].get().strip()
             show_time = movie.xpath('./div[3]/text()')[1].get().strip()
             item = SpidersItem()
             item['name'] = name
             item['movie_type'] = movie_type
             item['show_time'] = show_time
             items.append(item)
     except Exception as e:
         print('an error occurred', e)
     finally:
         print('items are ', items)
     return items
示例#8
0
def get_spiders_item(sel, fields, item=None):
    """Fill an item from a list of field specs against a selector.

    Each spec in *fields* is a dict with keys:
      - 'name': item field to assign
      - 'type': 'xpath' for an XPath query, anything else for CSS
      - 'query': base selector expression
      - 'extract_type': 'text' for text content, otherwise attribute mode
      - 'attribute': attribute name (only when extract_type != 'text')

    Returns the (possibly newly created) item.
    """
    if item is None:
        item = SpidersItem()

    for f in fields:
        if f['type'] == 'xpath':
            if f['extract_type'] == 'text':
                # text content
                query = f['query'] + '/text()'
            else:
                # '/@name' is the XPath attribute step; the original built
                # '/@("name")', which is not valid XPath and always failed.
                attribute = f["attribute"]
                query = f['query'] + f'/@{attribute}'
            item[f['name']] = sel.xpath(query).extract_first()

        else:
            if f['extract_type'] == 'text':
                # text content
                query = f['query'] + '::text'
            else:
                # parsel's CSS attribute pseudo-element: '::attr(name)'.
                attribute = f["attribute"]
                query = f['query'] + f'::attr({attribute})'
            item[f['name']] = sel.css(query).extract_first()

    return item
示例#9
0
    def parse(self, response):
        """Extract title + link for up to ten '.hd' blocks and follow
        each link to parse2.

        Iterates a slice instead of `range(10)` indexing, avoiding an
        IndexError on pages with fewer than ten blocks; the large mass
        of commented-out debug code was removed.
        """
        print(response.url)

        movies = Selector(response=response).xpath('//div[@class="hd"]')
        for movie in movies[:10]:
            item = SpidersItem()
            title = movie.xpath('./a/span/text()')
            link = movie.xpath('./a/@href')
            item['title'] = title.extract_first().strip()
            item['link'] = link.extract_first().strip()
            yield scrapy.Request(url=item['link'],
                                 meta={'item': item},
                                 callback=self.parse2)
示例#10
0
 def parse(self, response):
     """Yield a title/link item for every '.hd' block on the page.

     The commented-out pymysql connectivity check (dead code with
     hard-coded credentials) was removed.
     """
     movies = Selector(response=response).xpath('//div[@class="hd"]')
     for movie in movies:
         item = SpidersItem()
         item['title'] = movie.xpath('./a/span/text()').extract_first().strip()
         item['link'] = movie.xpath('./a/@href').extract_first().strip()
         yield item
示例#11
0
    def parse(self, response):
        """Follow the detail links of the first ten film-channel blocks,
        attaching hard-coded session cookies to each request.

        Iterates a slice instead of `range(10)` indexing (no IndexError
        on short pages); debug prints removed.
        """
        movies = Selector(
            response=response).xpath('//div[@class="movie-item film-channel"]')
        # Hard-coded session cookies — presumably needed to get past the
        # site's anti-crawler check; verify they are still valid.
        cookies = {
            'uuid': '66a0f5e7546b4e068497.1542881406.1.0.0',
            '_lxsdk_cuid':
            '1673ae5bfd3c8-0ab24c91d32ccc8-143d7240-144000-1673ae5bfd4c8',
            '__mta': '222746148.1542881402495.1542881402495.1542881402495.1',
            'ci': '20',
            'rvct': '20%2C92%2C282%2C281%2C1',
            '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
            '_lxsdk_s': '1674f401e2a-d02-c7d-438%7C%7C35'
        }

        for movie in movies[:10]:
            item = SpidersItem()
            link = "https://maoyan.com" + movie.xpath('./a/@href').extract_first()
            yield scrapy.Request(url=link,
                                 meta={'item': item},
                                 cookies=cookies,
                                 callback=self.parse2)
示例#12
0
 def parse(self, response):
     """Yield title/types/date items for the first ten hover-info blocks."""
     hover_infos = Selector(
         response=response).xpath('//div[@class="movie-hover-info"]')
     for info in hover_infos[:10]:
         item = SpidersItem()
         type_texts = info.xpath('./div[2]/text()').extract()
         date_texts = info.xpath('./div[4]/text()').extract()
         item['title'] = info.xpath('./div[1]/span[1]/text()').extract_first()
         # Text node [1] skips the leading label span.
         item['types'] = type_texts[1].strip()
         item['date'] = date_texts[1].strip()
         yield item
示例#13
0
 def parse(self, response):
     """Follow every '.hd' anchor on the page, storing the raw link in
     the item before requesting the detail page."""
     doc = lxml.etree.HTML(response.text)
     for link in doc.xpath('//*[@class="hd"]/a/@href'):
         item = SpidersItem()
         item['link'] = str(link)
         yield scrapy.Request(url=link, meta={'item': item},
                              callback=self.parse2)
示例#14
0
 def parse(self, response):
     """Extract title/link per '.hd' block and follow each link.

     The original also appended every item to a local `items` list that
     was never read; that dead accumulation is dropped.
     """
     movies = Selector(response).xpath('//div[@class="hd"]')
     for movie in movies:
         item = SpidersItem()
         item['title'] = movie.xpath('./a/span/text()').extract_first()
         item['link'] = movie.xpath('./a/@href').extract_first()
         yield scrapy.Request(url=item['link'], meta={'item': item},
                              callback=self.parse2)
示例#15
0
 def parse(self, response):
     """Follow the detail pages of the first ten listed movies, carrying
     the absolute link on the item."""
     title_blocks = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for block in title_blocks[:10]:
         relative = block.xpath('./a/@href').extract_first().strip()
         item = SpidersItem()
         item['link'] = 'https://maoyan.com' + relative
         yield scrapy.Request(url=item['link'],
                              meta={'item': item},
                              callback=self.parse2)
示例#16
0
 def parse(self, response):
     """Walk the movie-title links and read title/category/date from the
     brief-container block.

     Fixes over the original: `movies.response.xpath` (a SelectorList
     has no `.response`), the `xpatn` typo, the missing `@` in
     `[class=...]`, and `.get().extract()` (extract() called on a str).
     NOTE(review): the extracted values are still never stored on the
     item — matching the original's apparently unfinished behavior.
     """
     movies = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     link = movies.xpath('./a/@href')
     for i in link:
         i1 = Selector(response=response).xpath(
             '//div[@class="movie-brief-container"]')
         # Defined in items.py
         item = SpidersItem()
         title = i1.xpath('./div[1]/text()').get()
         ca = i1.xpath('./div[3]/text()').get()
         date = i1.xpath('./div[4]/text()').get()
         yield item
示例#17
0
 def parse(self, response):
     """Parse '.hd' blocks with BeautifulSoup and follow each movie link."""
     soup = BeautifulSoup(response.text, 'html.parser')
     hd_blocks = soup.find_all('div', attrs={'class': 'hd'})
     for block in hd_blocks:
         item = SpidersItem()
         anchor = block.find('a')
         item['title'] = anchor.find('span', ).text
         item['link'] = anchor.get('href')
         yield scrapy.Request(url=item['link'],
                              meta={'item': item},
                              callback=self.parse2)
示例#18
0
 def parse(self, response):
     """Yield movie_name/movie_type/movie_time for the first ten
     hover-info blocks."""
     top_ten = Selector(response=response).xpath(
         '//div[@class="movie-hover-info"]')[:10]
     for block in top_ten:
         name = block.xpath('./div[1]/span[1]/text()').extract_first()
         mtype = block.xpath('./div[2]/text()[2]').extract_first().strip()
         mtime = block.xpath('./div[4]/text()[2]').extract_first().strip()
         item = SpidersItem()
         item['movie_name'] = name
         item['movie_type'] = mtype
         item['movie_time'] = mtime
         yield item
示例#19
0
 def parse_details(self, response):
     """Parse a movie detail page: title, stripped category list, and
     the release time from the brief-container block."""
     brief = Selector(response=response).xpath(
         '//div[@class="movie-brief-container"]')
     item = SpidersItem()
     item['title'] = brief.xpath('./h1/text()').get()
     raw_categories = brief.xpath('.//a/text()').getall()
     item['categories'] = [c.strip() for c in raw_categories]
     item['release_time'] = brief.xpath('.//li[last()]/text()').get()
     yield item
示例#20
0
 def parse(self, response):
     """Yield film name/type/date for the first ten hover-info blocks.

     The unused `et_html = et.HTML(response.text)` parse was removed —
     the method only ever used response.xpath.
     """
     for selector in response.xpath(
             '//div[@class="movie-hover-info"]')[:10]:
         item = SpidersItem()
         item['film_name'] = selector.xpath(
             './div[1]/span[1]/text()').extract_first().strip()
         item['film_type'] = selector.xpath(
             './div[2]/text()[2]').extract_first().strip()
         item['file_date'] = selector.xpath(
             './div[4]/text()[2]').extract_first().strip()
         yield item
示例#21
0
 def parse(self, response):
     """Yield name/type/time for up to ten film-channel blocks.

     Iterates a slice rather than indexing with `range(10)`, so pages
     with fewer than ten movies no longer raise IndexError.
     """
     movie_list = Selector(
         response=response).xpath('//div[@class="movie-item film-channel"]')
     for movie in movie_list[:10]:
         item = SpidersItem()
         item['movie_name'] = movie.xpath(
             './/span[contains(@class,"name")]/text()').extract_first()
         # Text siblings of the 'hover-tag' label spans: [1] is the type
         # line, [5] the release-date line (positional, layout-dependent).
         type_date = movie.xpath(
             './/span[@class="hover-tag"]/../text()').extract()
         item['movie_type'] = type_date[1].strip('\n').strip()
         item['movie_time'] = type_date[5].strip('\n').strip()
         yield item
示例#22
0
 def parse(self, response):
     """Extract film name + absolute link per title block, then request
     the detail page."""
     base_url = 'https://maoyan.com'
     title_blocks = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for block in title_blocks:
         item = SpidersItem()
         # Anchor text is the film name; the href is site-relative.
         item['film_name'] = block.xpath('./a/text()').extract_first().strip()
         relative = block.xpath('./a/@href').extract_first().strip()
         item['link'] = base_url + relative
         yield scrapy.Request(url=item['link'],
                              meta={'item': item},
                              callback=self.parse2)
示例#23
0
 def parse(self, response):
     """Follow every movie-item-hover link as an absolute maoyan URL.

     The three leftover debug prints of the link list were removed, and
     the redundant `str(...)` wrapper dropped (str + str is already str).
     """
     selector = lxml.etree.HTML(response.text)
     links = selector.xpath('//*[@class="movie-item-hover"]/a/@href')
     for link in links:
         item = SpidersItem()
         item['link'] = 'https://maoyan.com' + link
         yield scrapy.Request(url=item['link'],
                              meta={'item': item},
                              callback=self.parse2)
示例#24
0
 def parse(self, response):
     """Yield name/movie_type/time for the first ten hover-info blocks.

     Text-node indices are positional: [4] is the type line and [-1] the
     release date. The leftover debug print (which re-ran the same
     extractions) was removed.
     """
     select = Selector(response=response)
     movies = select.xpath('//div[@class="movie-hover-info"]')
     for movie in movies[:10]:
         item = SpidersItem()
         item['name'] = movie.xpath('./div/@title')[0].extract()
         item['movie_type'] = movie.xpath('./div/text()')[4].extract().strip()
         item['time'] = movie.xpath('./div/text()')[-1].extract().strip()
         yield item
示例#25
0
 def parse(self, response):
     """Collect name+href items from every anchor inside '.hd' blocks
     and return them as a list."""
     collected = []
     soup = bs(response.text, 'html.parser')
     for block in soup.find_all('div', {'class': 'hd'}):
         for anchor in block.find_all('a'):
             # One item per anchor, using the provided Item class.
             item = SpidersItem()
             item["name"] = anchor.find('span').text
             item["href"] = anchor.get('href')
             collected.append(item)
     return collected
示例#26
0
 def parse(self, response):
     """Return title/movie_type/date items for every hover-info block."""
     results = []
     hover_blocks = Selector(
         response=response).xpath('//div[@class="movie-hover-info"]')
     for block in hover_blocks:
         item = SpidersItem()
         item['title'] = block.xpath('./div/span/text()').get().strip()
         text_nodes = block.xpath('./div/text()')
         # Positional text nodes: [4] carries the type, [-1] the date.
         item['movie_type'] = text_nodes[4].get().strip()
         item['date'] = text_nodes[-1].get().strip()
         results.append(item)
     return results
示例#27
0
    def parse(self, response):
        """Yield title/type/time dicts for the first ten hover blocks.

        Fixes over the original: the `SpidersItem()` instance was created
        and then immediately discarded (a plain dict was yielded in its
        place), and the manual `i` counter is replaced by a slice.
        """
        movies = Selector(response).xpath('//div[@class="movie-item-hover"]')
        for movie in movies[:10]:
            title = movie.xpath(
                './a/div/div[1]/span[1]/text()').extract()[0].strip()
            t = movie.xpath('./a/div/div[2]//text()').extract()[2].strip()
            timet = movie.xpath('./a/div/div[4]//text()').extract()[2].strip()
            yield {'title': title, 'type': t, 'time': timet}
示例#28
0
 def parse(self, response):
     """Yield film_name/film_type/plan_date for the first ten movies.

     The manual `i` counter with a break is replaced by slicing the
     selector list — equivalent and clearer.
     """
     movies = Selector(response=response).xpath(
         '//div[@class="movie-hover-info"]')
     for movie in movies[:10]:
         item = SpidersItem()
         item['film_name'] = movie.xpath(
             './div[1]/span/text()').extract()[0].strip()
         # Text node [1] skips the leading label span.
         item['film_type'] = movie.xpath(
             './div[2]/text()').extract()[1].strip()
         item['plan_date'] = movie.xpath(
             './div[4]/text()').extract()[1].strip()
         yield item
示例#29
0
 def parse(self, response):
     """Follow the detail links of the first ten listed movies.

     Each request now carries its own SpidersItem: the original shared a
     single item instance across every request's meta, so the parse2
     callbacks would all mutate the same object. The manual counter is
     replaced by a slice and the placeholder-free f-string dropped.
     """
     movies = Selector(response=response).xpath(
         '//div[@class="channel-detail movie-item-title"]')
     for movie in movies[:10]:
         url = 'https://maoyan.com' + movie.xpath(
             './a/@href').extract_first()
         yield scrapy.Request(url=url,
                              meta={'item': SpidersItem()},
                              callback=self.parse2)
示例#30
0
    def parse(self, response):
        """Yield movie_name/movie_type/release_time for the first ten
        hover-info blocks."""
        hover_blocks = Selector(response=response).xpath(
            '//div[@class="movie-hover-info"]')[:10]
        for block in hover_blocks:
            content = SpidersItem()
            content['movie_name'] = block.xpath(
                './div[1]/span[1]/text()').extract()[0]
            content['movie_type'] = block.xpath(
                './div[2]/text()[2]').extract()[0].strip()
            content['release_time'] = block.xpath(
                './div[4]/text()[2]').extract()[0].strip()
            yield content