Example #1
0
File: movie.py  Project: 51fx/Python004
    def parse(self, response):
        """Parse a movie-list page into Homework2Item objects.

        Extracts movie name, type and release time via XPath, strips
        spaces/newlines from the type and time fields, and returns up
        to 10 items.
        """
        sel = Selector(response=response)
        movie_name_list = sel.xpath(
            '//dd/div[1]/div[2]/a/div/div[1]/span[1]/text()').extract()
        movie_type_list = sel.xpath(
            '//dd/div[1]/div[2]/a/div/div[2]/text()').extract()
        movie_time_list = sel.xpath(
            '//dd/div[1]/div[2]/a/div/div[4]/text()').extract()

        def _clean(values):
            # Strip spaces/newlines per string and drop empty entries.
            # Replaces the original eval(str(...).replace(...)) round-trip,
            # which was fragile and unsafe, and the in-place list.remove()
            # during iteration, which skipped elements and removed the
            # wrong values from the time list.
            stripped = [v.replace(' ', '').replace('\n', '') for v in values]
            return [v for v in stripped if v]

        movie_type_list = _clean(movie_type_list)
        movie_time_list = _clean(movie_time_list)

        # zip() stops at the shortest list, so short pages no longer
        # raise IndexError as the hard-coded range(10) could.
        items = []
        triples = list(zip(movie_name_list, movie_type_list, movie_time_list))
        for name, mtype, mtime in triples[:10]:
            items.append(Homework2Item(movie_name=name,
                                       movie_type=mtype,
                                       movie_time=mtime))
        return items
Example #2
0
def scrapyContent(filename):
    """Parse a saved HTML page and print item names and detail URLs.

    Reads *filename*, extracts up to 60 item names (from image ``alt``
    attributes, which avoid the stray spaces found in the detail-link
    text) and up to 60 item detail URLs, printing each.
    """
    # with-statement guarantees the handle is closed (the original
    # leaked it).
    with open(filename, 'r') as f:
        body = f.read()

    itemName = Selector(
        text=body).xpath("/html/body/div/div[3]//img//@alt").extract()
    imageUrlList = Selector(
        text=body).xpath('//img/@data-ks-lazyload').extract()
    itemUrlList = Selector(
        text=body).xpath("/html/body/div/div[3]//dt/a//@href").extract()

    # Use the image names because the detail-link text may contain
    # many spaces.
    for name in itemName[0:60]:
        print(name.replace('\\"', ""))

    # Image download is intentionally disabled; kept for reference.
    """
    j = 0
    for i in imageUrlList:
        filename=i.replace('\\"',"")[2:]
        print("http://"+filename)
        saveImage("http://"+filename,"image/"+str(j)+".jpg")
        j+=1
    """

    # Filter via a comprehension: the original called list.remove()
    # while iterating the same list, which skips the element after
    # each removal.  Raw hrefs look like '\"//host/…', so a valid
    # entry has '//' at positions 2:4.
    itemUrlList = [u for u in itemUrlList if u[2:4] == "//"]
    for u in itemUrlList[0:60]:
        itemUrl = "https:" + u.replace('\\"', "")
        print(itemUrl)
 def getAdvisorData(self, response):
     """Extract paired tokens from a wiki page as two-element lists.

     Selects tokens via the configured ``self.nameAndLinks`` XPath,
     drops known MediaWiki artifact tokens ('mw-redirect', 'new'),
     and pairs the remaining tokens, swapping each (even, odd) pair
     into [odd, even] order — presumably [name, link]; verify against
     the XPath in self.nameAndLinks.
     """
     nameLink = Selector(text=response.text).xpath(self.nameAndLinks).extract()
     # list.remove raises ValueError when the token is absent; that is
     # expected and deliberately ignored (the original used a bare
     # except, which also hid real errors).
     for artifact in (u'mw-redirect', "new"):
         try:
             nameLink.remove(artifact)
         except ValueError:
             pass
     # Tokens alternate: pair evens with odds, emitting [odd, even].
     return [[second, first]
             for first, second in zip(nameLink[::2], nameLink[1::2])]
    def parse_comments(self, response):
        """Parse a Weibo comments/forwards JSON response into a CommentsItem.

        The response body is JSON whose ``data.html`` field holds an HTML
        fragment; authors, comment texts, timestamps and like counts are
        extracted from it.  Yields a deep copy so later mutation of the
        shared item does not corrupt already-emitted results.
        """
        item = CommentsItem()
        item['id'] = response.meta['id']
        item['flag'] = response.meta['flag']
        item['author'] = []
        item['author_comment'] = []
        item['time'] = []

        # NOTE: the original passed encoding='utf-8' to json.loads; that
        # keyword was removed in Python 3.9 and raises TypeError there.
        restojson = json.loads(response.text)
        html = restojson['data']['html']
        # Drop literal backslash-n sequences embedded in the fragment
        # (equivalent to the original split('\\n') + join).
        html = html.replace('\\n', '')

        author_comments = Selector(
            text=html).xpath('//*[@class="WB_text"]').extract()

        for author_comment in author_comments:
            # The first <a> text node is the comment author; extract it
            # once instead of running the same XPath twice.
            author = Selector(
                text=author_comment).xpath('//a/text()').extract()[0]
            item['author'].append(author)

            author_comment = dealcontent(author_comment)
            comment = Selector(text=author_comment).xpath('//text()').extract()
            comment.remove(author)
            comment = ''.join(comment)
            # Strip surrounding spaces only when the text starts with one,
            # matching the net effect of the original while/re.match loop.
            if comment.startswith(' '):
                comment = comment.strip(' ')
            item['author_comment'].append(comment)

        if item['flag'] == 'forwarded':
            item['time'] = Selector(text=html).xpath(
                '//*[@class="WB_from S_txt2"]/a/@title').extract()
        if item['flag'] == 'comment':
            item['time'] = Selector(text=html).xpath(
                '//*[@class="WB_from S_txt2"]/text()').extract()

        # The original '////span' prefix is not a valid XPath location
        # path; '//' is the intended descendant axis.
        item['like_count'] = [
            count.replace('赞', '0')
            for count in Selector(text=html).xpath(
                '//span[@node-type="like_status"]/em[2]/text()').extract()
        ]

        yield copy.deepcopy(item)