Example #1
    def detailParse(self, response):

        # Item handed over from the listing callback via request meta
        item = response.meta['item']

        # Pull the summary text from the detail page
        fcontent = Selector(response=response).xpath(
            '//span[@property="v:summary"]/text()')

        # Default to '' so .strip() cannot fail when nothing matches
        item['fcontent'] = fcontent.extract_first(default='').strip()

        yield item
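The `response.meta['item']` access implies a listing callback attached a partially filled item to the request. A minimal sketch of that wiring; the spider name, listing URL, and listing XPath are assumptions, not from the original:

```python
import scrapy
from scrapy import Selector


class MovieSpider(scrapy.Spider):  # hypothetical spider hosting detailParse
    name = 'movie_detail'
    start_urls = ['https://movie.douban.com/top250']  # assumed listing page

    def parse(self, response):
        # Assumed listing-page XPath; adjust to the real markup
        for href in response.xpath('//div[@class="hd"]/a/@href').extract():
            item = {}  # stand-in for the project's scrapy.Item subclass
            # Attach the item so detailParse can retrieve it via meta
            yield scrapy.Request(href, callback=self.detailParse,
                                 meta={'item': item})

    def detailParse(self, response):
        item = response.meta['item']
        item['fcontent'] = Selector(response=response).xpath(
            '//span[@property="v:summary"]/text()'
        ).extract_first(default='').strip()
        yield item
```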
Example #2
File: spider.py Project: onetwo1/my
    def get_all_url(self):
        # Collect every listing URL from the paginated "my posts" pages
        zero_page_url = 'http://www.baixing.com/w/posts/myPosts/all?page=0'
        self.login()
        zero_page_response = self.session.get(zero_page_url,
                                              headers=self.headers)
        # Read the total page count from the pagination bar
        page_num = Selector(text=zero_page_response.text).xpath(
            '//ol[@class="page-nav"]/li[last()-1]/a/text()')
        if page_num:
            page_num = int(page_num.extract_first())
            print('page -0- fetched successfully')
            self.file.write('page -0- fetched successfully\n')
        else:
            page_num = 1  # no pagination bar: only page 0 exists
        # Collect all listing URLs on the current page
        zero_id_list = Selector(text=zero_page_response.text).xpath(
            '//*[@id="posts-ad-items"]/li/a/@href').extract()
        for id_ in zero_id_list:
            self.file.write(id_)
            self.file.write('\n')

        # Drop the trailing '0' so the page index can be appended
        init_url = zero_page_url[:-1]
        for index in range(1, page_num):
            url_ = init_url + str(index)
            # Fetch the page
            res = self.session.get(url_, headers=self.headers)
            # Parse the URLs
            if res.status_code == 200:
                print('page -{}- fetched successfully'.format(index))
                id_list = Selector(text=res.text).xpath(
                    '//*[@id="posts-ad-items"]/li/a/@href').extract()
                # Write to file
                self.file.write('page -{}- fetched successfully'.format(index))
                self.file.write('\n')
                for id_ in id_list:
                    self.file.write(id_)
                    self.file.write('\n')
            else:
                print('page -{}- request failed'.format(index))
                self.file.write('page -{}- request failed'.format(index))
                self.file.write('\n')
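The URL-slicing trick above only works because the seed URL ends in `page=0`. A slightly more robust standalone sketch of the same pagination loop, letting `requests` build the query string; the function name and its standalone shape are assumptions:

```python
import requests
from parsel import Selector  # scrapy's Selector exposes the same API


def fetch_listing_urls(session: requests.Session, headers: dict) -> list:
    """Hypothetical standalone version of the pagination loop above."""
    base_url = 'http://www.baixing.com/w/posts/myPosts/all'
    urls = []
    first = session.get(base_url, params={'page': 0}, headers=headers)
    nav = Selector(text=first.text).xpath(
        '//ol[@class="page-nav"]/li[last()-1]/a/text()').extract_first()
    page_count = int(nav) if nav else 1
    for index in range(page_count):
        res = session.get(base_url, params={'page': index}, headers=headers)
        if res.status_code != 200:
            continue  # skip failed pages instead of aborting the whole run
        urls.extend(Selector(text=res.text).xpath(
            '//*[@id="posts-ad-items"]/li/a/@href').extract())
    return urls
```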
Example #3
def get_userItem(body, user_bit):
    """
    :type body: str
    :param body: HTML of the user's profile page
    :return: item: item carrying the user's profile fields
             concern_link: link to the users this user follows
             fans_link: link to this user's fans (followers)
    """
    item = items.userItem()

    # Extract the user ID
    user_id = int(get_userID_from_mainPage(body))
    item['id'] = user_id

    # Extract the user name
    p_name = re.compile(r'\$CONFIG\[\'onick\'\]=\'(.*)\'')
    try:
        name_ = p_name.search(body).group(1)
        item['name'] = name_
    except AttributeError:  # search() returned None: name not on this page
        item['name'] = ''

    # Determine whether the user is verified (carries a "V" badge)
    script_heads = Selector(text=body).xpath('//script/text()').extract()
    head_body = ''
    content_body = ''
    num_body = ''
    relation_body = ''

    for script_head in script_heads:
        p_head = re.compile(r'^\s*FM\.view\(\{(.*)\}\)', re.M)
        p_head_s = p_head.search(
            script_head.replace('\n', '').replace('\t', ''))

        if p_head_s:
            json_head = '{' + p_head_s.group(1) + '}'
            data = json.loads(json_head)
            if data.get('domid'):
                if 'Pl_Official_Headerv6' in data.get('domid'):
                    # This block carries the verified badge and the gender
                    head_body = data.get('html')
                elif 'Pl_Core_UserInfo' in data.get('domid'):
                    # This block carries the verified description, address,
                    # and birthday
                    content_body = data.get('html')
                elif 'Pl_Core_T8CustomTriColumn' in data.get('domid'):
                    num_body = data.get('html')
                elif 'Pl_Core_UserGrid' in data.get('domid'):
                    relation_body = data.get('html')
    if head_body:
        v = Selector(text=head_body).xpath(
            "//em[@class='W_icon icon_pf_approve']").extract_first()
        v_co = Selector(text=head_body).xpath(
            "//em[contains(@class, 'icon_pf_approve_co')]").extract_first()
        if v:
            item['is_v'] = 1  # individual
        elif v_co:
            item['is_v'] = 2  # organization
        else:
            item['is_v'] = 0

        # Extract the gender
        female = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_female']").extract_first()
        male = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_male']").extract_first()
        if female:
            item['sex'] = 1  # female
        elif male:
            item['sex'] = 2  # male
        else:
            item['sex'] = 0  # unknown or parse error

        # Extract the verified-account description
        v_des = Selector(
            text=head_body).xpath("//div[@class='pf_intro']/text()")
        if v_des:
            item['v_des'] = v_des.extract_first().strip()
        else:
            item['v_des'] = ''
    else:
        item['is_v'] = -1  # parse error
        item['sex'] = 0
        item['v_des'] = ''

    # Extract the Weibo user level
    if content_body:
        level_ = Selector(text=content_body).xpath(
            "//a[contains(@class,'W_icon_level')]/span/text()"
        ).extract_first()
        if level_:
            item['level'] = int(level_.strip('Lv. '))  # e.g. 'Lv.12' -> 12
        else:
            item['level'] = -1

        # Extract the address and birthday
        item['address'] = ''
        item['birthday'] = ''
        contents = Selector(text=content_body) \
            .xpath("//div[@class='detail']/ul[@class='ul_detail']/li[@class='item S_line2 clearfix']")
        for content in contents:
            icon = content.xpath(
                "./span[contains(@class, 'item_ico W_f')]/em/@class"
            ).extract_first()
            if icon == 'W_ficon ficon_cd_place S_ficon':
                addr = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()"
                ).extract_first().strip()
                item['address'] = addr
            elif icon == 'W_ficon ficon_constellation S_ficon':
                birthday = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()"
                ).extract_first().strip()
                item['birthday'] = birthday
    else:
        item['level'] = -1
        item['address'] = ''
        item['birthday'] = ''

    # Extract the counts of followed users, fans, and posts
    concern_link = ''
    fans_link = ''

    if num_body:
        num_infos = Selector(
            text=num_body).xpath("//a[@class='t_link S_txt1']")
        has_link = False
        if num_infos:
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class, 'W_f')]/text()"
            has_link = True
        else:
            num_infos = Selector(text=num_body).xpath("//td[@class='S_line1']")
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class,'W_f')]/text()"

        for num_info in num_infos:
            info = num_info.xpath(text_xpath).extract_first()
            num = num_info.xpath(num_xpath).extract_first()
            if num:
                num = int(num.strip())
            else:
                num = -1

            if info == '关注':  # "following"
                item['concern_num'] = num
                if has_link:
                    concern_link = num_info.xpath("./@href").extract_first()
            elif info == '粉丝':  # "fans"
                item['fans_num'] = num
                if has_link:
                    fans_link = num_info.xpath("./@href").extract_first()
            elif info == '微博':  # "posts"
                item['blog_num'] = num

        # Fall back to the relations block for the follow and fan links
        if not has_link:
            relation_infos = Selector(text=relation_body) \
                .xpath(
                "//div[@class='obj_name']/h2[contains(@class, 'main_title W_fb W_f')]/a[contains(@class, 'S_txt')]")
            for relation in relation_infos:
                link = relation.xpath("./@href").extract_first()
                text = relation.xpath('./text()').extract_first()
                if text and '关注' in text:  # "following"
                    concern_link = link
                elif text and '粉丝' in text:  # "fans"
                    fans_link = link
    else:
        item['concern_num'] = -1  # unknown or parse error
        item['fans_num'] = -1
        item['blog_num'] = -1

    return item, concern_link, fans_link
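The heavy lifting in this example is unpacking Weibo's `FM.view({...})` script payloads, where each page block arrives as JSON carrying a `domid` and an `html` fragment. A standalone sketch of just that step; the function name and the toy payload are illustrative, not from the original project:

```python
import json
import re

from parsel import Selector  # scrapy's Selector exposes the same API

FM_VIEW = re.compile(r'^\s*FM\.view\(\{(.*)\}\)', re.M)


def extract_fm_view_blocks(body):
    """Map each FM.view() payload's domid to its embedded HTML fragment."""
    blocks = {}
    for script in Selector(text=body).xpath('//script/text()').extract():
        match = FM_VIEW.search(script.replace('\n', '').replace('\t', ''))
        if not match:
            continue
        data = json.loads('{' + match.group(1) + '}')
        if data.get('domid') and data.get('html'):
            blocks[data['domid']] = data['html']
    return blocks


# Toy payload shaped like a Weibo profile script block (illustrative only)
sample = ('<script>FM.view({"domid": "Pl_Official_Headerv6__1", '
          '"html": "<em class=\\"W_icon icon_pf_approve\\"></em>"})'
          '</script>')
print(extract_fm_view_blocks(sample))
```

Once the fragments are keyed by `domid`, each `get(...)` lookup in the function above becomes a simple dictionary access, and the per-block `Selector(text=...)` parsing proceeds as shown.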