Example #1
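A Scrapy callback that reads a city slug from response.meta, pulls a count out of the page heading with an XPath Selector, and prints a life,city,count CSV line.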
    def parse_city_life(self, response):
        city = response.meta['city'].strip('/')
        # print("city:", city)
        data = Selector(response).xpath(
            '//div[@class="hd"]/h1/span/a/text()').extract()[0].strip()
        # the heading reads like "NNN家"; strip the trailing counter character
        data = data.strip(u'家')
        print(u'{},{},{}'.format('life', city, data))
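Note that .extract()[0] raises IndexError when the XPath matches nothing; extract_first accepts a default instead. A minimal standalone sketch (the HTML literal is invented for illustration):

from scrapy.selector import Selector

html = '<div class="hd"><h1><span><a>123家</a></span></h1></div>'
data = Selector(text=html).xpath(
    '//div[@class="hd"]/h1/span/a/text()').extract_first('').strip()
# data == '123家'; an empty string when nothing matches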
Example #2
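A callback that fills a JishuxItem from metadata carried in response.meta, flattens the summary HTML to plain text with the string(.) XPath, and derives a description and keywords from that text.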
    def parse_post(self, response):
        crawl_time = None
        item = JishuxItem()
        item['post_url'] = response.meta['item']['post_url']
        item['post_title'] = response.meta['item']['post_title']
        item['_id'] = response.meta['item']['_id']
        item['post_type'] = response.meta['item']['post_type']
        conf = response.meta['conf']
        # post_time = re.search(
        #     '(20\d{2}([.\-/|年月\s]{1,3}\d{1,2}){2}日?(\s\d{2}:\d{2}(:\d{2})?)?)|(\d{1,2}\s?(分钟|小时|天)前)',
        #     response.text)
        # if post_time:
        #     crawl_time = generate_timestamp(post_time.group())
        content_html = get_summary(response, conf)
        # flatten the summary HTML to plain text via the string(.) XPath
        content_text = Selector(text=content_html).xpath('string(.)').extract_first()
        content_text = content_text.strip().replace('\r', '').replace('\n', '').replace('\t', '')
        description = get_description(content_text)
        keywords = get_keywords(response, content_text)
        item['content_html'] = content_html
        item['description'] = description
        item['keywords'] = keywords
        item['crawl_time'] = crawl_time if crawl_time else int(time.time())
        item['cn_name'] = conf['cn_name']
        item['author'] = ''  # TODO: article author; the config format still needs to support this
        # in debug mode, print the item instead of handing it to the pipelines
        yield item if not start_urls_config.get('debug') else print(item)
Example #3
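A small utility that converts summarized HTML content to whitespace-normalized plain text.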
def get_text(content):
    html = get_summary(content)
    content_text = Selector(text=html).xpath('string(.)').extract_first()
    content_text = content_text.strip().replace('\r', '').replace('\n', '').replace('\t', '')
    return content_text
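For reference, string(.) concatenates every text node under the selection, which is what flattens the HTML here; a minimal standalone sketch:

from scrapy.selector import Selector

html = '<div><p>Hello <b>world</b></p>\n<p>again</p></div>'
text = Selector(text=html).xpath('string(.)').extract_first()
# text == 'Hello world\nagain'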
Example #4
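Extracts the page title from a response, returning an empty string on any failure.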
def extract_title(response):
    try:
        # use response.text (decoded str); Selector(text=...) expects text, not the raw bytes in response.body
        title = Selector(
            text=response.text).xpath('//title/text()').extract()[0]
        title = title.strip()
        return title
    except Exception:
        return ""
Example #5
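The food-page counterpart of example #1: reads the restaurant count from a block title and prints a food,city,count line.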
    def parse_city_food(self, response):
        city = response.meta['city'].strip('/')
        data = Selector(response).xpath(
            '//div[@class="block popular-nav"]/div[@class="block-title"]/text()'
        ).extract()[0].strip()
        # the block title reads like "共NNN家餐厅" (NNN restaurants in total)
        data = data.strip(u'共').strip(u'家餐厅')
        # print(data)
        print('{},{},{}'.format('food', city, data))
Example #6
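Parses a Weibo profile page. The page embeds its layout blocks as FM.view({...}) JSON inside script tags, so the function extracts each block's HTML payload and then scrapes verification status, gender, level, address, birthday, and follow/follower/post counts from it.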
def get_userItem(body, user_bit):
    """
    :type body: str
    :param body: 用户主页网页
    :return:item: 用户信息item
            concern_link: 所关注用户的链接
            fans_link: 粉丝的链接
    """
    item = items.userItem()

    # Extract the user ID
    user_id = int(get_userID_from_mainPage(body))
    item['id'] = user_id

    # Extract the username
    p_name = re.compile(r'\$CONFIG\[\'onick\'\]=\'(.*)\'')
    try:
        name_ = p_name.search(body).group(1)
        item['name'] = name_
    except AttributeError:  # pattern not found
        item['name'] = ''

    # Is this a verified ("V"-badge) user?
    script_heads = Selector(text=body).xpath('//script/text()').extract()
    head_body = ''
    content_body = ''
    num_body = ''
    relation_body = ''

    for script_head in script_heads:
        p_head = re.compile(r'^\s*FM\.view\(\{(.*)\}\)', re.M)
        p_head_s = p_head.search(
            script_head.replace('\n', '').replace('\t', ''))

        if p_head_s:
            json_head = '{' + p_head_s.group(1) + '}'
            data = json.loads(json_head)
            if data.get('domid'):
                if 'Pl_Official_Headerv6' in data.get('domid'):
                    # this block carries the verified badge and gender
                    head_body = data.get('html')
                elif 'Pl_Core_UserInfo' in data.get('domid'):
                    # this block carries the verification description, address, and birthday
                    content_body = data.get('html')
                elif 'Pl_Core_T8CustomTriColumn' in data.get('domid'):
                    num_body = data.get('html')
                elif 'Pl_Core_UserGrid' in data.get('domid'):
                    relation_body = data.get('html')
    if head_body:
        v = Selector(text=head_body).xpath(
            "//em[@class='W_icon icon_pf_approve']").extract_first()
        v_co = Selector(text=head_body).xpath(
            "//em[contains(@class, 'icon_pf_approve_co')]").extract_first()
        if v:
            item['is_v'] = 1  # individual
        elif v_co:
            item['is_v'] = 2  # organization
        else:
            item['is_v'] = 0

        # Extract gender
        female = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_female']").extract_first()
        male = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_male']").extract_first()
        if female:
            item['sex'] = 1  # female
        elif male:
            item['sex'] = 2  # male
        else:
            item['sex'] = 0  # error or unknown

        # Extract the verification description
        v_des = Selector(
            text=head_body).xpath("//div[@class='pf_intro']/text()")
        if v_des:
            item['v_des'] = v_des.extract_first().strip()
        else:
            item['v_des'] = ''
    else:
        item['is_v'] = -1  # error
        item['sex'] = 0
        item['v_des'] = ''

    # Extract the Weibo level
    if content_body:
        level_ = Selector(text=content_body).xpath(
            "//a[contains(@class,'W_icon_level')]/span/text()").extract_first()
        if level_:
            item['level'] = int(level_.strip('Lv. '))
        else:
            item['level'] = -1

        # Extract address and birthday
        item['address'] = ''
        item['birthday'] = ''
        contents = Selector(text=content_body) \
            .xpath("//div[@class='detail']/ul[@class='ul_detail']/li[@class='item S_line2 clearfix']")
        for content in contents:
            icon = content.xpath(
                "./span[contains(@class, 'item_ico W_f')]/em/@class"
            ).extract_first()
            if icon == 'W_ficon ficon_cd_place S_ficon':
                addr = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()"
                ).extract_first().strip()
                item['address'] = addr
            elif icon == 'W_ficon ficon_constellation S_ficon':
                birthday = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()"
                ).extract_first().strip()
                item['birthday'] = birthday
    else:
        item['level'] = -1
        item['address'] = ''
        item['birthday'] = ''

    # Extract the follow, follower, and post counts
    concern_link = ''
    fans_link = ''

    if num_body:
        num_infos = Selector(
            text=num_body).xpath("//a[@class='t_link S_txt1']")
        has_link = False
        if num_infos:
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class, 'W_f')]/text()"
            has_link = True
        else:
            num_infos = Selector(text=num_body).xpath("//td[@class='S_line1']")
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class,'W_f')]/text()"

        for num_info in num_infos:
            info = num_info.xpath(text_xpath).extract_first()
            num = num_info.xpath(num_xpath).extract_first()
            if num:
                num = int(num.strip())
            else:
                num = -1

            if info == '关注':  # "following"
                item['concern_num'] = num
                if has_link:
                    concern_link = num_info.xpath("./@href").extract_first()
            elif info == '粉丝':  # "followers"
                item['fans_num'] = num
                if has_link:
                    fans_link = num_info.xpath("./@href").extract_first()
            elif info == '微博':  # "posts"
                item['blog_num'] = num

        # Fall back to the relations block for the follow/follower links
        if not has_link:
            relation_infos = Selector(text=relation_body) \
                .xpath(
                "//div[@class='obj_name']/h2[contains(@class, 'main_title W_fb W_f')]/a[contains(@class, 'S_txt')]")
            for relation in relation_infos:
                link = relation.xpath("./@href").extract_first()
                text = relation.xpath('./text()').extract_first()
                if '关注' in text:  # "following"
                    concern_link = link
                elif '粉丝' in text:  # "followers"
                    fans_link = link
    else:
        item['concern_num'] = -1  # unknown or error
        item['fans_num'] = -1
        item['blog_num'] = -1

    return item, concern_link, fans_link
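The FM.view extraction above is the core trick for these profile pages; a minimal, self-contained sketch (the script payload below is invented for illustration):

import json
import re

script = 'FM.view({"domid": "Pl_Official_Headerv6", "html": "<div>...</div>"})'
match = re.search(r'FM\.view\((\{.*\})\)', script)
if match:
    data = json.loads(match.group(1))
    print(data['domid'])  # Pl_Official_Headerv6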
Example #7
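A product-page populator for a shopping spider: scrapes URL, image, brand, title, price, description, breadcrumb categories, model, and related products into a product item, using an is_empty helper to take the first match or a default.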
    def _populate_from_html(self, response, product):
        """
        Gets data straight from html body
        """
        reqs = response.meta.get('reqs', [])

        # Set product url
        cond_set_value(product, 'url', response.url)
        try:
            if product['no_longer_available']:
                image = response.xpath('//img[@itemprop="image"]/@src')
                image = is_empty(image.extract())
                image = image.replace('//', 'http://').replace('Large', 'Enlarge')
                product['image_url'] = image

                brand = response.xpath('//span[@itemprop="brand"]/text()')
                brand = is_empty(brand.extract())
                product['brand'] = brand
        except KeyError:
            pass

        # Get title from html
        title = is_empty(
            response.xpath("//div[@id='product-desc']/"
                           "h1[@data-analytics-type='productPage-productName']").extract(), "")
        if title:
            title = Selector(text=title).xpath('string()').extract()
            product["title"] = is_empty(title, "").strip()

        # Get price
        price = response.xpath('//div[contains(@class, "microdata-price")]/'
                               '*[@itemprop="price"]/text() |'
                               '//div[contains(@class, "microdata-price")]/'
                               '*[@itemprop="lowPrice"]/text()')
        currency = response.xpath('//div[contains(@class, "microdata-price")]/'
                                  '*[@itemprop="priceCurrency"]/@content')

        if price and currency:
            currency = is_empty(currency.extract())
            price = is_empty(price.extract())
            price = price.replace('$', '')
            product['price'] = Price(priceCurrency=currency, price=price)
        else:
            product['price'] = None

        # Get description
        desc = response.css(
            '.productDescription [itemprop="description"]'
        )
        if desc:
            description = desc.extract()
            product["description"] = is_empty(description, "").strip()

        # Get department and category
        category_list = response.xpath('//nav[@id="breadcrumb"]'
                                       '//a[@data-analytics-type="cat"]'
                                       '/span[@itemprop="title"]/text()').extract()

        if category_list:
            department = category_list[-1]
            product['department'] = department

            product['category'] = category_list

        # Get model
        model_sel = response.xpath(
            '//div[@id="specGroup"]//span[@itemprop="model"]/text()'
        ).extract()
        model = is_empty(model_sel, '')

        # Get related products
        related_sel = response.xpath('//section[@aria-label="Featured Products: Related Products"]'
                                     '//article[contains(@class, "product")]')
        if related_sel:
            related_products = product.get('related_products', {})
            final_rel_prods = []
            for rel in related_sel:
                title = is_empty(
                    rel.xpath('.//div[@class="title"]//a/text()').extract(), '')
                url = is_empty(
                    rel.xpath('.//div[@class="title"]//a/@href').extract(), '')
                if title and url:
                    url_base = 'http://www.' + self.allowed_domains[0]
                    # Python 2's urlparse module; on Python 3 this is urllib.parse.urljoin
                    url = urlparse.urljoin(url_base, url)
                    final_rel_prods.append({
                        'title': title.strip(),
                        'url': url
                    })
            related_products['related'] = final_rel_prods

            product['related_products'] = related_products

        if model:
            product['model'] = model.strip()

        if reqs:
            return self.send_next_request(reqs, response)

        return product
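is_empty is a project helper that isn't shown in this snippet; judging from its call sites (a list in, an optional default out), a plausible reconstruction would be:

def is_empty(values, default=None):
    # hypothetical reconstruction: first element of a sequence, or the default
    return values[0] if values else default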
Example #8
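Parses an XML feed of films and series: for films it follows each channel's stream_url; for series it collects per-episode stream URLs from submenu nodes, repairing malformed URLs and skipping a known-dead host.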
    def parse(self, response, **item):
        """
        Find links to all series or all versions of the film in different qualities
        :param response: scrapy response
        :param item: current movie or series dict
        """

        select = Selector(response=response,
                          type="xml").xpath('//channel').getall()
        for ch in select:
            ch = Selector(text=ch, type="xml")
            title = ch.xpath("./title/text()").get()
            if item['type_src'] != 'series':
                stream_url = ch.xpath("./stream_url/text()").get()
                if "https://www.youtube.com/" in stream_url:
                    continue
                if item.get('description') is None:
                    item['description'] = ""
                description = ch.xpath("./description/text()").get()
                dict_update = self.get_detailed_values(description)
                year = dict_update.pop("year")
                if year != item.get("year"):
                    item["year"] = year
                item.update(**dict_update)
                yield Request(url=stream_url,
                              method='GET',
                              callback=self.post_parse,
                              cb_kwargs=item)
            else:
                stream_url = {}
                sub_series = ch.xpath("//submenu").getall()
                for sub in sub_series:
                    sub = Selector(text=sub, type="xml")
                    name = sub.xpath('//title/text()').get()
                    stream = sub.xpath('//stream_url/text()').get()
                    if "https" not in stream:
                        self.logger.warning(
                            f"Wrong url: {stream} for {name} in {item['title_ua']}"
                        )
                        stream = stream.replace('//ttps:', '')
                        s = urlparse(stream)
                        stream = f"https://{s.hostname}{s.path}"
                    if "uploadvideo.info" in stream:
                        # now its url does not work
                        self.logger.warning(
                            f"URL {stream} unavailable and temporary disabled")
                        continue
                    stream_url[name] = stream
                description = Selector(
                    response=response,
                    type="xml").xpath('//all_description/text()').get()
                dict_update = self.get_detailed_values(description)
                year = dict_update.pop("year")
                if year != item.get("year"):
                    item["year"] = year
                item.update(**dict_update)
                desc = description.strip().splitlines()[-1]
                if desc and item.get('description') is not None:
                    if len(item.get('description', '')) < len(desc):
                        item['description'] = desc
                for name, stream in stream_url.items():
                    item['series'].update(title=title)
                    item["name"] = name
                    yield Request(url=stream,
                                  method='GET',
                                  callback=self.post_parse,
                                  cb_kwargs=item)
Example #9
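A roster-stats callback for stats.ncaa.org: iterates the stat_grid table rows, normalizes blank cells to zero, casts each stat, and yields one Player item per row.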
    def parse(self, response):
        player = Player()
        parse = Parser()

        # Skip responses that aren't from stats.ncaa.org
        url = response.url
        link_checker = "stats.ncaa.org"
        if link_checker not in url:
            print("---------------------------------------------------------------------")
            print("Invalid link: {}, skipping".format(url))
            print("---------------------------------------------------------------------")
        else:
            # Last run didn't pull from 7 teams
            team_name = parse.parse_team_header(response, url)[0]

            player_table_rows = response.xpath(
                '//*[@id="stat_grid"]/tbody/tr').getall()
            for row in player_table_rows:
                p_data = Selector(text=row).css('td').getall()

                p_uuid = uuid.uuid4()
                jersey = Selector(text=p_data[0]).css('td::text').get().strip()
                name = Selector(text=p_data[1]).css('a::text').get().strip()
                year = Selector(text=p_data[2]).css('td::text').get().strip()
                position = Selector(text=p_data[3]).css('td::text').get()
                if position is not None:
                    position = position.strip()
                games_played = Selector(text=p_data[4]).css(
                    'td::text').get().strip()
                goals = Selector(text=p_data[8]).css('div::text').get().strip()
                assists = Selector(text=p_data[9]).css(
                    'div::text').get().strip()
                points = Selector(text=p_data[10]).css(
                    'div::text').get().strip()
                shots = Selector(text=p_data[11]).css(
                    'div::text').get().strip()
                shot_pct = Selector(text=p_data[12]).css(
                    'div::text').get().strip()
                sog = Selector(text=p_data[13]).css('div::text').get().strip()
                sog_pct = Selector(text=p_data[14]).css(
                    'div::text').get().strip()
                groundballs = Selector(text=p_data[18]).css(
                    'div::text').get().strip()
                turnovers = Selector(text=p_data[19]).css(
                    'div::text').get().strip()
                caused_turnovers = Selector(text=p_data[20]).css(
                    'div::text').get().strip()
                faceoff_wins = Selector(text=p_data[21]).css(
                    'div::text').get().strip()
                faceoffs_taken = Selector(text=p_data[22]).css(
                    'div::text').get().strip()
                faceoff_win_pct = Selector(text=p_data[23]).css(
                    'div::text').get().strip()
                penalties = Selector(text=p_data[24]).css(
                    'div::text').get().strip()
                penalty_time = Selector(text=p_data[25]).css(
                    'div::text').get().strip()
                goals_allowed = Selector(text=p_data[29]).css(
                    'div::text').get().strip()
                saves = Selector(text=p_data[31]).css(
                    'div::text').get().strip()
                save_pct = Selector(text=p_data[32]).css(
                    'div::text').get().strip()

                name_split = name.split(', ')
                first_name = name_split[1]
                last_name = name_split[0]

                if goals == "":
                    goals = "0"
                if assists == "":
                    assists = "0"
                if points == "":
                    points = "0"
                if shots == "":
                    shots = "0"
                if shot_pct == "":
                    shot_pct = "0.000"
                if sog == "":
                    sog = "0"
                if sog_pct == "":
                    sog_pct = "0.000"
                if groundballs == "":
                    groundballs = "0"
                if turnovers == "":
                    turnovers = "0"
                if caused_turnovers == "":
                    caused_turnovers = "0"
                if faceoff_wins == "":
                    faceoff_wins = "0"
                if faceoffs_taken == "":
                    faceoffs_taken = "0"
                if faceoff_win_pct == "":
                    faceoff_win_pct = "0.000"
                if penalties == "":
                    penalties = "0"
                if penalty_time == "":
                    penalty_time = "0"
                if goals_allowed == "":
                    goals_allowed = "0"
                if saves == "":
                    saves = "0"
                if save_pct == "":
                    save_pct = "0.000"

                player['uuid'] = str(p_uuid)
                player['team'] = team_name
                player['first_name'] = first_name
                player['last_name'] = last_name
                player['jersey'] = jersey
                player['year'] = year
                player['position'] = position
                player['games_played'] = int(games_played, 10)
                player['goals'] = int(goals, 10)
                player['assists'] = int(assists, 10)
                player['points'] = int(points, 10)
                player['shots'] = int(shots, 10)
                player['shot_pct'] = float(shot_pct)
                player['sog'] = int(sog, 10)
                player['sog_pct'] = float(sog_pct)
                player['groundballs'] = int(groundballs, 10)
                player['turnovers'] = int(turnovers, 10)
                player['caused_turnovers'] = int(caused_turnovers, 10)
                player['faceoff_wins'] = int(faceoff_wins, 10)
                player['faceoffs_taken'] = int(faceoffs_taken, 10)
                player['faceoff_win_pct'] = float(faceoff_win_pct)
                player['penalties'] = int(penalties, 10)
                player['penalty_time'] = int(penalty_time, 10)
                player['goals_allowed'] = int(goals_allowed, 10)
                player['saves'] = int(saves, 10)
                player['save_pct'] = float(save_pct)

                yield player

            team_name = '\'{}\''.format(team_name)
            print("Scraped {:32} player data @ {}".format(
                team_name, response.url))
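The long run of blank-to-zero checks above could be folded into one helper; a hypothetical sketch (the names are illustrative, not from the original spider):

from scrapy.selector import Selector

def stat(cell_html, default='0', cast=int):
    # hypothetical helper: read a cell's div text, defaulting blank cells
    value = Selector(text=cell_html).css('div::text').get()
    return cast((value or '').strip() or default)

# e.g. player['goals'] = stat(p_data[8])
#      player['shot_pct'] = stat(p_data[12], default='0.000', cast=float)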