def parse_city_life(self, response):
    city = response.meta['city'].strip('/')
    data = Selector(response).xpath(
        '//div[@class="hd"]/h1/span/a/text()').extract()[0].strip()
    data = data.strip(u'家')  # drop the trailing counter suffix ("家")
    print(u'{},{},{}'.format('life', city, data))

def parse_post(self, response):
    crawl_time = None
    item = JishuxItem()
    item['post_url'] = response.meta['item']['post_url']
    item['post_title'] = response.meta['item']['post_title']
    item['_id'] = response.meta['item']['_id']
    item['post_type'] = response.meta['item']['post_type']
    conf = response.meta['conf']
    # post_time = re.search(
    #     '(20\d{2}([.\-/|年月\s]{1,3}\d{1,2}){2}日?(\s\d{2}:\d{2}(:\d{2})?)?)|(\d{1,2}\s?(分钟|小时|天)前)',
    #     response.text)
    # if post_time:
    #     crawl_time = generate_timestamp(post_time.group())
    content_html = get_summary(response, conf)
    # string(.) flattens the HTML fragment to its concatenated text nodes
    content_text = Selector(text=content_html).xpath('string(.)').extract_first()
    content_text = content_text.strip().replace('\r', '').replace('\n', '').replace('\t', '')
    description = get_description(content_text)
    keywords = get_keywords(response, content_text)
    item['content_html'] = content_html
    item['description'] = description
    item['keywords'] = keywords
    item['crawl_time'] = crawl_time if crawl_time else int(time.time())
    item['cn_name'] = conf['cn_name']
    item['author'] = ''  # TODO: extract the post author; the config format needs to support it
    if start_urls_config.get('debug'):
        print(item)
    else:
        yield item

def get_text(content):
    html = get_summary(content)
    content_text = Selector(text=html).xpath('string(.)').extract_first()
    content_text = content_text.strip().replace('\r', '').replace('\n', '').replace('\t', '')
    return content_text

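# A minimal, self-contained sketch of the text-extraction pattern used above:
# XPath's string(.) flattens an HTML fragment to its concatenated text nodes.
# The helper name and sample markup are illustrative only.
from scrapy.selector import Selector

def html_to_text(html):
    """Strip tags and collapse whitespace from an HTML fragment."""
    text = Selector(text=html).xpath('string(.)').extract_first() or ''
    return ' '.join(text.split())

print(html_to_text('<div><p>Hello,</p> <b>world</b></div>'))  # -> 'Hello, world'
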
def extract_title(response):
    try:
        title = Selector(text=response.body).xpath('//title/text()').extract()[0]
        return title.strip()
    except Exception:
        return ""

def parse_city_food(self, response):
    city = response.meta['city'].strip('/')
    data = Selector(response).xpath(
        '//div[@class="block popular-nav"]/div[@class="block-title"]/text()'
    ).extract()[0].strip()
    data = data.strip(u'共').strip(u'家餐厅')  # strip the "共 ... 家餐厅" wrapper around the count
    print(u'{},{},{}'.format('food', city, data))

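# Note: str.strip() removes any characters from the given *set*, not a literal
# prefix or suffix, which is why chaining strip(u'共') and strip(u'家餐厅')
# isolates the count. A quick illustration with a sample title:
print(u'共123家餐厅'.strip(u'共').strip(u'家餐厅'))  # -> u'123'
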
def get_userItem(body, user_bit):
    """
    :type body: str
    :param body: HTML of the user's profile page
    :return: item: user-info item
             concern_link: URL of the followed-users page
             fans_link: URL of the followers page
    """
    item = items.userItem()
    # User ID
    user_id = int(get_userID_from_mainPage(body))
    item['id'] = user_id
    # Screen name
    p_name = re.compile(r'\$CONFIG\[\'onick\'\]=\'(.*)\'')
    try:
        item['name'] = p_name.search(body).group(1)
    except AttributeError:
        item['name'] = ''
    # Weibo ships each page block as an FM.view({...}) call inside a <script>
    # tag; pull out the JSON payloads and route them by their "domid".
    script_heads = Selector(text=body).xpath('//script/text()').extract()
    head_body = ''
    content_body = ''
    num_body = ''
    relation_body = ''
    p_head = re.compile(r'^\s*FM\.view\(\{(.*)\}\)', re.M)
    for script_head in script_heads:
        p_head_s = p_head.search(script_head.replace('\n', '').replace('\t', ''))
        if p_head_s:
            json_head = '{' + p_head_s.group(1) + '}'
            data = json.loads(json_head)
            if data.get('domid'):
                if 'Pl_Official_Headerv6' in data.get('domid'):
                    # verified badge and gender live in this block
                    head_body = data.get('html')
                elif 'Pl_Core_UserInfo' in data.get('domid'):
                    # verification description, address and birthday
                    content_body = data.get('html')
                elif 'Pl_Core_T8CustomTriColumn' in data.get('domid'):
                    num_body = data.get('html')
                elif 'Pl_Core_UserGrid' in data.get('domid'):
                    relation_body = data.get('html')
    # Verified badge
    if head_body:
        v = Selector(text=head_body).xpath(
            "//em[@class='W_icon icon_pf_approve']").extract_first()
        v_co = Selector(text=head_body).xpath(
            "//em[contains(@class, 'icon_pf_approve_co')]").extract_first()
        if v:
            item['is_v'] = 1  # verified individual
        elif v_co:
            item['is_v'] = 2  # verified organization
        else:
            item['is_v'] = 0
        # Gender
        female = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_female']").extract_first()
        male = Selector(text=head_body).xpath(
            "//i[@class='W_icon icon_pf_male']").extract_first()
        if female:
            item['sex'] = 1  # female
        elif male:
            item['sex'] = 2  # male
        else:
            item['sex'] = 0  # unknown or error
        # Verification description
        v_des = Selector(text=head_body).xpath("//div[@class='pf_intro']/text()")
        if v_des:
            item['v_des'] = v_des.extract_first().strip()
        else:
            item['v_des'] = ''
    else:
        item['is_v'] = -1  # error
        item['sex'] = 0
        item['v_des'] = ''
    # Weibo level
    if content_body:
        level_ = Selector(text=content_body).xpath(
            "//a[contains(@class,'W_icon_level')]/span/text()").extract_first()
        if level_:
            item['level'] = int(level_.strip('Lv. '))
        else:
            item['level'] = -1
        # Address and birthday
        item['address'] = ''
        item['birthday'] = ''
        contents = Selector(text=content_body) \
            .xpath("//div[@class='detail']/ul[@class='ul_detail']/li[@class='item S_line2 clearfix']")
        for content in contents:
            icon = content.xpath(
                "./span[contains(@class, 'item_ico W_f')]/em/@class").extract_first()
            if icon == 'W_ficon ficon_cd_place S_ficon':
                addr = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()").extract_first().strip()
                item['address'] = addr
            elif icon == 'W_ficon ficon_constellation S_ficon':
                birthday = content.xpath(
                    "./span[contains(@class, 'item_text W_f')]/text()").extract_first().strip()
                item['birthday'] = birthday
    else:
        item['level'] = -1
        item['address'] = ''
        item['birthday'] = ''
    # Follow / fan / post counts
    concern_link = ''
    fans_link = ''
    if num_body:
        num_infos = Selector(text=num_body).xpath("//a[@class='t_link S_txt1']")
        has_link = False
        if num_infos:
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class, 'W_f')]/text()"
            has_link = True
        else:
            num_infos = Selector(text=num_body).xpath("//td[@class='S_line1']")
            text_xpath = "./span[@class='S_txt2']/text()"
            num_xpath = "./strong[contains(@class,'W_f')]/text()"
        for num_info in num_infos:
            info = num_info.xpath(text_xpath).extract_first()
            num = num_info.xpath(num_xpath).extract_first()
            num = int(num.strip()) if num else -1
            if info == '关注':  # following
                item['concern_num'] = num
                if has_link:
                    concern_link = num_info.xpath("./@href").extract_first()
            elif info == '粉丝':  # followers
                item['fans_num'] = num
                if has_link:
                    fans_link = num_info.xpath("./@href").extract_first()
            elif info == '微博':  # posts
                item['blog_num'] = num
        # Fall back to the relations block for the follow/fan links
        if not has_link:
            relation_infos = Selector(text=relation_body) \
                .xpath("//div[@class='obj_name']/h2[contains(@class, 'main_title W_fb W_f')]/a[contains(@class, 'S_txt')]")
            for relation in relation_infos:
                link = relation.xpath("./@href").extract_first()
                text = relation.xpath('./text()').extract_first()
                if '关注' in text:
                    concern_link = link
                elif '粉丝' in text:
                    fans_link = link
    else:
        item['concern_num'] = -1  # unknown or error
        item['fans_num'] = -1
        item['blog_num'] = -1
    return item, concern_link, fans_link

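# A minimal standalone sketch of the FM.view() extraction pattern used above.
# The regex here is simplified relative to the one in get_userItem, and the
# sample payload is illustrative only.
import json
import re

FM_VIEW = re.compile(r'FM\.view\((\{.*?\})\)')

def extract_fm_blocks(script_text):
    """Yield (domid, html) pairs for every FM.view payload in a script body."""
    for match in FM_VIEW.finditer(script_text):
        try:
            data = json.loads(match.group(1))
        except ValueError:
            continue  # not a well-formed JSON payload
        yield data.get('domid', ''), data.get('html', '')

sample = 'FM.view({"domid": "Pl_Core_UserInfo", "html": "<div>...</div>"})'
for domid, html in extract_fm_blocks(sample):
    print(domid)  # -> Pl_Core_UserInfo
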
def _populate_from_html(self, response, product):
    """Gets data straight from the HTML body."""
    reqs = response.meta.get('reqs', [])
    # Set product url
    cond_set_value(product, 'url', response.url)
    try:
        if product['no_longer_available']:
            image = response.xpath('//img[@itemprop="image"]/@src')
            image = is_empty(image.extract())
            image = image.replace('//', 'http://').replace('Large', 'Enlarge')
            product['image_url'] = image
            brand = response.xpath('//span[@itemprop="brand"]/text()')
            product['brand'] = is_empty(brand.extract())
    except KeyError:
        pass
    # Get title from html
    title = is_empty(
        response.xpath("//div[@id='product-desc']/"
                       "h1[@data-analytics-type='productPage-productName']").extract(),
        "")
    if title:
        title = Selector(text=title).xpath('string()').extract()
        product["title"] = is_empty(title, "").strip()
    # Get price
    price = response.xpath('//div[contains(@class, "microdata-price")]/'
                           '*[@itemprop="price"]/text() |'
                           '//div[contains(@class, "microdata-price")]/'
                           '*[@itemprop="lowPrice"]/text()')
    currency = response.xpath('//div[contains(@class, "microdata-price")]/'
                              '*[@itemprop="priceCurrency"]/@content')
    if price and currency:
        currency = is_empty(currency.extract())
        price = is_empty(price.extract()).replace('$', '')
        product['price'] = Price(priceCurrency=currency, price=price)
    else:
        product['price'] = None
    # Get description
    desc = response.css('.productDescription [itemprop="description"]')
    if desc:
        description = desc.extract()
        product["description"] = is_empty(description, "").strip()
    # Get department and category
    category_list = response.xpath('//nav[@id="breadcrumb"]'
                                   '//a[@data-analytics-type="cat"]'
                                   '/span[@itemprop="title"]/text()').extract()
    if category_list:
        product['department'] = category_list[-1]
        product['category'] = category_list
    # Get model
    model_sel = response.xpath('//div[@id="specGroup"]'
                               '//span[@itemprop="model"]/text()').extract()
    model = is_empty(model_sel, '')
    # Get related products
    related_sel = response.xpath('//section[@aria-label="Featured Products: Related Products"]'
                                 '//article[contains(@class, "product")]')
    if related_sel:
        related_products = product.get('related_products', {})
        final_rel_prods = []
        for rel in related_sel:
            title = is_empty(rel.xpath('.//div[@class="title"]//a/text()').extract(), '')
            url = is_empty(rel.xpath('.//div[@class="title"]//a/@href').extract(), '')
            if title and url:
                url_base = 'http://www.' + self.allowed_domains[0]
                url = urlparse.urljoin(url_base, url)
                final_rel_prods.append({'title': title.strip(), 'url': url})
        related_products['related'] = final_rel_prods
        product['related_products'] = related_products
    if model:
        product['model'] = model.strip()
    if reqs:
        return self.send_next_request(reqs, response)
    return product

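# `is_empty` and `cond_set_value` are project helpers, not stdlib. These are
# minimal stand-ins with the behavior the snippet above appears to assume
# (first-or-default, and set-if-truthy), so it can be read in isolation:
def is_empty(values, default=None):
    """Return the first element of a list, or `default` if it is empty."""
    return values[0] if values else default

def cond_set_value(item, key, value):
    """Set item[key] only when a truthy value is available."""
    if value:
        item[key] = value
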
def parse(self, response, **item):
    """
    Find links to all series, or to all versions of the film in
    different qualities.

    :param response: scrapy response
    :param item: current movie or series dict
    """
    select = Selector(response=response, type="xml").xpath('//channel').getall()
    for ch in select:
        ch = Selector(text=ch, type="xml")
        title = ch.xpath("./title/text()").get()
        if item['type_src'] != 'series':
            stream_url = ch.xpath("./stream_url/text()").get()
            if "https://www.youtube.com/" in stream_url:
                continue
            if item.get('description') is None:
                item['description'] = ""
            description = ch.xpath("./description/text()").get()
            dict_update = self.get_detailed_values(description)
            year = dict_update.pop("year")
            if year != item.get("year"):
                item["year"] = year
            item.update(**dict_update)
            yield Request(url=stream_url, method='GET',
                          callback=self.post_parse, cb_kwargs=item)
        else:
            stream_url = {}
            sub_series = ch.xpath("//submenu").getall()
            for sub in sub_series:
                sub = Selector(text=sub, type="xml")
                name = sub.xpath('//title/text()').get()
                stream = sub.xpath('//stream_url/text()').get()
                if "https" not in stream:
                    self.logger.warning(
                        f"Wrong url: {stream} for {name} in {item['title_ua']}")
                    # repair the malformed scheme and rebuild the URL
                    stream = stream.replace('//ttps:', '')
                    s = urlparse(stream)
                    stream = f"https://{s.hostname}{s.path}"
                if "uploadvideo.info" in stream:  # this host is currently down
                    self.logger.warning(
                        f"URL {stream} unavailable and temporarily disabled")
                    continue
                stream_url[name] = stream
            description = Selector(
                response=response, type="xml").xpath('//all_description/text()').get()
            dict_update = self.get_detailed_values(description)
            year = dict_update.pop("year")
            if year != item.get("year"):
                item["year"] = year
            item.update(**dict_update)
            desc = description.strip().splitlines()[-1]
            if desc is not None and item.get('description', '') is not None:
                if len(item.get('description', '')) < len(desc):
                    item['description'] = desc
            for name, stream in stream_url.items():
                item['series'].update(title=title)
                item["name"] = name
                yield Request(url=stream, method='GET',
                              callback=self.post_parse, cb_kwargs=item)

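# A runnable sketch of the XML parsing pattern above: wrap the document in a
# Selector with type="xml" and iterate the <channel> nodes. The feed layout
# mirrors what the spider consumes; the sample document is illustrative only.
from scrapy.selector import Selector

XML = """
<playlist>
  <channel><title>Episode 1</title><stream_url>https://cdn.example.com/e1.m3u8</stream_url></channel>
  <channel><title>Episode 2</title><stream_url>https://cdn.example.com/e2.m3u8</stream_url></channel>
</playlist>
"""

for ch in Selector(text=XML, type="xml").xpath('//channel'):
    print(ch.xpath('./title/text()').get(), '->', ch.xpath('./stream_url/text()').get())
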
def parse(self, response):
    player = Player()
    parse = Parser()
    # Need to implement url checker
    url = response.url
    link_checker = "stats.ncaa.org"
    if link_checker not in url:
        print("---------------------------------------------------------------------")
        print("Invalid link: {}, skipping".format(url))
        print("---------------------------------------------------------------------")
    else:
        # Last run didn't pull from 7 teams
        team_name = parse.parse_team_header(response, url)[0]
        player_table_rows = response.xpath('//*[@id="stat_grid"]/tbody/tr').getall()
        for row in player_table_rows:
            p_data = Selector(text=row).css('td').getall()
            p_uuid = uuid.uuid4()
            jersey = Selector(text=p_data[0]).css('td::text').get().strip()
            name = Selector(text=p_data[1]).css('a::text').get().strip()
            year = Selector(text=p_data[2]).css('td::text').get().strip()
            position = Selector(text=p_data[3]).css('td::text').get()
            if position is not None:
                position = position.strip()  # strip() returns a new string; assign it
            games_played = Selector(text=p_data[4]).css('td::text').get().strip()
            goals = Selector(text=p_data[8]).css('div::text').get().strip()
            assists = Selector(text=p_data[9]).css('div::text').get().strip()
            points = Selector(text=p_data[10]).css('div::text').get().strip()
            shots = Selector(text=p_data[11]).css('div::text').get().strip()
            shot_pct = Selector(text=p_data[12]).css('div::text').get().strip()
            sog = Selector(text=p_data[13]).css('div::text').get().strip()
            sog_pct = Selector(text=p_data[14]).css('div::text').get().strip()
            groundballs = Selector(text=p_data[18]).css('div::text').get().strip()
            turnovers = Selector(text=p_data[19]).css('div::text').get().strip()
            caused_turnovers = Selector(text=p_data[20]).css('div::text').get().strip()
            faceoff_wins = Selector(text=p_data[21]).css('div::text').get().strip()
            faceoffs_taken = Selector(text=p_data[22]).css('div::text').get().strip()
            faceoff_win_pct = Selector(text=p_data[23]).css('div::text').get().strip()
            penalties = Selector(text=p_data[24]).css('div::text').get().strip()
            penalty_time = Selector(text=p_data[25]).css('div::text').get().strip()
            goals_allowed = Selector(text=p_data[29]).css('div::text').get().strip()
            saves = Selector(text=p_data[31]).css('div::text').get().strip()
            save_pct = Selector(text=p_data[32]).css('div::text').get().strip()

            name_split = name.split(', ')
            first_name = name_split[1]
            last_name = name_split[0]

            # Blank cells mean zero; default counts to "0" and percentages to "0.000"
            goals = goals or "0"
            assists = assists or "0"
            points = points or "0"
            shots = shots or "0"
            shot_pct = shot_pct or "0.000"
            sog = sog or "0"
            sog_pct = sog_pct or "0.000"
            groundballs = groundballs or "0"
            turnovers = turnovers or "0"
            caused_turnovers = caused_turnovers or "0"
            faceoff_wins = faceoff_wins or "0"
            faceoffs_taken = faceoffs_taken or "0"
            faceoff_win_pct = faceoff_win_pct or "0.000"
            penalties = penalties or "0"
            penalty_time = penalty_time or "0"
            goals_allowed = goals_allowed or "0"
            saves = saves or "0"
            save_pct = save_pct or "0.000"

            player['uuid'] = str(p_uuid)
            player['team'] = team_name
            player['first_name'] = first_name
            player['last_name'] = last_name
            player['jersey'] = jersey
            player['year'] = year
            player['position'] = position
            player['games_played'] = int(games_played, 10)
            player['goals'] = int(goals, 10)
            player['assists'] = int(assists, 10)
            player['points'] = int(points, 10)
            player['shots'] = int(shots, 10)
            player['shot_pct'] = float(shot_pct)
            player['sog'] = int(sog, 10)
            player['sog_pct'] = float(sog_pct)
            player['groundballs'] = int(groundballs, 10)
            player['turnovers'] = int(turnovers, 10)
            player['caused_turnovers'] = int(caused_turnovers, 10)
            player['faceoff_wins'] = int(faceoff_wins, 10)
            player['faceoffs_taken'] = int(faceoffs_taken, 10)
            player['faceoff_win_pct'] = float(faceoff_win_pct)
            player['penalties'] = int(penalties, 10)
            player['penalty_time'] = int(penalty_time, 10)
            player['goals_allowed'] = int(goals_allowed, 10)
            player['saves'] = int(saves, 10)
            player['save_pct'] = float(save_pct)
            yield player
        team_name = "'{}'".format(team_name)
        print("Scraped {:32} player data @ {}".format(team_name, response.url))

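# A small standalone sketch of the blank-cell coercion above: stat tables
# render zeroes as empty cells, so coerce "" to 0 (or 0.0 for percentages)
# before casting. The helper name and sample row are illustrative only.
def coerce_stat(raw, as_float=False):
    """Convert a possibly-empty table cell into a number."""
    raw = (raw or "").strip()
    if not raw:
        return 0.0 if as_float else 0
    return float(raw) if as_float else int(raw, 10)

row = {'goals': '3', 'assists': '', 'shot_pct': ''}
print(coerce_stat(row['goals']))           # -> 3
print(coerce_stat(row['assists']))         # -> 0
print(coerce_stat(row['shot_pct'], True))  # -> 0.0
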