def enrich_data(self, item_loader, response):
    """Populate song fields on the loader and build follow-up requests.

    Fills name/id/singer/album/tags from the song page, then returns a
    list of (field, loader, request-kwargs) tuples: one for the lyrics
    page (when present) and one for the play-url JSONP API.
    """
    self.logger.debug("Start to enrich_data. ")
    item_loader.add_xpath("name", '//h2/span[@class="name"]/text()')
    item_loader.add_value("id", response.url, re=r"song/(\d+)")
    item_loader.add_re("id", r"source_id: '(\d+)'")
    item_loader.add_xpath("singer", '//ul/li/span[@class="author_list"]/@title')
    item_loader.add_xpath("album", '//ul/li[contains(text(), "专辑")]/a/text()')
    item_loader.add_xpath("tags", '//ul/li[@class="clearfix tag"]/a/text()')

    follow_ups = []

    # Lyrics live behind a data attribute; only follow them when present.
    lyrics_url = "".join(
        response.xpath('//div[@id="lyricCont"]/@data-lrclink').extract()).strip()
    if lyrics_url:
        follow_ups.append(("lyrics", item_loader, {"url": lyrics_url}))

    # Song id comes from the URL, falling back to the inline source_id.
    song_id = (re_search(r"song/(\d+)", response.url)
               or re_search(r"source_id: '(\d+)'", response.body))
    play_api = ("http://tingapi.ting.baidu.com/v1/restserver/"
                "ting?method=baidu.ting.song.play&format=jsonp"
                "&callback=jQuery_%s&songid=%s&_=%s" %
                (int(time.time() * 1000), song_id, int(time.time() * 1000)))
    follow_ups.append(("source_url", item_loader, {"url": play_api}))
    return follow_ups
def cur_to_requests(curl_cmd, filename):
    """Convert a ``curl`` command line into a runnable ``requests`` script.

    Extracts the URL, the ``-H`` headers and the request body (``--data``
    form or ``--data-binary`` JSON) from *curl_cmd* and writes a standalone
    python script to *filename*.  POST is used whenever a body is present,
    otherwise GET.
    """
    from urllib.parse import unquote_plus

    tmpl = """import requests


def main():
    url = "{}"
    headers = {}
    form = {}
    json = {}
    resp = requests.{}(url, headers=headers, json=json, data=form)
    print(resp.text)


main()
"""
    url = re.search(r"'(http.*?)'", curl_cmd).group(1)
    headers = json.dumps(
        dict(tuple(v.strip() for v in header.split(":", 1))
             for header in re.findall(r"-H '(.*?)'", curl_cmd)),
        indent=4)
    form = re_search(r"--data ?'(.*?)'", curl_cmd, default=None)
    json_data = re_search(r"--data-binary \$'(.*?)'", curl_cmd, default=None)
    if form:
        # Fully url-decode each key/value pair (was: only "+" -> " ",
        # which left %XX escapes encoded in the generated script).
        form = json.dumps(
            {unquote_plus(k): unquote_plus(v)
             for k, v in (param.split("=", 1) for param in form.split("&"))},
            indent=4)
    if json_data:
        # Expand \uXXXX escapes via `repl` and rewrite JSON literals as
        # python ones so the generated file is valid python source.
        json_data = re.sub(
            r"\\u\w{4}", repl,
            json.dumps(json.loads(json_data), indent=4)
            .replace("false", "False")
            .replace("true", "True")
            .replace("null", "None"))
    with open(filename, "w") as f:
        f.write(tmpl.format(
            url, headers, form, json_data,
            "post" if form or json_data else "get"))
def cur_to_requests(curl_cmd, filename):
    """Translate a ``curl`` command into a standalone ``requests`` script.

    Pulls the URL, ``-H`` headers and body (``--data`` form or
    ``--data-binary`` JSON) out of *curl_cmd*, then writes a runnable
    python file to *filename*.  POST is chosen when any body exists,
    otherwise GET.
    """
    tmpl = """import requests


def main():
    url = "{}"
    headers = {}
    form = {}
    json = {}
    resp = requests.{}(url, headers=headers, json=json, data=form)
    print(resp.text)


main()
"""
    url = re.search(r"'(http.*?)'", curl_cmd).group(1)
    header_lines = re.findall(r"-H '(.*?)'", curl_cmd)
    headers = json.dumps(
        {key.strip(): value.strip()
         for key, value in (line.split(":", 1) for line in header_lines)},
        indent=10)
    form = re_search(r"--data '(.*?)'", curl_cmd)
    json_data = re_search(r"--data-binary '(.*?)'", curl_cmd, None)
    data = None
    if form:
        pairs = (param.split("=", 1)
                 for param in form.replace("+", " ").split("&"))
        data = json.dumps({key: value for key, value in pairs}, indent=10)
    if json_data:
        json_data = json.dumps(json.loads(json_data), indent=10)
    method = "post" if data or json_data else "get"
    with open(filename, "w") as f:
        f.write(tmpl.format(url, headers, data, json_data, method))
def page_url(self, response):
    """Build the URL of the next get-songs page for the current artist.

    Carries over the current query string, filling in ting_uid / hotmax
    from the page when they are missing, and adds a cache-busting ".r"
    parameter.
    """
    base = "http://music.baidu.com/data/user/getsongs"
    params = urldecode(urlparse(response.url).query)
    # Artist id comes from the URL, hotmax from an inline JS variable.
    params.setdefault("ting_uid", re_search(r"artist/(\d+)", response.url))
    params.setdefault("hotmax",
                      re_search(r"var hotbarMax = (\d+)", response.body))
    params["order"] = "hot"
    # Cache-buster: random float glued to a millisecond timestamp.
    params[".r"] = str(random.random()) + str(int(time.time() * 1000))
    params.setdefault("pay", "")
    return "%s?%s" % (base, urlencode(params))
def enrich_comments(self, item_loader, response):
    """Scrape one page of comments into the loader.

    Returns a single-element follow-up list for the next page when the
    paginator has one, otherwise None.
    """
    self.logger.debug("Start to enrich_comments. ")
    parsed = []
    for node in response.xpath(
            '//div[@id="comments"]/div[@class="comment-item"]'):
        rating_class = xpath_exchange(node.xpath(
            'div/h3/span[@class="comment-info"]'
            '/span[contains(@class, "rating")]/@class'))
        parsed.append({
            "author": xpath_exchange(node.xpath('div/a/@title')),
            "upvotes": xpath_exchange(
                node.xpath('div/h3/span/span[@class="votes"]/text()')),
            # Rating is encoded in a css class like "allstarNN"; absent
            # rating counts as 0.
            "score": int(re_search(r"allstar(\d+)", rating_class) or 0) / 5,
            "datetime": xpath_exchange(
                node.xpath('div/h3/span/span[@class="comment-time "]/@title')),
            "content": xpath_exchange(node.xpath('div/p/text()')),
        })
    item_loader.add_value("comments", parsed)

    next_url = xpath_exchange(
        response.xpath('//div[@id="paginator"]/a[@class="next"]/@href'))
    if next_url:
        return [("comments", item_loader, {"url": response.urljoin(next_url)})]
def is_anonymous(self, proxy):
    """Check whether *proxy* hides our real IP.

    Fetches an IP-echo page through the proxy and compares the IP it
    reports with ``self.my_ip``.

    Returns True when the page reports no IP at all, or an IP different
    from ours (i.e. the proxy is anonymous).
    """
    url = "http://www.98bk.com/cycx/ip1/"
    resp = requests.get(url,
                        timeout=10,
                        headers=self.headers,
                        proxies={"http": "http://%s" % proxy})
    # The page is gbk-encoded; round-trip through iso-8859-1 to recover
    # the raw bytes from requests' default text decoding.
    buf = resp.text.encode("iso-8859-1").decode("gbk")
    real_ip = re_search(r"您的真实IP是([\d\.]+)", buf)
    self.logger.info(f"My ip :{self.my_ip}, Real ip: {real_ip}")
    # `not real_ip` also guards against re_search returning None (was:
    # `self.my_ip.count(None)` -> TypeError); membership test replaces
    # the str.count() misuse.
    return not real_ip or real_ip not in self.my_ip
def download(self, url, name):
    """Stream *url* to disk.

    The local file name is taken from the Content-Disposition header when
    it carries a ``filename=`` part, otherwise *name* is used.
    """
    resp = self.downloader.get(url, stream=True)
    # The header may be missing or lack a filename= part; fall back to
    # `name` (was: unquote(None) -> TypeError when the pattern missed).
    matched = re_search(r'filename="(.*?)"(?:;|$)',
                        resp.headers.get("Content-Disposition", ""))
    filename = unquote(matched) if matched else name
    with open(filename, "wb") as f:
        # Reuse the response opened above (was: a second, duplicate GET
        # request just to read the body).
        for chunk in resp.iter_content(chunk_size=1024):
            f.write(chunk)
def enrich_related_pics(self, item_loader, response):
    """Collect related picture URLs grouped by their section title."""
    self.logger.debug("Start to enrich_related_pics. ")
    grouped = dict()
    # Each "mod" div is one section: a heading plus a list of thumbnails.
    for section in response.xpath(
            '//div[@class="article"]/div[@class="mod"]'):
        title = re_search(
            r"(\w+)",
            xpath_exchange(section.xpath('div[@class="hd"]/h2/text()')))
        grouped[title] = section.xpath(
            'div[@class="bd"]/ul/li/a/img/@src').extract()
    item_loader.add_value("related_pics", grouped)
def enrich_celebrities(self, item_loader, response):
    """Extract cast/crew entries, grouped by position, into the loader."""
    self.logger.debug("Start to enrich_celebrities. ")
    celebrities = []
    # Each list-wrapper is one position group (e.g. directors, actors).
    for wrapper in response.xpath(
            '//div[@id="celebrities"]/div[@class="list-wrapper"]'):
        position_name = xpath_exchange(wrapper.xpath("h2/text()"))
        for person in wrapper.xpath('ul/li'):
            entry = dict()
            role_text = xpath_exchange(
                person.xpath('div/span[@class="role"]/text()'))
            # Role text looks like "饰 <character>"; keep only the name.
            if role_text:
                entry["role"] = re_search(r"饰 (.*)", role_text)
            entry["position"] = position_name
            entry["name"] = xpath_exchange(
                person.xpath('div/span[@class="name"]/a/text()'))
            entry["representative"] = person.xpath(
                'div/span[@class="works"]/a/text()').extract()
            celebrities.append(entry)
    item_loader.add_value("celebrities", celebrities)
def download(self, url, name, proxy):
    """Download *url* into file *name*, retrying through *proxy*.

    Skips the download when the file already exists and looks complete
    (> 100 kB).  A zero-byte result file is removed afterwards.
    """
    if os.path.exists(name) and os.path.getsize(name) > 100000:
        return
    try:
        try:
            # Try a direct connection first ...
            resp = self.downloader.get(
                url,
                headers=headers,
                stream=True,
                timeout=20,
            )
        except Exception:
            # ... and fall back to the proxy on any network error.
            resp = self.downloader.get(url,
                                       headers=headers,
                                       stream=True,
                                       timeout=20,
                                       proxies={
                                           "http": proxy,
                                           "https": proxy
                                       })
        with open(name, "wb") as f:
            have_recv = 0
            # Default the header to "" so re_search always gets a string
            # (was: None when Content-Length is absent); raw pattern
            # string avoids the invalid-escape warning on "\d".
            total = int(
                re_search(r"(\d+)",
                          resp.headers.get("Content-Length", ""),
                          default=0))
            if total:
                for chunk in resp.iter_content(chunk_size=1024000):
                    have_recv += len(chunk)
                    # This is a completion fraction, not a transfer rate.
                    self.logger.debug(
                        "Got %s from: %s, progress: %s" %
                        (name, url, round(have_recv / total, 2)))
                    f.write(chunk)
            else:
                self.logger.error("Have not got any data from %s. " % url)
    finally:
        # Drop an empty (failed) download; the file may not exist at all,
        # so only filesystem errors are swallowed (was: bare except).
        try:
            if not os.path.getsize(name):
                os.unlink(name)
        except OSError:
            pass