def enrich_data(self, item_loader, response):
        self.logger.debug("Start to enrich_data. ")
        item_loader.add_xpath("name", '//h2/span[@class="name"]/text()')
        item_loader.add_value("id", response.url, re=r"song/(\d+)")
        item_loader.add_re("id", r"source_id: '(\d+)'")
        item_loader.add_xpath("singer",
                              '//ul/li/span[@class="author_list"]/@title')
        item_loader.add_xpath("album",
                              '//ul/li[contains(text(), "专辑")]/a/text()')
        item_loader.add_xpath("tags",
                              '//ul/li[@class="clearfix tag"]/a/text()')
        node_list = list()
        # Add a request to fetch the lyrics
        lyrics_url = "".join(
            response.xpath(
                '//div[@id="lyricCont"]/@data-lrclink').extract()).strip()
        if lyrics_url:
            node_list.append(("lyrics", item_loader, {"url": lyrics_url}))

        node_list.append(("source_url", item_loader, {
            "url":
            "http://tingapi.ting.baidu.com/v1/restserver/"
            "ting?method=baidu.ting.song.play&format=jsonp"
            "&callback=jQuery_%s&songid=%s&_=%s" %
            (int(time.time() * 1000),
             (re_search(r"song/(\d+)", response.url)
              or re_search(r"source_id: '(\d+)'", response.body)),
             int(time.time() * 1000))
        }))
        return node_list
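
These snippets call a small re_search helper instead of re.search directly; its definition is not part of the listing. Judging from the call sites (it takes a default and its result is used as a plain string), a rough stand-in might look like this:

import re


def re_search(pattern, string, default=""):
    # Guessed stand-in for the project's helper: return the first capture
    # group of the first match, or `default` when nothing matches.
    # `string` may be bytes (e.g. response.body) or None (a missing header).
    if isinstance(string, bytes):
        string = string.decode("utf-8", errors="ignore")
    match = re.search(pattern, string or "")
    return match.group(1) if match else default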
Example #2
def cur_to_requests(curl_cmd, filename):
    tmpl = """import requests


def main():
    url = "{}"
    headers = {}
    form = {}
    json = {}
    resp = requests.{}(url, headers=headers, json=json, data=form)
    print(resp.text)


main()
"""
    url = re.search(r"'(http.*?)'", curl_cmd).group(1)
    headers = json.dumps(
        dict(tuple(v.strip() for v in header.split(":", 1))
             for header in re.findall(r"-H '(.*?)'", curl_cmd)),
        indent=4)
    form = re_search(r"--data ?'(.*?)'", curl_cmd, default=None)

    json_data = re_search(r"--data-binary \$'(.*?)'", curl_cmd, default=None)
    if form:
        form = json.dumps(
            dict(tuple(param.split("=", 1))
                 for param in form.replace("+", " ").split("&")),
            indent=4)
    if json_data:
        # Turn the JSON payload into a Python literal: swap false/true/null
        # for their Python spellings and decode \uXXXX escapes. The original
        # snippet used an undefined helper named repl here; an inline decoder
        # stands in for it.
        json_data = re.sub(
            r"\\u\w{4}",
            lambda m: m.group().encode().decode("unicode_escape"),
            json.dumps(json.loads(json_data), indent=4)
            .replace("false", "False")
            .replace("true", "True")
            .replace("null", "None"))

    with open(filename, "w") as f:
        f.write(tmpl.format(
            url,
            headers,
            form,
            json_data,
            "post" if form or json_data else "get"))
Example #3
def cur_to_requests(curl_cmd, filename):
    tmpl = """import requests


def main():
    url = "{}"
    headers = {}
    form = {}
    json = {}
    resp = requests.{}(url, headers=headers, json=json, data=form)
    print(resp.text)


main()
"""
    url = re.search(r"'(http.*?)'", curl_cmd).group(1)
    headers = json.dumps(
        dict(tuple(v.strip() for v in header.split(":", 1))
             for header in re.findall(r"-H '(.*?)'", curl_cmd)),
        indent=10)
    form = re_search(r"--data '(.*?)'", curl_cmd)
    json_data = re_search(r"--data-binary '(.*?)'", curl_cmd, None)
    if form:
        data = json.dumps(
            dict(tuple(param.split("=", 1))
                 for param in form.replace("+", " ").split("&")),
            indent=10)
    else:
        data = None
    if json_data:
        json_data = json.dumps(json.loads(json_data), indent=10)
    with open(filename, "w") as f:
        f.write(
            tmpl.format(url, headers, data, json_data,
                        "post" if data or json_data else "get"))
Example #4
    def page_url(self, response):
        # Build the URL of the next song-list page, carrying over the
        # query parameters already present on the current page.
        next_page_url = "http://music.baidu.com/data/user/getsongs"
        query = urldecode(urlparse(response.url).query)
        query.setdefault("ting_uid", re_search(r"artist/(\d+)", response.url))
        query.setdefault("hotmax",
                         re_search(r"var hotbarMax = (\d+)", response.body))
        query["order"] = "hot"
        query[".r"] = str(random.random()) + str(int(time.time() * 1000))
        query.setdefault("pay", "")
        return "%s?%s" % (next_page_url, urlencode(query))
Example #5
    def enrich_comments(self, item_loader, response):
        self.logger.debug("Start to enrich_comments. ")
        comments = response.xpath(
            '//div[@id="comments"]/div[@class="comment-item"]')
        comment_list = list()
        for comment_div in comments:
            comment = dict()
            comment["author"] = xpath_exchange(
                comment_div.xpath('div/a/@title'))
            comment["upvotes"] = xpath_exchange(
                comment_div.xpath('div/h3/span/span[@class="votes"]/text()'))

            comment["score"] = int(
                re_search(
                    r"allstar(\d+)",
                    xpath_exchange(
                        comment_div.xpath(
                            'div/h3/span[@class="comment-info"]/span[contains(@class, "rating")]/@class'
                        ))) or 0) / 5
            comment["datetime"] = xpath_exchange(
                comment_div.xpath(
                    'div/h3/span/span[@class="comment-time "]/@title'))
            comment["content"] = xpath_exchange(
                comment_div.xpath('div/p/text()'))
            comment_list.append(comment)
        item_loader.add_value("comments", comment_list)
        next_url = xpath_exchange(
            response.xpath('//div[@id="paginator"]/a[@class="next"]/@href'))
        if next_url:
            return [("comments", item_loader, {
                "url": response.urljoin(next_url)
            })]
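
xpath_exchange also comes from outside the listing. The inline "".join(...).strip() in the first snippet suggests it does the same thing for a selector list:

def xpath_exchange(selector_list):
    # Guessed stand-in: join everything the XPath matched into one
    # stripped string; returns "" when nothing matched.
    return "".join(selector_list.extract()).strip()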
Example #6
    def is_anonymous(self, proxy):
        # Fetch an IP-echo page through the proxy; if the page cannot see
        # our real IP, the proxy is anonymous.
        url = "http://www.98bk.com/cycx/ip1/"
        resp = requests.get(url, timeout=10, headers=self.headers,
                            proxies={"http": "http://%s" % proxy})
        # The page is GBK-encoded, but requests falls back to ISO-8859-1;
        # re-encode and decode to recover the text.
        buf = resp.text.encode("iso-8859-1").decode("gbk")
        # The pattern matches the page text "您的真实IP是..." ("your real IP is ...").
        real_ip = re_search(r"您的真实IP是([\d\.]+)", buf)
        self.logger.info(f"My ip: {self.my_ip}, real ip: {real_ip}")
        return real_ip == "" or not self.my_ip.count(real_ip)
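
The encode/decode round-trip above recovers GBK text that requests mis-decoded as ISO-8859-1 (its fallback when the server sends no charset). A quick demonstration of the trick:

# Simulate the mojibake: GBK bytes wrongly decoded as ISO-8859-1...
garbled = "中文".encode("gbk").decode("iso-8859-1")
# ...then re-encode with the same codec and decode as GBK to recover it.
print(garbled.encode("iso-8859-1").decode("gbk"))  # -> 中文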
Example #7
    def download(self, url, name):
        resp = self.downloader.get(url, stream=True)
        # Prefer the server-supplied filename from Content-Disposition,
        # falling back to the caller's name.
        filename = unquote(
            re_search(r'filename="(.*?)"(?:;|$)',
                      resp.headers.get("Content-Disposition", ""))) or name
        with open(filename, "wb") as f:
            # Stream from the response already in hand rather than
            # requesting the same URL a second time.
            for chunk in resp.iter_content(chunk_size=1024):
                f.write(chunk)
Example #8
    def enrich_related_pics(self, item_loader, response):
        self.logger.debug("Start to enrich_related_pics. ")
        types = response.xpath('//div[@class="article"]/div[@class="mod"]')
        related_pics = dict()
        for type_node in types:
            # Each section heading keys the list of image URLs below it.
            title = re_search(
                r"(\w+)",
                xpath_exchange(type_node.xpath('div[@class="hd"]/h2/text()')))
            related_pics[title] = type_node.xpath(
                'div[@class="bd"]/ul/li/a/img/@src').extract()

        item_loader.add_value("related_pics", related_pics)
Example #9
    def enrich_celebrities(self, item_loader, response):
        self.logger.debug("Start to enrich_celebrities. ")
        positions = response.xpath(
            '//div[@id="celebrities"]/div[@class="list-wrapper"]')
        celebrity_list = list()

        for position in positions:
            pos = xpath_exchange(position.xpath("h2/text()"))
            for position_actor in position.xpath('ul/li'):
                celebrity = dict()
                role = xpath_exchange(
                    position_actor.xpath('div/span[@class="role"]/text()'))
                if role:
                    # "饰" means "plays"; capture the character's name after it.
                    celebrity["role"] = re_search(r"饰 (.*)", role)
                celebrity["position"] = pos
                celebrity["name"] = xpath_exchange(
                    position_actor.xpath('div/span[@class="name"]/a/text()'))
                celebrity["representative"] = position_actor.xpath(
                    'div/span[@class="works"]/a/text()').extract()
                celebrity_list.append(celebrity)
        item_loader.add_value("celebrities", celebrity_list)
Example #10
    def download(self, url, name, proxy):
        # Skip files that already look complete (>100 KB on disk).
        if not (os.path.exists(name) and os.path.getsize(name) > 100000):
            try:
                try:
                    # `headers` is defined elsewhere in the original module.
                    resp = self.downloader.get(
                        url,
                        headers=headers,
                        stream=True,
                        timeout=20,
                    )
                except Exception:
                    # The direct request failed; retry through the proxy.
                    resp = self.downloader.get(url,
                                               headers=headers,
                                               stream=True,
                                               timeout=20,
                                               proxies={
                                                   "http": proxy,
                                                   "https": proxy
                                               })
                with open(name, "wb") as f:
                    have_recv = 0
                    total = int(
                        re_search(r"(\d+)",
                                  resp.headers.get("Content-Length"),
                                  default=0))
                    if total:
                        for chunk in resp.iter_content(chunk_size=1024000):
                            have_recv += len(chunk)
                            self.logger.debug(
                                "Got %s from: %s, progress: %s" %
                                (name, url, round(have_recv / total, 2)))
                            f.write(chunk)
                    else:
                        self.logger.error("Got no data from %s. " % url)
            finally:
                # Remove an empty file left behind by a failed download.
                try:
                    if not os.path.getsize(name):
                        os.unlink(name)
                except OSError:
                    pass