Example No. 1
def get_one_page_blog(account_id, page_count):
    # e.g. http://blog.sina.com.cn/s/articlelist_{account_id}_0_{page_count}.html
    blog_pagination_url = "http://blog.sina.com.cn/s/articlelist_%s_0_%s.html" % (account_id, page_count)
    blog_pagination_response = net.http_request(blog_pagination_url, method="GET")
    result = {
        "blog_info_list": [],  # list of blog entry info dicts
        "is_over": False,  # whether this is the last page
    }
    if blog_pagination_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        if page_count == 1 and blog_pagination_response.data.find("抱歉,您要访问的页面不存在或被删除!") >= 0:
            raise crawler.CrawlerException("账号不存在")
        article_list_selector = PQ(blog_pagination_response.data.decode("UTF-8")).find(".articleList .articleCell")
        if article_list_selector.size() == 0:
            raise crawler.CrawlerException("页面截取日志列表失败\n%s" % blog_pagination_response.data)
        for article_index in range(article_list_selector.size()):
            result_blog_info = {
                "blog_url": None,  # blog entry URL
                "blog_time": None,  # blog entry timestamp
                "blog_title": "",  # blog entry title
            }
            article_selector = article_list_selector.eq(article_index)
            # Get the blog entry URL
            blog_url = article_selector.find("span.atc_title a").attr("href")
            if not blog_url:
                raise crawler.CrawlerException("日志列表解析日志地址失败\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_url"] = str(blog_url)
            # Get the blog entry title
            blog_title = article_selector.find("span.atc_title a").text().encode("UTF-8")
            if not blog_title:
                raise crawler.CrawlerException("日志列表解析日志标题失败\n%s" % article_selector.html().encode("UTF-8"))
            result_blog_info["blog_title"] = str(blog_title)
            # Get the blog entry timestamp
            blog_time = article_selector.find("span.atc_tm").text()
            if not blog_time:
                raise crawler.CrawlerException("日志列表解析日志时间失败\n%s" % article_selector.html().encode("UTF-8"))
            try:
                result_blog_info["blog_time"] = int(time.mktime(time.strptime(blog_time, "%Y-%m-%d %H:%M")))
            except ValueError:
                raise crawler.CrawlerException("日志时间格式不正确\n%s" % blog_time)
            result["blog_info_list"].append(result_blog_info)
        # Get pagination info
        pagination_html = tool.find_sub_string(blog_pagination_response.data, '<div class="SG_page">', '</div>')
        if not pagination_html:
            result["is_over"] = True
        else:
            max_page_count = tool.find_sub_string(pagination_html, "共", "页")
            if not crawler.is_integer(max_page_count):
                raise crawler.CrawlerException("分页信息截取总页数失败\n%s" % pagination_html)
            result["is_over"] = page_count >= int(max_page_count)
    else:
        raise crawler.CrawlerException(crawler.request_failre(blog_pagination_response.status))
    return result
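A minimal usage sketch for get_one_page_blog, assuming the project's net/crawler/tool modules are importable alongside it; get_all_blog_urls is a hypothetical helper name that simply pages forward until is_over is reported:

def get_all_blog_urls(account_id):
    # Hypothetical helper: walk every pagination page and collect the blog entry URLs.
    blog_url_list = []
    page_count = 1
    while True:
        pagination_result = get_one_page_blog(account_id, page_count)
        for blog_info in pagination_result["blog_info_list"]:
            blog_url_list.append(blog_info["blog_url"])
        if pagination_result["is_over"]:
            break
        page_count += 1
    return blog_url_list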
Example No. 2
def get_one_page_album(account_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    album_pagination_url = "http://bcy.net/u/%s/post/cos" % account_id
    query_data = {"p": page_count}
    album_pagination_response = net.http_request(album_pagination_url, method="GET", fields=query_data)
    result = {
        "album_info_list": [],  # list of album info dicts
        "coser_id": None,  # coser id
        "is_over": False,  # whether this is the last page of albums
    }
    if album_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(album_pagination_response.status))
    if page_count == 1 and album_pagination_response.data.find("<h2>用户不存在</h2>") >= 0:
        raise crawler.CrawlerException("账号不存在")
    # Get the coser id
    coser_id_find = re.findall('<a href="/coser/detail/([\d]+)/\$\{post.rp_id\}', album_pagination_response.data)
    if len(coser_id_find) != 1:
        raise crawler.CrawlerException("页面截取coser id失败\n%s" % album_pagination_response.data)
    if not crawler.is_integer(coser_id_find[0]):
        raise crawler.CrawlerException("页面截取coser id类型不正确\n%s" % album_pagination_response.data)
    result["coser_id"] = coser_id_find[0]
    # Get album info
    album_list_selector = PQ(album_pagination_response.data.decode("UTF-8")).find("ul.l-grid__inner li.l-grid__item")
    for album_index in range(0, album_list_selector.size()):
        album_selector = album_list_selector.eq(album_index)
        result_album_info = {
            "album_id": None,  # album id
            "album_title": None,  # album title
        }
        # Get the album id
        album_url = album_selector.find(".postWorkCard__img a.postWorkCard__link").attr("href")
        if not album_url:
            raise crawler.CrawlerException("作品信息截取作品地址失败\n%s" % album_selector.html().encode("UTF-8"))
        album_id = str(album_url).split("/")[-1]
        if not crawler.is_integer(album_id):
            raise crawler.CrawlerException("作品地址 %s 截取作品id失败\n%s" % (album_url, album_selector.html().encode("UTF-8")))
        result_album_info["album_id"] = album_id
        # Get the album title
        album_title = album_selector.find(".postWorkCard__img img").attr("alt")
        result_album_info["album_title"] = str(album_title.encode("UTF-8"))
        result["album_info_list"].append(result_album_info)
    # Check whether this is the last page
    last_pagination_selector = PQ(album_pagination_response.data).find("#js-showPagination ul.pager li:last a")
    if last_pagination_selector.size() == 1:
        max_page_count = int(last_pagination_selector.attr("href").strip().split("&p=")[-1])
        result["is_over"] = page_count >= max_page_count
    else:
        result["is_over"] = True
    return result
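A sketch of one way the returned data might be combined. The /coser/detail/<coser_id>/<album_id> URL pattern is only inferred from the regular expression inside get_one_page_album and is an assumption, not confirmed by the source; build_album_detail_urls is a hypothetical helper name:

def build_album_detail_urls(account_id, page_count):
    # Hypothetical helper: join the coser id with each album id into a detail page URL.
    # The URL pattern is inferred from the regex above and may not match the live site.
    pagination_result = get_one_page_album(account_id, page_count)
    album_url_list = []
    for album_info in pagination_result["album_info_list"]:
        album_url_list.append("http://bcy.net/coser/detail/%s/%s" % (pagination_result["coser_id"], album_info["album_id"]))
    return album_url_list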
Example No. 3
def get_market_game_trade_card_price(game_id, login_cookie):
    cookies_list = {"steamLogin": login_cookie}
    market_search_url = "http://steamcommunity.com/market/search/render/"
    market_search_url += "?query=&count=20&appid=753&category_753_Game[0]=tag_app_%s&category_753_cardborder[0]=tag_cardborder_0" % game_id
    market_search_response = net.http_request(market_search_url,
                                              method="GET",
                                              cookies_list=cookies_list,
                                              json_decode=True)
    if market_search_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(market_search_response.status))
    market_item_list = {}
    if not crawler.check_sub_key(
        ("success", "results_html"), market_search_response.json_data):
        raise crawler.CrawlerException(
            "返回信息'success'或'results_html'字段不存在\n%s" %
            market_search_response.json_data)
    if market_search_response.json_data["success"] is not True:
        raise crawler.CrawlerException("返回信息'success'字段取值不正确\n%s" %
                                       market_search_response.json_data)
    card_selector = PQ(market_search_response.json_data["results_html"]).find(
        ".market_listing_row_link")
    for index in range(0, card_selector.size()):
        card_name = card_selector.eq(index).find(
            ".market_listing_item_name").text()
        card_min_price = card_selector.eq(index).find(
            "span.normal_price span.normal_price").text().encode(
                "UTF-8").replace("¥ ", "")
        market_item_list[card_name] = card_min_price
    # {'Pamu': '1.77', 'Fumi (Trading Card)': '2.14', 'Mio (Trading Card)': '1.33', 'Bonnibel (Trading Card)': '1.49', 'Groupshot': '1.87', 'Q-Piddy': '1.35', 'Elle (Trading Card)': '1.19', 'Quill': '1.50', 'Iro (Trading Card)': '1.42', 'Bearverly (Trading Card)': '1.27', 'Cassie (Trading Card)': '1.35'}
    return market_item_list
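A sketch of consuming the returned mapping, assuming the prices stay in the plain "1.77"-style form shown in the sample output above (other formats would break the float() call); get_cheapest_card is a hypothetical helper name:

def get_cheapest_card(game_id, login_cookie):
    # Hypothetical helper: return the (card name, price) pair with the lowest listed price.
    market_item_list = get_market_game_trade_card_price(game_id, login_cookie)
    if not market_item_list:
        return None
    cheapest_card_name = min(market_item_list, key=lambda card_name: float(market_item_list[card_name]))
    return cheapest_card_name, market_item_list[cheapest_card_name]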
Example No. 4
def get_self_account_badges(account_id, login_cookie):
    # First page of badges
    badges_index_url = "http://steamcommunity.com/profiles/%s/badges/" % account_id
    cookies_list = {"steamLogin": login_cookie}
    badges_index_response = net.http_request(badges_index_url,
                                             method="GET",
                                             cookies_list=cookies_list)
    if badges_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(badges_index_response.status))
    badges_detail_url_list = []
    # Badge <div> elements
    badges_selector = PQ(badges_index_response.data).find(
        ".maincontent .badges_sheet .badge_row")
    for index in range(0, badges_selector.size()):
        badge_html = badges_selector.eq(index).html().encode("UTF-8")
        # Badges that have already dropped all of their cards
        if badge_html.find("无剩余卡牌掉落") >= 0:
            # Badge detail page URL
            badge_detail_url = tool.find_sub_string(
                badge_html, '<a class="badge_row_overlay" href="', '"/>')
            if not badge_detail_url:
                raise crawler.CrawlerException("徽章信息截取徽章详细界面地址失败\n%s" %
                                               badge_html)
            badges_detail_url_list.append(badge_detail_url)
    # ['http://steamcommunity.com/profiles/76561198172925593/gamecards/459820/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/357200/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/502740/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/359600/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/354380/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/359670/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/525300/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/337980/', 'http://steamcommunity.com/profiles/76561198172925593/gamecards/591420/']
    return badges_detail_url_list
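A sketch of extracting game ids from the returned URLs, based on the .../gamecards/<game_id>/ pattern visible in the sample output above; get_full_drop_game_ids is a hypothetical helper name:

def get_full_drop_game_ids(account_id, login_cookie):
    # Hypothetical helper: pull the numeric game id out of each badge detail URL.
    game_id_list = []
    for badge_detail_url in get_self_account_badges(account_id, login_cookie):
        game_id = badge_detail_url.rstrip("/").split("/")[-1]
        if crawler.is_integer(game_id):
            game_id_list.append(game_id)
    return game_id_list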
Example No. 5
def get_one_page_photo(page_count):
    photo_pagination_url = "http://jigadori.fkoji.com/"
    query_data = {"p": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url, method="GET", fields=query_data)
    result = {
        "image_info_list": [],  # list of image info dicts
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode("UTF-8")).find("#wrapper .row .photo")
    for photo_index in range(0, photo_list_selector.size()):
        photo_selector = photo_list_selector.eq(photo_index)
        photo_selector_html = photo_selector.html().encode("UTF-8")
        result_photo_info = {
            "account_name": "",  # twitter account name
            "image_url_list": [],  # image URLs
            "tweet_id": None,  # tweet id
            "tweet_time": None,  # tweet publish time
        }
        # Get the tweet id
        tweet_url = photo_selector.find(".photo-link-outer a").eq(0).attr("href")
        if not tweet_url:
            raise crawler.CrawlerException("图片信息截取tweet地址失败\n%s" % photo_selector_html)
        tweet_id = tool.find_sub_string(tweet_url.strip(), "status/")
        if not crawler.is_integer(tweet_id):
            raise crawler.CrawlerException("tweet地址截取tweet id失败\n%s" % tweet_url)
        result_photo_info["tweet_id"] = int(tweet_id)
        # Get the twitter account name
        account_name = photo_selector.find(".user-info .user-name .screen-name").text()
        if not account_name:
            raise crawler.CrawlerException("图片信息截取twitter账号失败\n%s" % photo_selector_html)
        result_photo_info["account_name"] = str(account_name).strip().replace("@", "")
        # Get the tweet publish time
        tweet_time = photo_selector.find(".tweet-text .tweet-created-at").text().strip()
        if not tweet_time:
            raise crawler.CrawlerException("图片信息截取tweet发布时间失败\n%s" % photo_selector_html)
        try:
            result_photo_info["tweet_time"] = int(time.mktime(time.strptime(str(tweet_time).strip(), "%Y-%m-%d %H:%M:%S")))
        except ValueError:
            raise crawler.CrawlerException("tweet发布时间文本格式不正确\n%s" % tweet_time)
        # Get the image URLs
        image_list_selector = photo_selector.find(".photo-link-outer a img")
        for image_index in range(0, image_list_selector.size()):
            image_url = image_list_selector.eq(image_index).attr("src")
            if not image_url:
                raise crawler.CrawlerException("图片列表截取图片地址失败\n%s" % image_list_selector.eq(image_index).html())
            result_photo_info["image_url_list"].append(str(image_url).strip())
        result["image_info_list"].append(result_photo_info)
    return result
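A sketch of downloading the collected image URLs, assuming net.http_request exposes the raw body via .data as it does above; the save path scheme and the .jpg extension are placeholders, and save_one_page_photos is a hypothetical helper name:

import os

def save_one_page_photos(page_count, save_dir):
    # Hypothetical helper: fetch each image and write the raw response body to disk.
    pagination_result = get_one_page_photo(page_count)
    for photo_info in pagination_result["image_info_list"]:
        for image_index, image_url in enumerate(photo_info["image_url_list"]):
            image_response = net.http_request(image_url, method="GET")
            if image_response.status != net.HTTP_RETURN_CODE_SUCCEED:
                continue
            file_path = os.path.join(save_dir, "%s_%s.jpg" % (photo_info["tweet_id"], image_index))
            with open(file_path, "wb") as file_handle:
                file_handle.write(image_response.data)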
Example No. 6
def get_account_index_page(account_name):
    account_index_url = "http://%s.pp.163.com/" % account_name
    account_index_response = net.http_request(account_index_url, method="GET")
    result = {
        "album_url_list": [],  # all album URLs
    }
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(account_index_response.status))
    # Re-encode the page from GBK to UTF-8
    account_index_html = account_index_response.data.decode("GBK").encode(
        "UTF-8")
    if account_index_html.find("<title>该页面不存在</title>") >= 0:
        raise crawler.CrawlerException("账号不存在")
    # Get all album URLs
    album_result_selector = PQ(account_index_html).find("#p_contents li")
    if album_result_selector.size() == 0:
        raise crawler.CrawlerException("页面匹配相册列表失败\n%s" % account_index_html)
    for album_index in range(0, album_result_selector.size()):
        result["album_url_list"].append(
            str(
                album_result_selector.eq(album_index).find("a.detail").attr(
                    "href")))
    return result
Example No. 7
def get_account_talks(account_id, account_name, talk_list):
    account_index = "https://7gogo.jp/users/%s" % account_id
    account_index_response = net.http_request(account_index, method="GET")
    if account_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(account_index_response.status))
    talk_list_selector = PQ(account_index_response.data.decode("UTF-8")).find(
        ".UserTalkWrapper .UserTalk")
    for talk_index in range(0, talk_list_selector.size()):
        talk_selector = talk_list_selector.eq(talk_index)
        # Get the talk URL
        talk_url_path = talk_selector.attr("href")
        if not talk_url_path:
            raise crawler.CrawlerException("talk信息截取talk地址失败\n%s" %
                                           talk_selector.html().encode("UTF-8"))
        talk_id = str(talk_url_path.replace("/", ""))
        if not talk_id:
            raise crawler.CrawlerException("talk地址截取talk id失败\n%s" %
                                           talk_url_path)
        # Get the talk name
        talk_name = talk_selector.find(".UserTalk__talkname").text()
        if not talk_name:
            raise crawler.CrawlerException("talk信息截取talk名字失败\n%s" %
                                           talk_selector.html().encode("UTF-8"))
        talk_name = crawler.filter_emoji(
            str(talk_name.encode("UTF-8")).strip())
        # Get the talk description
        talk_description = crawler.filter_emoji(
            talk_selector.find(".UserTalk__description").text())
        if talk_description:
            talk_description = crawler.filter_emoji(
                str(talk_description.encode("UTF-8")).strip())
        else:
            talk_description = ""
        if talk_id in talk_list:
            talk_list[talk_id]["account_list"].append(account_name)
        else:
            talk_list[talk_id] = {
                "account_list": [account_name],
                "talk_name": talk_name,
                "talk_description": talk_description,
            }
        output.print_msg(account_id + ": " + talk_name + ", " +
                         talk_description)
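A sketch of aggregating talks across several accounts, relying only on the fact that get_account_talks mutates the talk_list dict passed to it; get_talks_for_accounts and account_info_list are hypothetical names:

def get_talks_for_accounts(account_info_list):
    # Hypothetical helper: account_info_list is an iterable of (account_id, account_name) pairs.
    talk_list = {}
    for account_id, account_name in account_info_list:
        get_account_talks(account_id, account_name, talk_list)
    return talk_list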
Example No. 8
def get_one_page_account(page_count):
    account_pagination_url = "http://jigadori.fkoji.com/users"
    query_data = {"p": page_count}
    account_pagination_response = net.http_request(account_pagination_url, method="GET", fields=query_data)
    pagination_account_list = {}
    if account_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(crawler.request_failre(account_pagination_response.status))
    account_list_selector = PQ(account_pagination_response.data.decode("UTF-8")).find(".users-list li")
    for account_index in range(0, account_list_selector.size()):
        account_selector = account_list_selector.eq(account_index)
        # Get the member name
        account_name = account_selector.find(".profile-name").eq(0).text()
        if not account_name:
            account_name = ""
            # raise robot.CrawlerException("成员信息截取成员名字失败\n\%s" % account_selector.html().encode("UTF-8"))
        else:
            account_name = account_name.strip().encode("UTF-8")
        # Get the twitter account id
        account_id = account_selector.find(".screen-name a").text()
        if not account_id:
            raise crawler.CrawlerException("成员信息截取twitter账号失败\n%s" % account_selector.html().encode("UTF-8"))
        account_id = account_id.strip().replace("@", "")
        pagination_account_list[account_id] = account_name
    return pagination_account_list
Example No. 9
def get_one_page_audio(account_id, page_count):
    # http://www.ximalaya.com/1014267/index_tracks?page=2
    audit_pagination_url = "http://www.ximalaya.com/%s/index_tracks" % account_id
    query_data = {"page": page_count}
    audit_pagination_response = net.http_request(audit_pagination_url,
                                                 method="GET",
                                                 fields=query_data,
                                                 json_decode=True)
    result = {
        "audio_info_list": [],  # list of audio info dicts parsed from the page
        "is_over": False,  # whether this is the last page
    }
    if audit_pagination_response.status == 404:
        raise crawler.CrawlerException("账号不存在")
    elif audit_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(audit_pagination_response.status))
    if not crawler.check_sub_key(
        ("res", "html"), audit_pagination_response.json_data):
        raise crawler.CrawlerException("返回数据'res'或'html'字段不存在\n%s" %
                                       audit_pagination_response.json_data)
    if audit_pagination_response.json_data["res"] is not True:
        raise crawler.CrawlerException("返回数据'res'字段取值不正确\n%s" %
                                       audit_pagination_response.json_data)
    # Get audio info
    audio_list_selector = PQ(audit_pagination_response.json_data["html"]).find(
        "ul.body_list li.item")
    for audio_index in range(0, audio_list_selector.size()):
        audio_info = {
            "audio_id": None,  # audio id parsed from the page
            "audio_title": "",  # audio title parsed from the page
        }
        audio_selector = audio_list_selector.eq(audio_index)
        # Get the audio id
        audio_id = audio_selector.find(".content_wrap").attr("sound_id")
        if not crawler.is_integer(audio_id):
            raise crawler.CrawlerException(
                "歌曲信息匹配歌曲id失败\n%s" %
                audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_id"] = str(audio_id)
        # Get the audio title
        audio_title = audio_selector.find(".sound_title").attr("title")
        if not audio_title:
            raise crawler.CrawlerException(
                "歌曲信息匹配歌曲标题失败\n%s" %
                audio_list_selector.html().encode("UTF-8"))
        audio_info["audio_title"] = str(audio_title.encode("UTF-8").strip())
        result["audio_info_list"].append(audio_info)
    # Check whether this is the last page
    max_page_count = 1
    pagination_list_selector = PQ(
        audit_pagination_response.json_data["html"]).find(
            ".pagingBar_wrapper a.pagingBar_page")
    for pagination_index in range(0, pagination_list_selector.size()):
        pagination_selector = pagination_list_selector.eq(pagination_index)
        data_page = pagination_selector.attr("data-page")
        if data_page is None:
            continue
        if not crawler.is_integer(data_page):
            raise crawler.CrawlerException(
                "分页信息匹配失败\n%s" % audio_list_selector.html().encode("UTF-8"))
        max_page_count = max(max_page_count, int(data_page))
    result["is_over"] = page_count >= max_page_count
    return result
Example No. 10
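# Nested helper extracted from a test case: it relies on the enclosing test's
# self.client (e.g. Django's test client) and an eq_ assertion helper, plus PyQuery.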
def check(expected):
    response = self.client.get('/', follow=True)
    account = PyQuery(response.content)('ul.account')
    tools = PyQuery(response.content)('ul.tools')
    eq_(account.size(), expected)
    eq_(tools.size(), expected)
Example No. 11
def get_one_page_photo(page_count):
    photo_pagination_url = "http://kelagirls.com/bizhi!findForIndexMore.action"
    query_data = {"page": page_count}
    photo_pagination_response = net.http_request(photo_pagination_url,
                                                 method="GET",
                                                 fields=query_data)
    result = {
        "image_info_list": [],  # list of image info dicts
        "is_over": False,  # whether this is the last page of wallpapers
    }
    if photo_pagination_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(photo_pagination_response.status))
    photo_list_selector = PQ(photo_pagination_response.data.decode(
        "UTF-8")).find(".bizhinmore .bizhi")
    if photo_list_selector.size() == 0:
        raise crawler.CrawlerException("页面匹配图片列失败\n%s" %
                                       photo_pagination_response.data)
    for photo_index in range(0, photo_list_selector.size()):
        result_image_info = {
            "image_id": None,  # image id
            "image_url": None,  # image URL
            "model_name": "",  # model name
        }
        # Get the image id
        image_id = photo_list_selector.eq(photo_index).find(
            ".bizhibigwrap").attr("id")
        if not image_id:
            raise crawler.CrawlerException(
                "图片列表匹配图片id失败\n%s" %
                photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        if not (image_id[0:3] == "big" and crawler.is_integer(image_id[3:])):
            raise crawler.CrawlerException(
                "图片列表匹配的图片id格式不正确\n%s" %
                photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["image_id"] = str(image_id[3:])
        # Get the image URL
        image_path = photo_list_selector.eq(photo_index).find(
            ".bizhibig img").eq(1).attr("src")
        if not image_path:
            raise crawler.CrawlerException(
                "图片列表匹配图片地址失败\n%s" %
                photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["image_url"] = "http://kelagirls.com/" + str(
            image_path.encode("UTF-8"))
        # Get the model name
        model_name = photo_list_selector.eq(photo_index).find(
            ".bzwdown span").eq(0).text().encode("UTF-8")
        if not model_name:
            raise crawler.CrawlerException(
                "图片列表匹配模特名字失败\n%s" %
                photo_list_selector.eq(photo_index).html().encode("UTF-8"))
        result_image_info["model_name"] = str(model_name)
        result["image_info_list"].append(result_image_info)
    # Check whether this is the last page
    pagination_selector = PQ(
        photo_pagination_response.data.decode("UTF-8")).find(".pageBottom div")
    max_page_count = page_count
    for pagination_index in range(0, pagination_selector.size()):
        if crawler.is_integer(pagination_selector.eq(pagination_index).text()):
            max_page_count = max(
                max_page_count,
                int(pagination_selector.eq(pagination_index).text()))
    result["is_over"] = page_count >= max_page_count
    return result