Example #1
def get_article_title(article_page, article_type):
    if article_type == "t":
        return tool.find_sub_string(article_page, '<div class="title" node-type="articleTitle">', "</div>")
    elif article_type == "p":
        return tool.find_sub_string(article_page, '<h1 class=\\"title\\">', "<\\/h1>")
    else:
        return None
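Every example on this page goes through tool.find_sub_string, whose implementation is not shown. From the call sites here (two to four arguments, an end marker of None apparently meaning "to the end of the string", and a fourth argument of 1, 2, or 3), a minimal sketch of its assumed behavior follows; the mode semantics (1 = keep the start marker, 2 = keep the end marker, 3 = keep both) and the empty-string failure value are inferences from usage, not the library's confirmed API.

def find_sub_string(haystack, start_string, end_string=None, mode=0):
    # Hypothetical reimplementation: returns "" on failure, which matches how
    # callers in these examples chain .strip()/.replace() on the result.
    start_index = haystack.find(start_string)
    if start_index == -1:
        return ""
    content_start = start_index + len(start_string)
    if end_string is None:
        content_end = marker_end = len(haystack)
    else:
        content_end = haystack.find(end_string, content_start)
        if content_end == -1:
            return ""
        marker_end = content_end + len(end_string)
    # Assumed mode semantics: 1 keeps the start marker, 2 keeps the end
    # marker, 3 keeps both; the default (0) keeps neither.
    left = start_index if mode in (1, 3) else content_start
    right = marker_end if mode in (2, 3) else content_end
    return haystack[left:right]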
Example #2
def get_account_from_index():
    index_url = "http://www.keyakizaka46.com/mob/news/diarShw.php"
    query_data = {"cd": "member"}
    index_response = net.http_request(index_url,
                                      method="GET",
                                      fields=query_data)
    account_list = {}
    if index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(index_response.status))
    member_list_data = tool.find_sub_string(index_response.data,
                                            '<ul class="thumb">', "</ul>")
    if not member_list_data:
        raise crawler.CrawlerException("页面截取账号列表失败\n%s" % index_response.data)
    member_list_find = re.findall("<li ([\S|\s]*?)</li>", member_list_data)
    for member_info in member_list_find:
        # Extract the account id
        account_id = tool.find_sub_string(member_info, "&ct=", '">')
        if not account_id:
            raise crawler.CrawlerException("账号信息截取账号id失败\n%s" % member_info)
        # Extract the member name
        account_name = tool.find_sub_string(member_info, '<p class="name">',
                                            "</p>").strip().replace(" ", "")
        if not account_name:
            raise crawler.CrawlerException("账号信息截取成员名字失败\n%s" % member_info)
        account_list[account_id] = account_name
    return account_list
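A hypothetical call site for the function above, showing how the raised CrawlerException might be handled; the printing is illustrative only, and crawler is assumed to be the project's own module.

try:
    account_list = get_account_from_index()
except crawler.CrawlerException as e:
    account_list = {}
    print("crawl failed: %s" % e)
for account_id, account_name in account_list.items():
    print("%s\t%s" % (account_id, account_name))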
Example #3
def jkt(file_handle):
    index_url = "http://www.jkt48.com/member/list"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        page = tool.find_sub_string(index_response.data, '<div id="mainCol">',
                                    "<!--end #mainCol-->", 1)
        start_index = 0
        start_index_list = []
        while start_index != -1:
            start_index = page.find('<a name="', start_index + 1)
            start_index_list.append(start_index)
        for i in range(0, len(start_index_list) - 1):
            start = start_index_list[i]
            end = start_index_list[i + 1]
            if end == -1:
                end = len(page)
            split_page = page[start:end]
            team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
            if team_name.find("Team") == -1:
                team_name = "Team kenkyusei"
            team_name = "JKT48 " + team_name
            member_list = re.findall(
                r'<div class="profileWrap">([\s\S]*?)</div><!--/loop-->',
                split_page)
            for member in member_list:
                member = member.replace("<br>", "").replace("\n", "").replace(
                    "\r", "").replace("\t", "")
                japanese_name = english_name = tool.find_sub_string(
                    member, 'alt="', '"')
                file_handle.write(japanese_name + "\t" + english_name + "\t" +
                                  team_name + "\n")
Example #4
def ske(file_handle):
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII": ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei": ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->")
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        for team_name in split_list:
            team_page = tool.find_sub_string(page, split_list[team_name][0], split_list[team_name][1])
            member_list = re.findall(r'<dl>([\s\S]*?)</dl>', team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name_find = re.findall(r'<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
                english_name = tool.find_sub_string(member, '<h3 class="en">', '</h3>')
                plus_text = tool.find_sub_string(member, '<li class="textPlus">', '</li>')
                if len(japanese_name_find) != 1:
                    print "error japanese_name_find"
                    continue
                if not english_name:
                    print "error english_name"
                    continue

                japanese_name = japanese_name_find[0].replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                if plus_text and plus_text.find("兼任") > 0:
                    team = team_name + " / " + plus_text.split("/")[-1].strip().replace("チーム", " Team ").replace("兼任", "")
                else:
                    team = team_name

                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
Example #5
def akb(file_handle):
    for team_id in [1, 2, 3, 4, 12]:
        index_url = "http://www.akb48.co.jp/about/members/?team_id=" + str(team_id)
        return_code, page = tool.http_request(index_url)[:2]
        if return_code == 1:
            member_list_page = tool.find_sub_string(page, '<ul class="memberListUl">', '</ul>')
            if member_list_page:
                member_list = re.findall("<li>([\s|\S]*?)</li>", member_list_page)
                for member in member_list:
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                    japanese_name = tool.find_sub_string(member, '<h4 class="memberListNamej">', '</h4>')
                    english_name = tool.find_sub_string(member, '<p class="memberListNamee">', '</p>')
                    team_find = re.findall('<h5 class="memberListTeam">([^<]*)</h5>', member)
                    if not japanese_name:
                        print "error japanese_name"
                        continue
                    if not english_name:
                        print "error english_name"
                        continue
                    if (team_id != 12 and len(team_find) != 1) or (team_id == 12 and len(team_find) != 2):
                        print "error team_find"
                        continue

                    japanese_name = japanese_name.replace(" ", "")
                    first_name, last_name = english_name.split(" ", 1)
                    team = team_find[0].strip().replace("  /", " / ")

                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                print "error member_list_page"
Example #6
def get_game_invalid_achievements(game_id):
    game_index_url = "http://astats.astats.nl/astats/Steam_Game_Info.php"
    query_data = {"AppID": game_id}
    game_index_response = net.http_request(game_index_url,
                                           method="GET",
                                           fields=query_data)
    if game_index_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        output.print_msg("游戏 %s 访问失败" % game_id)
        tool.process_exit()
    # the game id does not exist
    if game_index_response.data.find(
            "This game cannot be found in the database.") >= 0:
        return
    achievement_text = tool.find_sub_string(
        game_index_response.data,
        '<span class="GameInfoBoxRow">Achievements</span><br>', "</td>")
    # no achievements
    if not achievement_text:
        return
    achievement_text = achievement_text.strip()
    if not crawler.is_integer(achievement_text):
        invalid_achievement_text = tool.find_sub_string(
            achievement_text, '<font color="#FF0000">', "</font>")
        if invalid_achievement_text:
            output.print_msg("game %s has invalid achievements: %s" %
                             (game_id, invalid_achievement_text))
        else:
            output.print_msg("game %s has unknown achievement text: %s" %
                             (game_id, achievement_text))
Example #7
def jkt(file_handle):
    index_url = "http://www.jkt48.com/member/list"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        page = tool.find_sub_string(page, '<div id="mainCol">', "<!--end #mainCol-->", 1)
        start_index = 0
        start_index_list = []
        while start_index != -1:
            start_index = page.find('<a name="', start_index + 1)
            start_index_list.append(start_index)
        for i in range(0, len(start_index_list) - 1):
            start = start_index_list[i]
            end = start_index_list[i + 1]
            if end == -1:
                end = len(page)
            split_page = page[start: end]
            team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
            if team_name.find("Team") == -1:
                team_name = "Team kenkyusei"
            team_name = "JKT48 " + team_name
            member_list = re.findall('<div class="profileWrap">([\s|\S]*?)</div><!--/loop-->',split_page)
            for member in member_list:
                member = member.replace("<br>", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name = english_name = tool.find_sub_string(member, 'alt="', '"')

                file_handle.write(japanese_name + "\t" + english_name + "\t" + team_name + "\n")
Example #8
def get_api_info(account_name):
    photo_index_url = "https://www.flickr.com/photos/%s" % account_name
    photo_index_return_code, photo_index_page = tool.http_request(photo_index_url)[:2]
    if photo_index_return_code == 1:
        user_id = tool.find_sub_string(photo_index_page, '"nsid":"', '"')
        site_key = tool.find_sub_string(photo_index_page, '"site_key":"', '"')
        return {"user_id": user_id, "site_key": site_key}
    return None
Example #9
def get_article_id(article_url):
    article_id = tool.find_sub_string(article_url, "http://weibo.com/ttarticle/p/show?id=", "&mod=zwenzhang")
    if article_id:
        return "t_%s" % article_id
    else:
        article_id = tool.find_sub_string(article_url, "http://weibo.com/p/", "?mod=zwenzhang")
        if article_id:
            return "p_%s" % article_id
    return None
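The "t_"/"p_" prefix produced here pairs with get_article_title from Example #1: splitting on the first underscore recovers the article_type flag and the raw id. A short sketch of that round trip (the URL and variable names are illustrative):

article_url = "http://weibo.com/ttarticle/p/show?id=12345&mod=zwenzhang"  # hypothetical
article_id = get_article_id(article_url)
if article_id is not None:
    # "t_12345" -> ("t", "12345"); the prefix selects the parsing branch
    article_type, raw_id = article_id.split("_", 1)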
Example #10
def get_thread_author_post(thread_url):
    thread_return_code, thread_page, thread_response = tool.http_request(thread_url)
    if thread_return_code == 1:
        content_type = tool.get_response_info(thread_response.info(), "Content-Type")
        charset = tool.find_sub_string(content_type, "charset=")
        post_message = tool.find_sub_string(thread_page, '<td class="t_f" id="postmessage_', '<div id="comment_')
        post_message = post_message[post_message.find('">') + 2: post_message.rfind("</td>")]
        return post_message.decode(charset)
    return None
Example #11
def get_blog_time(blog_page):
    blog_time_info = tool.find_sub_string(blog_page, '<span class="articleTime">', "</span>")
    if blog_time_info:
        blog_time_string = tool.find_sub_string(blog_page, 'pubdate="pubdate">', "</time>").strip()
    else:
        blog_time_string = tool.find_sub_string(blog_page, '<span class="date">', "</span>").strip()
    if blog_time_string:
        blog_timestamp = time.strptime(blog_time_string, "%Y-%m-%d %H:%M:%S")
        # Timestamp of the displayed time, in the server's timezone (Japan); no conversion to local time
        return int(time.mktime(blog_timestamp))
    return None
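As a quick worked example of the conversion above (the time string is illustrative; note that time.mktime interprets the struct_time in the machine's local timezone, which is the behavior the comment is flagging):

import time

blog_timestamp = time.strptime("2016-01-02 03:04:05", "%Y-%m-%d %H:%M:%S")
print(int(time.mktime(blog_timestamp)))  # seconds since the epoch, local timezone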
Example #12
def get_image_url_list(blog_page):
    article_data = tool.find_sub_string(blog_page, '<div class="articleText">', "<!--entryBottom-->", 1)
    if not article_data:
        article_data = tool.find_sub_string(blog_page, '<div class="subContentsInner">', "<!--entryBottom-->", 1)
    image_url_list_find = re.findall(r'<img [\s\S]*?src="([^"]*)" [\s\S]*?>', article_data)
    image_url_list = []
    for image_url in image_url_list_find:
        # Filter out emoticon images
        if image_url.find(".ameba.jp/blog/ucs/") == -1:
            image_url_list.append(image_url)
    return image_url_list
Example #13
def get_article_image_url_list(article_page, article_type):
    if article_type == "t":
        article_body = tool.find_sub_string(article_page, '<div class="WB_editor_iframe', '<div class="artical_add_box')
    elif article_type == "p":
        article_body = tool.find_sub_string(article_page, '{"ns":"pl.content.longFeed.index"', "</script>")
        article_body = article_body.replace("\\", "")
    else:
        return None
    if article_body:
        return re.findall('<img[^>]* src="([^"]*)"[^>]*>', article_body)
    return None
Example #14
def get_thread_author_post(thread_url):
    thread_response = net.http_request(thread_url, method="GET")
    if thread_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        post_message = tool.find_sub_string(thread_response.data, '<td class="t_f" id="postmessage_', '<div id="comment_')
        post_message = post_message[post_message.find('">') + 2: post_message.rfind("</td>")]
        content_type = thread_response.getheader("Content-Type")
        if content_type is None:
            return post_message
        charset = tool.find_sub_string(content_type, "charset=")
        return post_message.decode(charset)

    return None
Example #15
def get_member_list():
    index_url = "http://www.keyakizaka46.com/mob/news/diarShw.php?cd=member"
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code:
        member_list_data = tool.find_sub_string(index_page, '<ul class="thumb">', "</ul>")
        if member_list_data:
            member_list_find = re.findall("<li ([\S|\s]*?)</li>", member_list_data)
            for member_info in member_list_find:
                ct = tool.find_sub_string(member_info, "&ct=", '">')
                name = tool.find_sub_string(member_info, '<p class="name">', "</p>").strip().replace(" ", "")
                tool.print_msg("%s\t\t\t%s" % (ct, name), False)
            if len(member_list_find) > 0:
                tool.print_msg("复制以上内容到save.data中,删除不需要的行,即可开始运行", False)
    return None
Example #16
def hkt(file_handle):
    index_url = "http://www.hkt48.jp/profile/"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        team_find = re.findall(r'(<h3>[\s\S]*?)<!-- / .contsbox --></div>', page)
        for team_page in team_find:
            team = tool.find_sub_string(team_page, "<h3>", "</h3>")
            if not team:
                print "error team"
                continue
            team = team.strip()
            member_list = re.findall("<li>([\s|\S]*?)</li>", team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                name_find = re.findall(r'''<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> ''', member)
                if len(name_find) != 1:
                    print "error name_find"
                    continue
                japanese_name, english_name = name_find[0]
                team_plus_find = re.findall('<div class="team_j">([^<]*)</div>', member)
                team_name = team
                if len(team_plus_find) == 1:
                    if team_plus_find[0].find("兼任") >= 0:
                        team_name = team + " / " + team_plus_find[0].split("/")[-1].strip().replace("兼任", "")
                japanese_name = japanese_name.replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)

                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team_name + "\n")
Example #17
def get_picasaweb_page_album_id(account_id, picasaweb_url):
    message_page_return_code, message_page = tool.http_request(picasaweb_url)[:2]
    if message_page_return_code == 1:
        # Look up the album id on the picasaweb page
        album_archive_url = "https://get.google.com/albumarchive/pwa/%s/album/" % account_id
        return tool.find_sub_string(message_page, 'href="%s' % album_archive_url, '"')
    return None
Example #18
def get_one_page_blog(account_id, page_count):
    # http://blog.nogizaka46.com/asuka.saito
    blog_url = "http://blog.nogizaka46.com/%s/?p=%s" % (account_id, page_count)
    blog_return_code, blog_page = tool.http_request(blog_url)[:2]
    if blog_return_code == 1:
        return tool.find_sub_string(blog_page, '<div class="paginate">', '<div class="paginate">', 1)
    return None
Example #19
def get_article_url(preview_article_data):
    page_route = tool.find_sub_string(preview_article_data, '<a target=\\"_blank\\" href=\\"', '\\">')
    page_route = page_route.replace("\\/", "/").replace("&amp;", "&")
    if page_route:
        return "http://weibo.com" + page_route
    else:
        return None
Example #20
def get_max_page_count(coser_id, post_page):
    max_page_count = tool.find_sub_string(post_page, '<a href="/u/%s/post/cos?&p=' % coser_id, '">')
    if max_page_count:
        max_page_count = int(max_page_count)
    else:
        max_page_count = 1
    return max_page_count
Example #21
def ske(file_handle):
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII": ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei": ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->")
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        for team_name in split_list:
            team_page = tool.find_sub_string(index_response.data,
                                             split_list[team_name][0],
                                             split_list[team_name][1])
            member_list = re.findall("<dl>([\s|\S]*?)</dl>", team_page)
            for member in member_list:
                member = member.replace("<br />",
                                        "").replace("\n", "").replace(
                                            "\r", "").replace("\t", "")
                japanese_name_find = re.findall(
                    '<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
                english_name = tool.find_sub_string(member, '<h3 class="en">',
                                                    "</h3>")
                plus_text = tool.find_sub_string(member,
                                                 '<li class="textPlus">',
                                                 "</li>")
                if len(japanese_name_find) != 1:
                    output.print_msg("error japanese_name_find")
                    continue
                if not english_name:
                    output.print_msg("error english_name")
                    continue

                japanese_name = japanese_name_find[0].replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                if plus_text and plus_text.find("兼任") >= 0:
                    team = team_name + " / " + plus_text.split("/")[-1].strip().replace("チーム", " Team ").replace("兼任", "")
                else:
                    team = team_name

                file_handle.write(japanese_name + "\t" + last_name + " " +
                                  first_name + "\t" + team + "\n")
Example #22
def get_suid(account_id):
    index_page_url = "http://www.miaopai.com/u/paike_%s" % account_id
    index_page_return_code, index_page = tool.http_request(index_page_url)[:2]
    if index_page_return_code == 1:
        suid = tool.find_sub_string(index_page, '<button class="guanzhu gz" suid="', '" heade="1" token="">+关注</button>')
        if suid:
            return suid
    return None
Example #23
def get_user_id(account_id):
    index_url = "http://changba.com/u/%s" % account_id
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code == 1:
        user_id = tool.find_sub_string(index_page, "var userid = '", "'")
        if user_id:
            return user_id
    return None
Example #24
def get_account_id(account_name):
    account_index_url = "https://twitter.com/%s" % account_name
    account_index_return_code, account_index_page = tool.http_request(account_index_url)[:2]
    if account_index_return_code == 1:
        account_id = tool.find_sub_string(account_index_page, '<div class="ProfileNav" role="navigation" data-user-id="', '">')
        if account_id:
            return account_id
    return None
Example #25
def nmb(file_handle):
    team_list = {
        "teamn": "NMB48 Team N",
        "teamm": "NMB48 Team M",
        "teamb2": "NMB48 Team BII",
        "dkenkyusei": "NMB48 Team Kenkyusei",
        "kenkyusei": "NMB48 Team Kenkyusei",
    }
    index_url = "http://www.nmb48.com/member/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        team_page_list = re.findall(
            r"<!--▼チーム別領域ボックス▼-->([\s\S]*?)<!--▲チーム別領域ボックス▲--> ",
            index_response.data)
        for team_page in team_page_list:
            team_find = tool.find_sub_string(team_page, '<a name="', '"></a>')
            if team_find:
                if team_find not in team_list:
                    output.print_msg("not found %s in team_list" % team_find)
                    continue
                member_list = re.findall(
                    r'<li class="member-box[^"]*">([\s\S]*?)</li>', team_page)
                for member in member_list:
                    member = member.replace("<br />", "").replace(
                        "\n",
                        "").replace("\r",
                                    "").replace("\t",
                                                "").replace("&nbsp;", " ")
                    japanese_name_find = re.findall(
                        '<h4><a href="[^"]*">([^<]*)</a></h4>', member)
                    english_name_find = re.findall(
                        "<p[\s|\S]*?>([\s|\S]*?)</[p|a]>", member)
                    if len(japanese_name_find) != 1:
                        output.print_msg("error japanese_name_find")
                        continue
                    if len(english_name_find) != 1:
                        output.print_msg("error english_name_find")
                        continue

                    team = team_list[team_find]
                    if english_name_find[0].find("<span>") >= 0:
                        temp = english_name_find[0].split("<span>")
                        english_name_find[0] = temp[0]
                        temp[1] = temp[1].replace("</span>", "")
                        if temp[1].find("研究生") == -1:
                            team += " / " + temp[1].split("/")[-1].strip()
                    japanese_name = japanese_name_find[0].replace(" ",
                                                                  " ").replace(
                                                                      " ", "")
                    first_name, last_name = english_name_find[0].strip().title(
                    ).split(" ", 1)

                    file_handle.write(japanese_name + "\t" + last_name + " " +
                                      first_name + "\t" + team + "\n")
            else:
                output.print_msg("error team_find")
Example #26
def akb(file_handle):
    for team_id in [1, 2, 3, 4, 12]:
        member_index_url = "http://www.akb48.co.jp/about/members/"
        query_data = {"team_id": team_id}
        member_index_response = net.http_request(member_index_url,
                                                 method="GET",
                                                 fields=query_data)
        if member_index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
            member_list_page = tool.find_sub_string(
                member_index_response.data, '<ul class="memberListUl">',
                "</ul>")
            if member_list_page:
                member_list = re.findall("<li>([\s|\S]*?)</li>",
                                         member_list_page)
                for member in member_list:
                    member = member.replace("<br />",
                                            "").replace("\n", "").replace(
                                                "\r", "").replace("\t", "")
                    japanese_name = tool.find_sub_string(
                        member, '<h4 class="memberListNamej">', "</h4>")
                    english_name = tool.find_sub_string(
                        member, '<p class="memberListNamee">', "</p>")
                    team_find = re.findall(
                        '<h5 class="memberListTeam">([^<]*)</h5>', member)
                    if not japanese_name:
                        output.print_msg("error japanese_name")
                        continue
                    if not english_name:
                        output.print_msg("error english_name")
                        continue
                    if (team_id != 12 and len(team_find) != 1) or (
                            team_id == 12 and len(team_find) != 2):
                        output.print_msg("error team_find")
                        continue

                    japanese_name = japanese_name.replace(" ", "")
                    first_name, last_name = english_name.split(" ", 1)
                    team = team_find[0].strip().replace("  /", " / ")

                    file_handle.write(japanese_name + "\t" + last_name + " " +
                                      first_name + "\t" + team + "\n")
            else:
                output.print_msg("error member_list_page")
Example #27
def check_big_image(image_url, big_2_small_list):
    if image_url in big_2_small_list:
        big_image_display_page_return_code, big_image_display_page = tool.http_request(big_2_small_list[image_url])[:2]
        if big_image_display_page_return_code == 1:
            temp_image_url = tool.find_sub_string(big_image_display_page, '<img src="', '"')
            if temp_image_url != "/img/expired.gif":
                return temp_image_url, False
            else:
                return image_url, True  # once an expired image is found, all earlier images are expired too, so no further checks are needed
    return image_url, False
Example #28
def get_image_url_list(album_page):
    image_url_list_find = tool.find_sub_string(album_page, '<input type="hidden" id="imageList" value=', ' />')
    try:
        image_url_list_find = json.loads(image_url_list_find)
    except ValueError:
        return None
    image_url_list = []
    for temp_image_list in image_url_list_find:
        image_url_list += temp_image_list
    return image_url_list
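The flattening loop at the end can be seen with a small literal; assuming the hidden imageList input carries a JSON array of arrays, the inner lists are concatenated in order:

image_url_list_find = [["a.jpg", "b.jpg"], ["c.jpg"]]  # illustrative json.loads() result
image_url_list = []
for temp_image_list in image_url_list_find:
    image_url_list += temp_image_list
# image_url_list is now ["a.jpg", "b.jpg", "c.jpg"]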
Example #29
def get_one_page_diary_data(account_id, page_count):
    # http://www.keyakizaka46.com/mob/news/diarKiji.php?cd=member&ct=01&page=0&rw=20
    diary_page_url = "http://www.keyakizaka46.com/mob/news/diarKiji.php"
    diary_page_url += "?cd=member&ct=%02d&page=%s&rw=%s" % (int(account_id), page_count - 1, IMAGE_COUNT_PER_PAGE)
    diary_return_code, diary_page = tool.http_request(diary_page_url)[:2]
    if diary_return_code == 1:
        diary_page = tool.find_sub_string(diary_page, '<div class="box-main">', '<div class="box-sideMember">')
        if diary_page:
            return re.findall("<article>([\s|\S]*?)</article>", diary_page)
    return None
Example #30
def get_account_page_id(account_id):
    for i in range(0, 50):
        index_url = "http://weibo.com/u/%s?is_all=1" % account_id
        index_page = auto_redirect_visit(index_url)
        if index_page:
            account_page_id = tool.find_sub_string(index_page, "$CONFIG['page_id']='", "'")
            if account_page_id:
                return account_page_id
        time.sleep(5)
    return None
Example #31
def get_one_forum_page_thread_url_list(forum_url):
    forum_return_code, forum_page = tool.http_request(forum_url)[:2]
    if forum_return_code == 1:
        forum_page = tool.find_sub_string(forum_page, '<div id="threadlist"', '<div id="filter_special_menu"', 1)
        thread_find = re.findall('<a href="(thread-\d*-1-1.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>', forum_page)
        host = forum_url[0: forum_url.rfind("/") + 1]
        thread_url_list = {}
        for forum_path, forum_name in thread_find:
            thread_url_list[host + forum_path] = forum_name
        return thread_url_list
    return None
Example #32
def get_video_url_list(tweet_id):
    video_page_url = "https://twitter.com/i/videos/tweet/%s" % tweet_id
    video_page_return_code, video_page = tool.http_request(video_page_url)[:2]
    if video_page_return_code == 1:
        m3u8_file_url = tool.find_sub_string(video_page, "&quot;video_url&quot;:&quot;", "&quot;")
        if m3u8_file_url:
            m3u8_file_url = m3u8_file_url.replace("\\/", "/")
            ts_url_list = []
            get_ts_url_list(m3u8_file_url, ts_url_list)
            return "ts", ts_url_list
        vmap_file_url = tool.find_sub_string(video_page, "&quot;vmap_url&quot;:&quot;", "&quot;")
        if vmap_file_url:
            vmap_file_url = vmap_file_url.replace("\\/", "/")
            vmap_file_return_code, vmap_file = tool.http_request(vmap_file_url)[:2]
            if vmap_file_return_code:
                media_file_url = tool.find_sub_string(vmap_file, "<![CDATA[", "]]>")
                if media_file_url:
                    file_type = media_file_url.split(".")[-1].split("?")[0]
                    return file_type, media_file_url
    return "", []
Example #33
def get_one_forum_page_thread_url_list(forum_url):
    forum_response = net.http_request(forum_url, method="GET")
    if forum_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        forum_page = tool.find_sub_string(forum_response.data, '<div id="threadlist"', '<div id="filter_special_menu"', 1)
        thread_find = re.findall('<a href="(thread-\d*-1-1.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>', forum_page)
        host = forum_url[0: forum_url.rfind("/") + 1]
        thread_url_list = {}
        for forum_path, forum_name in thread_find:
            thread_url_list[host + forum_path] = forum_name
        return thread_url_list
    return None
Example #34
def set_csrf_token():
    global CSRF_TOKEN
    index_url = "https://www.instagram.com/instagram"
    index_page_response = tool.http_request(index_url)
    if index_page_response[0] == 1:
        set_cookie_info = tool.get_response_info(index_page_response[2].info(), "Set-Cookie")
        if set_cookie_info is not None:
            csrf_token = tool.find_sub_string(set_cookie_info, "csrftoken=", ";")
            if csrf_token:
                CSRF_TOKEN = csrf_token
                return True
    return False
Example #35
def get_follow_list(account_id):
    max_page_count = 1
    page_count = 1
    follow_list = {}
    while page_count <= max_page_count:
        follow_list_url = "http://www.meipai.com/user/%s/friends?p=%s" % (account_id, page_count)
        follow_list_page_return_code, follow_list_page = tool.http_request(follow_list_url)[:2]
        if follow_list_page_return_code == 1:
            follow_list_find = re.findall('<div class="ucard-info">([\s|\S]*?)</div>', follow_list_page)
            for follow_info in follow_list_find:
                follow_account_id = tool.find_sub_string(follow_info, '<a hidefocus href="/user/', '"').strip()
                follow_account_name = tool.find_sub_string(follow_info, 'title="', '"')
                follow_list[follow_account_id] = follow_account_name
            if max_page_count == 1:
                page_info = tool.find_sub_string(follow_list_page, '<div class="paging-wrap">', '</div>')
                if page_info:
                    page_find = re.findall("friends\?p=(\d*)", page_info)
                    page_find = [int(i) for i in page_find]
                    max_page_count = max(page_find)
            page_count += 1
        else:
            return None
    return follow_list
Example #36
def get_post_page_head(post_url, postfix_list):
    post_page_return_code, post_page_data = tool.http_request(post_url)[:2]
    # If the URL without a postfix is accessible, return that page directly;
    # otherwise try each postfix in turn
    if post_page_return_code != 1:
        for postfix in postfix_list:
            temp_post_url = post_url + "/" + urllib2.quote(postfix)
            post_page_return_code, post_page_data = tool.http_request(temp_post_url)[:2]
            if post_page_return_code == 1:
                break
    if post_page_data is not None:
        return tool.find_sub_string(post_page_data, "<head", "</head>", 3)
    else:
        return None
Example #37
def get_video_info_list(account_id):
    # http://www.nicovideo.jp/mylist/15614906#+page=1
    video_page_url = "http://www.nicovideo.jp/mylist/%s" % account_id
    video_page_return_code, video_page = tool.http_request(video_page_url)[:2]
    if video_page_return_code == 1:
        video_data = tool.find_sub_string(video_page, "Mylist.preload(%s," % account_id, ");").strip()
        try:
            video_data = json.loads(video_data)
        except ValueError:
            pass
        else:
            # reverse the order so the latest videos come first
            video_data.reverse()
            return video_data
    return None
Example #38
def get_discount_list():
    page_count = 1
    total_page_count = 99
    discount_list = []
    app_id_list = []
    while page_count <= total_page_count:
        index_url = "http://store.steampowered.com/search/results"
        index_url += "?sort_by=Price_ASC&category1=998&os=win&specials=1&page=%s" % page_count
        index_page_return_code, index_page = tool.http_request(index_url)[:2]
        if index_page_return_code != 1:
            break
        items_page = tool.find_sub_string(index_page, "<!-- List Items -->", "<!-- End List Items -->")
        items_page = tool.find_sub_string(items_page, "<a href=", None)
        items_page = items_page.replace("\n", "").replace("\r", "").replace("<a href=", "\n<a href=")
        items = items_page.split("\n")
        for item in items:
            app_id = tool.find_sub_string(item, 'data-ds-appid="', '"')
            discount_data = tool.find_sub_string(item, '<div class="col search_discount responsive_secondrow">', "</div>")
            discount = tool.find_sub_string(discount_data, "<span>", "</span>").replace("-", "").replace("%", "")
            if not discount:
                discount = 0
            price_data = tool.find_sub_string(item, '<div class="col search_price discounted responsive_secondrow">', "</div>", 2)
            old_price = tool.find_sub_string(price_data, '<strike>', '</strike>').replace("¥", "").strip()
            if not old_price:
                old_price = 0
            new_price = tool.find_sub_string(price_data, '<br>', '</div>').replace("¥", "").strip()
            if not new_price or not new_price.isdigit():
                new_price = 0
            if app_id not in app_id_list:
                discount_list.append("%s\t%s\t%s\t%s" % (app_id, discount, old_price, new_price))
                app_id_list.append(app_id)
        if total_page_count == 99:
            pagination_page = tool.find_sub_string(index_page, '<div class="search_pagination">', None)
            page_find = re.findall('return false;">([\d]*)</a>', pagination_page)
            if len(page_find) > 0:
                total_page_count = 0
                for page_id in page_find:
                    total_page_count = max(total_page_count, int(page_id))
        page_count += 1
    return discount_list
Example #39
def hkt(file_handle):
    index_url = "http://www.hkt48.jp/profile/"
    index_response = net.http_request(index_url, method="GET")
    if index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
        team_find = re.findall("(<h3>[\s|\S]*?)<!-- / .contsbox --></div>",
                               index_response.data)
        for team_page in team_find:
            team = tool.find_sub_string(team_page, "<h3>", "</h3>")
            if not team:
                output.print_msg("error team")
                continue
            team = team.strip()
            member_list = re.findall("<li>([\s|\S]*?)</li>", team_page)
            for member in member_list:
                member = member.replace("<br />",
                                        "").replace("\n", "").replace(
                                            "\r", "").replace("\t", "")
                name_find = re.findall(
                    """<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> """,
                    member)
                if len(name_find) != 1:
                    output.print_msg("error name_find")
                    continue
                japanese_name, english_name = name_find[0]
                team_plus_find = re.findall(
                    '<div class="team_j">([^<]*)</div>', member)
                team_name = team
                if len(team_plus_find) == 1:
                    if team_plus_find[0].find("兼任") >= 0:
                        team_name = team + " / " + team_plus_find[0].split(
                            "/")[-1].strip().replace("兼任", "")
                japanese_name = japanese_name.replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)

                file_handle.write(japanese_name + "\t" + last_name + " " +
                                  first_name + "\t" + team_name + "\n")
Example #40
def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, json_decode=False,
                 is_auto_proxy=True, is_auto_redirect=True, is_gzip=True, is_url_encode=True, is_auto_retry=True, is_random_ip=True,
                 connection_timeout=NET_CONFIG["HTTP_CONNECTION_TIMEOUT"], read_timeout=NET_CONFIG["HTTP_READ_TIMEOUT"]):
    """Http request via urllib3

    :param url:
        the url which you want visit, start with "http://" or "https://"

    :param method:
        request method, value in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]

    :param fields:
        dictionary type of request data, will urlencode() them to string. like post data, query string, etc
        not work with binary_data

    :param binary_data:
        binary type of request data, not work with post_data

    :param header_list:
        customize header dictionary

    :param cookies_list:
        customize cookies dictionary, will replaced header_list["Cookie"]

    :param encode_multipart:
        see "encode_multipart" in urllib3.request_encode_body

    :param is_auto_proxy:
        is auto use proxy when init PROXY_HTTP_CONNECTION_POOL

    :param is_auto_redirect:
        is auto redirect, when response.status in [301, 302, 303, 307, 308]

    :param is_auto_retry:
        is auto retry, when response.status in [500, 502, 503, 504]

    :param connection_timeout:
        customize connection timeout seconds

    :param read_timeout:
        customize read timeout seconds

    :param is_random_ip:
        is counterfeit a request header with random ip, will replaced header_list["X-Forwarded-For"] and header_list["X-Real-Ip"]

    :param json_decode:
        is return a decoded json data when response status = 200
        if decode failure will replace response status with HTTP_RETURN_CODE_JSON_DECODE_ERROR
    """
    url = str(url).strip()
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()
    connection_pool = HTTP_CONNECTION_POOL
    if PROXY_HTTP_CONNECTION_POOL is not None and is_auto_proxy:
        connection_pool = PROXY_HTTP_CONNECTION_POOL
    if is_url_encode:
        url = url_encode(url)

    if header_list is None:
        header_list = {}

    # Set the User-Agent
    if "User-Agent" not in header_list:
        header_list["User-Agent"] = _random_user_agent()

    # Set a random IP
    if is_random_ip:
        random_ip = _random_ip_address()
        header_list["X-Forwarded-For"] = random_ip
        header_list["X-Real-Ip"] = random_ip

    # Set cookies
    if cookies_list:
        header_list["Cookie"] = build_header_cookie_string(cookies_list)

    # Set the accepted compression format
    if is_gzip:
        header_list["Accept-Encoding"] = "gzip"

    # Timeout settings
    timeout = urllib3.Timeout(connect=float(connection_timeout) if connection_timeout > 0 else None, read=read_timeout if read_timeout > 0 else None)

    retry_count = 0
    while True:
        thread_event.wait()
        if EXIT_FLAG:
            tool.process_exit(0)

        try:
            if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']:
                response = connection_pool.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields)
            else:
                if binary_data is None:
                    response = connection_pool.request(method, url, fields=fields, encode_multipart=encode_multipart, headers=header_list,
                                                       redirect=is_auto_redirect, timeout=timeout)
                else:
                    response = connection_pool.request(method, url, body=binary_data, encode_multipart=encode_multipart, headers=header_list,
                                                       redirect=is_auto_redirect, timeout=timeout)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data.decode())
                except ValueError:
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except Exception:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            elif response.status == 429:  # Too Many Requests
                output.print_msg(url + " Too Many Requests, sleep")
                time.sleep(60)
                continue
            elif response.status in [500, 502, 503, 504] and is_auto_retry:  # transient server error, retry
                if retry_count < NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]:
                    retry_count += 1
                    time.sleep(30)
                    continue
                else:
                    return response
            return response
        except MemoryError:
            return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE)
        except Exception as e:
            message = str(e)
            if isinstance(e, urllib3.exceptions.ConnectTimeoutError):
                # domain name could not be resolved
                if message.find("[Errno 11004] getaddrinfo failed") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
                elif message.find("[Errno 11001] getaddrinfo failed") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            elif isinstance(e, urllib3.exceptions.MaxRetryError):
                if message.find("Caused by ResponseError('too many redirects'") >= 0:
                    return ErrorResponse(HTTP_RETURN_CODE_TOO_MANY_REDIRECTS)
            elif isinstance(e, urllib3.exceptions.DecodeError):
                if message.find("'Received response with content-encoding: gzip, but failed to decode it.'") >= 0:
                    return http_request(url, method=method, fields=fields, binary_data=binary_data, header_list=header_list, cookies_list=cookies_list,
                                        encode_multipart=encode_multipart, json_decode=json_decode, is_auto_proxy=is_auto_proxy, is_auto_redirect=is_auto_redirect,
                                        is_gzip=False, is_url_encode=False, is_auto_retry=is_auto_retry, is_random_ip=is_random_ip,
                                        connection_timeout=connection_timeout, read_timeout=read_timeout)
            # import traceback
            # output.print_msg(message)
            # output.print_msg(traceback.format_exc())
            output.print_msg(url + " 访问超时,重试中")
            time.sleep(5)

        retry_count += 1
        if retry_count >= NET_CONFIG["HTTP_REQUEST_RETRY_COUNT"]:
            output.print_msg("无法访问页面:" + url)
            return ErrorResponse(HTTP_RETURN_CODE_RETRY)
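Based on the docstring above, a typical JSON API call through this wrapper might look like the following; the URL and field names are placeholders, and HTTP_RETURN_CODE_SUCCEED is the module's success constant used throughout these examples.

response = http_request("http://example.com/api/list",  # placeholder URL
                        method="GET",
                        fields={"page": 1},
                        json_decode=True)
if response.status == HTTP_RETURN_CODE_SUCCEED:
    data = response.json_data  # decoded by the wrapper when json_decode=True
else:
    output.print_msg("request failed with status %s" % response.status)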
Example #41
item_attribute_list = {}
base_host = "http://db.d.163.com"
for item_path, item_position in item_list.items():
    page_count = 1
    item_attribute_list[item_path] = []
    while True:
        if item_position == "傳奇宝石":
            item_index_url = base_host + "/tw/base/legendarygem/"
        else:
            item_index_url = base_host + "/tw/item/%s/legendary.html#page=%s" % (
                item_path, page_count)
        item_index_response = net.http_request(item_index_url, method="GET")
        if item_index_response.status == net.HTTP_RETURN_CODE_SUCCEED:
            # item_index = item_index.decode("UTF-8")
            item_index_page = tool.find_sub_string(item_index_response.data,
                                                   '<div class="cizhui-c-m',
                                                   '<div class="data-options',
                                                   1)
            item_index_page = item_index_page.decode("GBK").encode("UTF-8")
            item_info_list = re.findall(r'<tr class="[\s\S]*?</tr>',
                                        item_index_page)
            if len(item_info_list) == 0:
                continue
            for item_info in item_info_list:
                if item_info.find('<em class="transmog-s"></em>') >= 0:
                    continue
                item_url = tool.find_sub_string(item_info, '<a href="', '"')
                item_name = tool.find_sub_string(item_info,
                                                 'class="diablo3tip">', "</a>")
                item_name = item_name.replace("'", "’")
                item_url = base_host + item_url
                item_response = net.http_request(item_url, method="GET")
Example #42
def http_request(url, method="GET", post_data=None, binary_data=None, header_list=None, cookies_list=None, connection_timeout=HTTP_CONNECTION_TIMEOUT,
                 read_timeout=HTTP_CONNECTION_TIMEOUT, is_random_ip=True, json_decode=False, encode_multipart=False, redirect=True, exception_return=""):
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()

    retry_count = 0
    while True:
        while process.PROCESS_STATUS == process.PROCESS_STATUS_PAUSE:
            time.sleep(10)
        if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP:
            tool.process_exit(0)

        if header_list is None:
            header_list = {}

        # Set the User-Agent
        if "User-Agent" not in header_list:
            header_list["User-Agent"] = _random_user_agent()

        # Set a random IP
        if is_random_ip:
            random_ip = _random_ip_address()
            header_list["X-Forwarded-For"] = random_ip
            header_list["X-Real-Ip"] = random_ip

        # Set cookies
        if cookies_list:
            header_list["Cookie"] = build_header_cookie_string(cookies_list)

        try:
            if connection_timeout == 0 and read_timeout == 0:
                timeout = None
            elif connection_timeout == 0:
                timeout = urllib3.Timeout(read=read_timeout)
            elif read_timeout == 0:
                timeout = urllib3.Timeout(connect=connection_timeout)
            else:
                timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout)
            if method == "POST":
                if binary_data is None:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, fields=post_data, encode_multipart=encode_multipart)
                else:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart)
            else:
                response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=redirect, timeout=timeout)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data)
                except ValueError:
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except Exception:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            return response
        except urllib3.exceptions.ProxyError:
            notice = "无法访问代理服务器,请检查代理设置。检查完成后输入(C)ontinue继续程序或者(S)top退出程序:"
            input_str = tool.console_input(notice).lower()
            if input_str in ["c", "continue"]:
                pass
            elif input_str in ["s", "stop"]:
                tool.process_exit(0)
        except urllib3.exceptions.ReadTimeoutError:
            pass
        except urllib3.exceptions.ConnectTimeoutError, e:
            # domain name could not be resolved
            if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            pass
        # except urllib3.exceptions.MaxRetryError, e:
        #     print_msg(url)
        #     print_msg(str(e))
        #     # infinite redirect loop
        #     # if str(e).find("Caused by ResponseError('too many redirects',)") >= 0:
        #     #     return ErrorResponse(-1)
        # except urllib3.exceptions.ConnectTimeoutError, e:
        #     print_msg(str(e))
        #     print_msg(url + " request timed out, retrying later")
        #     # domain name could not be resolved
        #     # if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
        #     #     return ErrorResponse(-2)
        # except urllib3.exceptions.ProtocolError, e:
        #     print_msg(str(e))
        #     print_msg(url + " request timed out, retrying later")
        #     # connection aborted
        #     # if str(e).find("'Connection aborted.', error(10054,") >= 0:
        #     #     return ErrorResponse(-3)
        except Exception, e:
            if exception_return and str(e).find(exception_return) >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_EXCEPTION_CATCH)
            elif str(e).find("EOF occurred in violation of protocol") >= 0:
                time.sleep(30)
            tool.print_msg(str(e))
            tool.print_msg(url + " request timed out, retrying later")
            traceback.print_exc()
Example #43
def http_request(url, method="GET", fields=None, binary_data=None, header_list=None, cookies_list=None, encode_multipart=False, is_auto_redirect=True,
                 is_auto_retry=True, connection_timeout=HTTP_CONNECTION_TIMEOUT, read_timeout=HTTP_READ_TIMEOUT, is_random_ip=True, json_decode=False):
    """Http request via urllib3

    :param url:
        the url which you want visit, start with "http://" or "https://"

    :param method:
        request method, value in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]

    :param fields:
        dictionary type of request data, will urlencode() them to string. like post data, query string, etc
        not work with binary_data

    :param binary_data:
        binary type of request data, not work with post_data

    :param header_list:
        customize header dictionary

    :param cookies_list:
        customize cookies dictionary, will replaced header_list["Cookie"]

    :param encode_multipart:
        see "encode_multipart" in urllib3.request_encode_body

    :param is_auto_redirect:
        is auto redirect, when response.status in [301, 302, 303, 307, 308]

    :param is_auto_retry:
        is auto retry, when response.status in [500, 502, 503, 504]

    :param connection_timeout:
        customize connection timeout seconds

    :param read_timeout:
        customize read timeout seconds

    :param is_random_ip:
        is counterfeit a request header with random ip, will replaced header_list["X-Forwarded-For"] and header_list["X-Real-Ip"]

    :param json_decode:
        is return a decoded json data when response status = 200
        if decode failure will replace response status with HTTP_RETURN_CODE_JSON_DECODE_ERROR
    """
    if not (url.find("http://") == 0 or url.find("https://") == 0):
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    method = method.upper()
    if method not in ["GET", "POST", "HEAD", "PUT", "DELETE", "OPTIONS", "TRACE"]:
        return ErrorResponse(HTTP_RETURN_CODE_URL_INVALID)
    if HTTP_CONNECTION_POOL is None:
        init_http_connection_pool()

    if header_list is None:
        header_list = {}

    # Set the User-Agent
    if "User-Agent" not in header_list:
        header_list["User-Agent"] = _random_user_agent()

    # Set a random IP
    if is_random_ip:
        random_ip = _random_ip_address()
        header_list["X-Forwarded-For"] = random_ip
        header_list["X-Real-Ip"] = random_ip

    # Set cookies
    if cookies_list:
        header_list["Cookie"] = build_header_cookie_string(cookies_list)

    # Timeout settings
    if connection_timeout == 0 and read_timeout == 0:
        timeout = None
    elif connection_timeout == 0:
        timeout = urllib3.Timeout(read=read_timeout)
    elif read_timeout == 0:
        timeout = urllib3.Timeout(connect=connection_timeout)
    else:
        timeout = urllib3.Timeout(connect=connection_timeout, read=read_timeout)

    retry_count = 0
    while True:
        if process.PROCESS_STATUS == process.PROCESS_STATUS_STOP:
            tool.process_exit(0)
        thread_event.wait()

        try:
            if method in ['DELETE', 'GET', 'HEAD', 'OPTIONS']:
                response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields)
            else:
                if binary_data is None:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, fields=fields, encode_multipart=encode_multipart)
                else:
                    response = HTTP_CONNECTION_POOL.request(method, url, headers=header_list, redirect=is_auto_redirect, timeout=timeout, body=binary_data, encode_multipart=encode_multipart)
            if response.status == HTTP_RETURN_CODE_SUCCEED and json_decode:
                try:
                    response.json_data = json.loads(response.data)
                except ValueError:
                    is_error = True
                    content_type = response.getheader("Content-Type")
                    if content_type is not None:
                        charset = tool.find_sub_string(content_type, "charset=", None)
                        if charset:
                            if charset == "gb2312":
                                charset = "GBK"
                            try:
                                response.json_data = json.loads(response.data.decode(charset))
                            except Exception:
                                pass
                            else:
                                is_error = False
                    if is_error:
                        response.status = HTTP_RETURN_CODE_JSON_DECODE_ERROR
            elif response.status in [500, 502, 503, 504] and is_auto_retry:  # transient server error, retry
                if retry_count < HTTP_REQUEST_RETRY_COUNT:
                    retry_count += 1
                    time.sleep(30)
                    continue
                else:
                    return response
            return response
        except urllib3.exceptions.ConnectTimeoutError, e:
            # domain name could not be resolved
            if str(e).find("[Errno 11004] getaddrinfo failed") >= 0:
                return ErrorResponse(HTTP_RETURN_CODE_DOMAIN_NOT_RESOLVED)
            pass
        except MemoryError:
            return ErrorResponse(HTTP_RETURN_CODE_RESPONSE_TO_LARGE)