Example #1
def try_craw_info(fund_code, try_cnt):
    """
    Retry crawling a single fund's detail and position data.
    @param fund_code: fund code; the leading character is stripped before use
    @param try_cnt: current attempt number, gives up after 5 attempts
    @return: (rank_detail_info, fund_positions_data), or (None, None) if all attempts fail
    """
    if try_cnt > 5:
        return None, None
    try:
        '''Crawl the page containing the fund's detail data'''
        position_title_url = "http://fundf10.eastmoney.com/ccmx_" + str(fund_code[1:]) + ".html"
        print('Attempt {0}: crawling detail data for fund {1}...'.format(try_cnt, fund_code[1:]))
        response_title = requests.get(url=position_title_url, headers={'User-Agent': get_ua()}, timeout=10)

        """爬取页面,获取该基金的持仓数据"""
        position_data_url = "http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type=jjcc&code=" + \
                            str(fund_code[1:]) + "&topline=10&year=&month=&rt=" + str(random.uniform(0, 1))
        print('第 {0} 次尝试,正在爬取基金 {1} 的持仓情况中...'.format(try_cnt, fund_code[1:]))
        # 解析基金的持仓情况
        response_data = requests.get(url=position_data_url, headers={'User-Agent': get_ua()}, timeout=10)

        # Parse the fund's detail data and position data
        rank_detail_info = resolve_rank_detail_info(fund_code[1:], response_title)
        fund_positions_data = resolve_position_info(fund_code[1:], response_data.text)
        time.sleep(random.randint(2, 4))
    except Exception:
        # Back off longer with each failed attempt, then retry recursively
        time.sleep(random.randint(2*try_cnt, 4*try_cnt))
        print("Failed to crawl data for fund {0}, please note!".format(str(fund_code[1:])))
        rank_detail_info, fund_positions_data = try_craw_info(fund_code, try_cnt+1)

    return rank_detail_info, fund_positions_data
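A minimal call sketch for the retry helper above, assuming the project's get_ua, resolve_rank_detail_info and resolve_position_info helpers are in scope; the fund code is a made-up placeholder whose leading character is the prefix the function strips off.

# Hypothetical fund code; attempt counting starts at 1
detail_info, positions = try_craw_info('f005827', 1)
if detail_info is None:
    print('All retries failed for this fund')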
Example #2
def get_rank_data(url, page_index, max_page, fund_type):
    """
    Crawl the fund ranking data page by page, starting from page_index
    :return: DataFrame with one row per fund
    """
    try_cnt = 0
    rank_data = []
    # Loop while the page index is below the total page count; give up after 3 empty responses
    while page_index < max_page and try_cnt < 3:
        # Build the request URL for this page (pn=100 records per page)
        new_url = url + '?ft=' + fund_type + '&sc=1n&st=desc&pi=' + str(page_index) + '&pn=100&fl=0&isab=1'
        print('Crawling page {0}: {1}'.format(page_index, new_url))
        # Fetch the current page
        response = requests.get(url=new_url, headers={'User-Agent': get_ua()}, timeout=10)
        if len(response.text) > 100:
            # Extract the bracketed data block from the response
            res_data = re.findall(r"\[\S+\]", response.text)[0]
            # Parse the single page of ranking data
            rank_pages_data = resolve_rank_info(res_data)
            rank_data.extend(rank_pages_data)
        else:
            try_cnt += 1
        page_index += 1

        # Sleep for a random 3-5 seconds
        time.sleep(random.randint(3, 5))

    df_rank_data = pd.DataFrame(rank_data)
    return df_rank_data
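A usage sketch for get_rank_data; the base ranking URL and the fund-type code below are assumptions and should be replaced with whatever endpoint the rest of the project uses.

RANK_URL = 'http://fund.eastmoney.com/data/rankhandler.aspx'  # assumed endpoint, verify against the project
df_rank = get_rank_data(RANK_URL, page_index=1, max_page=3, fund_type='gp')  # 'gp' assumed to mean equity funds
print(df_rank.shape)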
Example #3
def get_metro_info(id, cityname, name):
    """
    Fetch the metro line and station information for one city from amap
    """
    url = "http://map.amap.com/service/subway?_1618387860087&srhdata=" + id + '_drw_' + cityname + '.json'
    res = requests.get(url, headers={'User-Agent': get_ua()})
    data = json.loads(res.text)

    df_data_city = pd.DataFrame()
    if data['l']:
        # Iterate over every metro line of the city
        for data_line in data['l']:
            df_per_zd = pd.DataFrame(data_line['st'])
            df_per_zd = df_per_zd[['n', 'sl', 'poiid', 'sp']]
            # 'sl' holds "longitude,latitude"; split it into two columns
            df_per_zd['gd经度'] = df_per_zd['sl'].apply(
                lambda x: x.split(',')[0])
            df_per_zd['gd纬度'] = df_per_zd['sl'].apply(
                lambda x: x.split(',')[1])
            df_per_zd.drop('sl', axis=1, inplace=True)
            df_per_zd['路线名称'] = data_line['ln']
            df_per_zd['城市名称'] = name

            df_per_zd.rename(columns={
                'n': '站点名称',
                'sp': '拼音名称',
                'poiid': 'POI编号'
            }, inplace=True)
            # DataFrame.append was removed in pandas 2.x; use pd.concat instead
            df_data_city = pd.concat([df_data_city, df_per_zd], ignore_index=True)

    return df_data_city
Example #4
def get_city_list():
    """
    Get the list of all cities that have a metro system
    @return: DataFrame with the pinyin name, id and Chinese name of each city
    """
    url = 'http://map.amap.com/subway/index.html'
    res = requests.get(url, headers={'User-Agent': get_ua()})
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'html.parser')

    name_dict = []
    # Cities shown in the visible city list
    for soup_a in soup.find('div', class_='city-list fl').find_all('a'):
        city_name_py = soup_a['cityname']
        city_id = soup_a['id']
        city_name_ch = soup_a.get_text()
        name_dict.append({
            'name_py': city_name_py,
            'id': city_id,
            'name_ch': city_name_ch
        })
    # Cities in the collapsed "more cities" list
    for soup_a in soup.find('div', class_='more-city-list').find_all('a'):
        city_name_py = soup_a['cityname']
        city_id = soup_a['id']
        city_name_ch = soup_a.get_text()
        name_dict.append({
            'name_py': city_name_py,
            'id': city_id,
            'name_ch': city_name_ch
        })

    df_name = pd.DataFrame(name_dict)

    return df_name
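Chaining the two helpers above, a sketch that collects every city's subway stations into one DataFrame; it assumes the same module-level imports (pandas as pd, time) as the functions above, and the output path is a placeholder.

df_cities = get_city_list()
df_all = pd.DataFrame()
for _, row in df_cities.iterrows():
    # Column names come from get_city_list: 'id', 'name_py', 'name_ch'
    df_city = get_metro_info(row['id'], row['name_py'], row['name_ch'])
    df_all = pd.concat([df_all, df_city], ignore_index=True)
    time.sleep(1)  # be polite to the amap endpoint
df_all.to_csv('metro_stations.csv', index=False, encoding='utf-8-sig')  # placeholder path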
def get_question_base_info(url):
    """
    Get the basic information of a Zhihu question
    @param url: URL of the question page
    @return: title, description, follower count, view count, answer count, topic tags
    """
    response = requests.get(url=url,
                            headers={'User-Agent': get_ua()},
                            timeout=10)
    # print(response.text.replace('\u200b', '').replace('\u2022', ''))
    """获取数据并解析"""
    soup = BeautifulSoup(response.text, 'lxml')
    # 问题标题
    title = soup.find("h1", {"class": "QuestionHeader-title"}).text
    # 具体问题
    question = ''
    try:
        question = soup.find("div", {
            "class": "QuestionRichText--collapsed"
        }).text.replace('\u200b', '')
    except Exception as e:
        print(e)
    # Number of followers
    follower = int(
        soup.find_all("strong", {"class": "NumberBoard-itemValue"
                                 })[0].text.strip().replace(",", ""))
    # Number of views
    watched = int(
        soup.find_all("strong", {"class": "NumberBoard-itemValue"
                                 })[1].text.strip().replace(",", ""))
    # Number of answers to the question
    answer_str = soup.find_all(
        "h4", {"class": "List-headerText"})[0].span.text.strip()
    # Extract the leading number from "xxx 个回答"; strip thousands separators first
    answer_count = int(re.findall(r'\d*', answer_str.replace(',', ''))[0])

    # Question topic tags
    tag_list = []
    tags = soup.find_all("div", {"class": "QuestionTopic"})
    for tag in tags:
        tag_list.append(tag.text)

    return title, question, follower, watched, answer_count, tag_list
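A call sketch for get_question_base_info; the question id in the URL is a made-up placeholder, and Zhihu may additionally require cookies that this code does not send.

question_url = 'https://www.zhihu.com/question/123456789'  # placeholder question id
title, question, follower, watched, answer_count, tag_list = get_question_base_info(question_url)
print(title, follower, watched, answer_count, tag_list)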
Example #6
def get_position_data(data, rank):
    """
    Crawl the detail and position data of the Top <rank> funds in the ranking DataFrame
    @param data: ranking DataFrame; must contain the '近1年' and '基金代码' columns
    @param rank: number of top-ranked funds to crawl
    @return: (detail DataFrame, position DataFrame)
    """
    """Select the Top funds by 1-year return"""
    data = data.replace('', np.nan, regex=True)
    data_notna = data.dropna(subset=['近1年']).copy()
    data_notna['近1年'] = data_notna['近1年'].astype(float)
    data_sort = data_notna.sort_values(by='近1年', ascending=False)
    # Keep only the Top <rank> rows; reset the index so iterrows yields 0..rank-1
    data_rank = data_sort.reset_index(drop=True).head(rank)

    # Crawl data for each selected fund
    rank_detail_data = []
    position_data = []
    error_funds_list = []
    for row_index, data_row in data_rank.iterrows():
        fund_code = str(data_row['基金代码'])
        try:
            '''Crawl the page containing the fund's detail data'''
            position_title_url = "http://fundf10.eastmoney.com/ccmx_" + str(fund_code[1:]) + ".html"
            print('Crawling detail data for fund {2} ({0}/{1})...'.format(row_index+1, len(data_rank), fund_code[1:]))
            response_title = requests.get(url=position_title_url, headers={'User-Agent': get_ua()}, timeout=10)
            # Parse the fund's detail data
            rank_detail_info = resolve_rank_detail_info(fund_code[1:], response_title)

            """爬取页面,获取该基金的持仓数据"""
            position_data_url = "http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type=jjcc&code=" + \
                                str(fund_code[1:]) + "&topline=10&year=&month=&rt=" + str(random.uniform(0, 1))
            print('正在爬取第 {0}/{1} 个基金 {2} 的持仓情况中...'.format(row_index + 1, len(data_rank), fund_code[1:]))
            # 解析基金的持仓情况
            response_data = requests.get(url=position_data_url, headers={'User-Agent': get_ua()}, timeout=10)
            fund_positions_data = resolve_position_info(fund_code[1:], response_data.text)

            # Collect the results
            rank_detail_data.append(rank_detail_info)
            position_data.extend(fund_positions_data)
        except Exception:
            error_funds_list.append(fund_code)
            print("Failed to crawl data for fund {0}, it will be retried later!".format(str(fund_code[1:])))
        # Sleep for a random 2-4 seconds
        time.sleep(random.randint(2, 4))

    """爬取失败的进行重试"""
    for fund_info in error_funds_list:
        rank_detail_data_try, position_data_try = try_craw_info(fund_info, 1)
        if rank_detail_data_try == '':
            # 保存数据
            rank_detail_data.append(rank_detail_data_try)
            position_data.extend(fund_position_data for fund_position_data in position_data_try)

    df_rank_detail_data = pd.DataFrame(rank_detail_data)
    df_position_data = pd.DataFrame(position_data)

    return df_rank_detail_data, df_position_data
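A sketch tying Example #2 and Example #6 together: the ranking DataFrame (assumed to carry the '近1年' and '基金代码' columns produced by resolve_rank_info) is passed in to crawl the Top 10 funds; RANK_URL is the same assumed endpoint as above and the output paths are placeholders.

df_rank = get_rank_data(RANK_URL, page_index=1, max_page=3, fund_type='gp')
df_detail, df_position = get_position_data(df_rank, rank=10)
df_detail.to_csv('fund_detail_top10.csv', index=False, encoding='utf-8-sig')      # placeholder path
df_position.to_csv('fund_position_top10.csv', index=False, encoding='utf-8-sig')  # placeholder path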
def get_answer_info(url, index):
    """
    Parse one page of answers to a question
    @param url: URL of the answer-list API for the current page
    @param index: current page index, used only for progress output
    @return: DataFrame with one row per answer
    """
    response = requests.get(url=url,
                            headers={'User-Agent': get_ua()},
                            timeout=10)
    text = response.text.replace('\u200b', '')

    per_answer_list = []
    try:
        question_json = json.loads(text)
        """获取当前页的回答数据"""
        print("爬取第{0}页回答列表,当前页获取到{1}个回答".format(index + 1,
                                                len(question_json["data"])))
        for data in question_json["data"]:
            """问题的相关信息"""
            # 问题的问题类型、id、提问类型、创建时间、修改时间
            question_type = data["question"]['type']
            question_id = data["question"]['id']
            question_question_type = data["question"]['question_type']
            question_created = get_time_str(data["question"]['created'])
            question_updated_time = get_time_str(
                data["question"]['updated_time'])
            """答主的相关信息"""
            # 答主的用户名、签名、性别、粉丝数
            author_name = data["author"]['name']
            author_headline = data["author"]['headline']
            author_gender = data["author"]['gender']
            author_follower_count = data["author"]['follower_count']
            """回答的相关信息"""
            # 问题回答id、创建时间、更新时间、赞同数、评论数、具体内容
            id = data['id']
            created_time = get_time_str(data["created_time"])
            updated_time = get_time_str(data["updated_time"])
            voteup_count = data["voteup_count"]
            comment_count = data["comment_count"]
            content = data["content"]

            per_answer_list.append([
                question_type, question_id, question_question_type,
                question_created, question_updated_time, author_name,
                author_headline, author_gender, author_follower_count,
                answer_id, created_time, updated_time, voteup_count,
                comment_count, content
            ])

    except Exception:
        print("Failed to parse the answer JSON")
    finally:
        answer_column = [
            '问题类型', '问题id', '问题提问类型', '问题创建时间', '问题更新时间', '答主用户名', '答主签名',
            '答主性别', '答主粉丝数', '答案id', '答案创建时间', '答案更新时间', '答案赞同数', '答案评论数',
            '答案具体内容'
        ]
        per_answer_data = pd.DataFrame(per_answer_list, columns=answer_column)

    return per_answer_data
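A paging sketch for get_answer_info; the answer-list URL follows Zhihu's v4 question-answers API pattern, but the exact query string is an assumption and must match whatever JSON fields the function above expects.

question_id = '123456789'  # placeholder question id
all_answers = pd.DataFrame()
for index in range(3):  # first 3 pages, 20 answers per page (assumed page size)
    url = ('https://www.zhihu.com/api/v4/questions/' + question_id +
           '/answers?limit=20&offset=' + str(index * 20))  # assumed query string
    per_answer_data = get_answer_info(url, index)
    all_answers = pd.concat([all_answers, per_answer_data], ignore_index=True)
    time.sleep(random.randint(2, 4))
print(all_answers.shape)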