Пример #1
0
def get_data(title, scope, page_size=20, page_num=0):
    """Fetch place data from the Baidu Map place-search API.

    Results are appended to the module-level ``_total_data`` list; the
    function recurses until every page of results has been collected.

    :param title: search keyword (query)
    :param scope: region to search within
    :param page_size: records per page, default 20, maximum 20
    :param page_num: zero-based page index to fetch
    :return: None — records accumulate in ``_total_data``
    :raises BaiDuMapError: when the API reports a 3xx status (deprecated)
    """
    global _total_data
    ak = choice(AK)  # pick a random AK; keys must be obtained from Baidu
    address = f'http://api.map.baidu.com/place/v2/search?query={title}&region={scope}&output=json&ak={ak}&page_size=' \
        f'{page_size}&page_num={page_num}'
    r = requests.get(address, headers={'User-Agent': random()})
    payload = r.json()  # renamed from ``json`` to avoid shadowing the stdlib module
    status = payload.get('status')
    if status == 401:
        # Key rejected/quota exhausted: retry the same page with another key.
        # Fix: propagate page_size so a caller-supplied size survives the retry.
        get_data(title, scope, page_size=page_size, page_num=page_num)
        return None
    elif str(status).startswith('3'):
        raise BaiDuMapError('该模块已经废弃不可用')
    results = payload['results']  # the main records
    total = payload['total']  # total record count
    page = ceil(total / page_size)  # round up to get the page count
    current_page = page_num + 1  # the page just fetched, 1-based
    for result in results:
        name = result['name']
        locations = result['location']
        location = str(locations['lng']) + '|' + str(locations['lat'])
        address = result.get('province') + result.get('city') + result.get('area') + result.get('address')
        _total_data.append(Data(name, location, address))
    if current_page < page:
        # More pages remain — fetch the next one (fix: keep page_size).
        get_data(title, scope, page_size=page_size, page_num=current_page)
Пример #2
0
def movie(name):
    """Search the movie site for *name*.

    :param name: movie title to search for (sent GB2312-encoded)
    :return: list of (link, title) tuples scraped from the result list
    """
    payload = dict(searchword=name.encode('gb2312'))
    resp = requests.post(url + 'search.asp', data=payload, headers={'User-Agent': random()})
    resp.encoding = 'GBK'
    page = resp.text
    # Exactly one result-list <div> is expected; unpack enforces that.
    listing, = re.findall(pattern=r'<div class="list mb">(.+)</div>', string=page, flags=re.S)
    return re.findall(pattern='a href="(.+)" title="(.+)" class', string=listing)
Пример #3
0
def Load_BaiDuBaiKe(name):
    """Download the content of a Baidu Baike (encyclopedia) entry.

    :param name: the Baike entry name
    :return: the entry page's text, decoded as UTF-8
    """
    page_url = F'https://baike.baidu.com/item/{quote(name)}'
    resp = requests.get(page_url, headers={'User-Agent': random()})
    return resp.content.decode('utf-8')
Пример #4
0
 def __init__(self, url, my_app_id, my_app_secret):
     """Remember the endpoint and register the API credentials.

     :param url: service endpoint base URL
     :param my_app_id: showapi application id
     :param my_app_secret: showapi application secret
     """
     self.url = url
     self.my_appId = my_app_id
     self.my_appSecret = my_app_secret
     # Create the temp directory for recordings on first use.
     if not os.path.exists(rec_tmp_dir):
         os.mkdir(rec_tmp_dir)
     # The module-level body/headers dicts are shared by every request.
     body["showapi_appid"] = my_app_id
     body["showapi_sign"] = my_app_secret
     headers["User-Agent"] = random()
Пример #5
0
 def get_7day_weather(self):
     """Fetch the 7-day forecast for every configured city code.

     :return: list of {city_name: forecast_rows} dicts, one per code
     """
     forecasts = []
     for city_code in self.codes:
         page = requests.get(
             F'http://www.weather.com.cn/weather/{city_code}.shtml',
             headers={'User-Agent': random()})
         page.encoding = 'utf-8'
         parser = _GetData()
         parser.feed(page.text)
         forecasts.append({parser.addr: parser.data_7d})
     return forecasts
Пример #6
0
def fetch_gitee(package, name, project='logo'):
    """Download raw (unstructured) bytes for a Gitee-hosted file.

    :param package: package name
    :param name: file name
    :param project: project name, defaults to 'logo'
    :return: the response body as bytes
    """
    target = GITEE.format(project=project, package=package, name=name)
    resp = requests.get(url=target, headers={'User-Agent': random()})  # rotate UA per request
    return resp.content
Пример #7
0
 def get_15day_weather(self):
     """Fetch the 15-day forecast (7-day data extended with days 8-15).

     :return: the 7-day result list with each city's rows extended in place
     """
     combined = self.get_7day_weather()
     for city_code in self.codes:
         page = requests.get(
             F'http://www.weather.com.cn/weather15d/{city_code}.shtml',
             headers={'User-Agent': random()})
         page.encoding = 'utf-8'
         parser = _GetData()
         parser.feed(page.text)
         # Append days 8-15 to the matching city's 7-day rows.
         for entry in combined:
             if parser.addr in entry:
                 entry[parser.addr].extend(parser.data_7d)
     return combined
Пример #8
0
 def get_today_weather(self):
     """Fetch today's forecast headline for every configured city code.

     :return: list of {city_name: matched_headlines} dicts, one per code
     """
     todays = []
     headline_re = r'<input type="hidden" id="hidden_title" value="(.+)" />'
     for city_code in self.codes:
         page = requests.get(
             F'http://www.weather.com.cn/weather1d/{city_code}.shtml',
             headers={'User-Agent': random()})
         page.encoding = 'utf-8'
         html = page.text
         headline = re.findall(headline_re, html)
         parser = _GetData()
         parser.feed(html)  # parsed only to recover the city name (addr)
         todays.append({parser.addr: headline})
     return todays
Пример #9
0
def fetch_gitee(package, name, project='logo'):
    """Download raw (unstructured) bytes for a Gitee-hosted file.

    :param package: package name
    :param name: file name
    :param project: project name, defaults to 'logo'
    :return: the response body as bytes
    """
    url = GITEE.format(project=project, package=package, name=name)
    # Rotate the User-Agent on every request.
    response = requests.get(url=url, headers={'User-Agent': random()})
    return response.content
Пример #10
0
    def __init__(self, search):
        """Run the fuzzy search (step one) and collect the download links.

        :param search: keyword to search for
        """
        self.header = {'User-Agent': random()}  # random UA
        # Sogou expects the keyword GBK-encoded (not UTF-8) in the URL.
        url_word = quote(search.encode('GBK'))
        url = 'https://pinyin.sogou.com/dict/search/search_list/%s/normal/' % url_word  # search URL
        response = requests.get(url=url, headers=self.header)
        match = re.findall(url[24:] + '(.{1,3})">', response.text)  # page numbers on the result page
        max_page = max(map(int, match)) if match else 1  # largest page, 1 when none found
        found = []  # (id, name) pairs for every download link
        for page in range(1, max_page + 1):
            response = requests.get(url=url + str(page), headers=self.header)
            found.extend(re.findall(r'id=(.+)&name=(.+)"', response.text))
        load_url = 'https://pinyin.sogou.com/d/dict/download_cell.php?id={0}&name={1}'  # download URL template
        # Map each (id, name) pair into a concrete download URL (lazy map object).
        self.load_url = map(lambda pair: load_url.format(pair[0], pair[1]), found)