Example #1
def _get_comment_svg(url, svg=None):
    # Fetch the SVG over HTTP unless a cached copy was passed in.
    if svg is None:
        resp = send_http(requests.Session(),
                         'get',
                         url,
                         retries=-1,
                         headers=CSS_HEADERS)
        svg = resp[0].text
    if svg:
        res = {}
        text = bs(svg, 'lxml')
        text_path = text('textpath')
        if not text_path:
            # No <textPath> nodes: map each <text> node's y offset to its string.
            texts = text('text')
            ys = {i['y']: i.text for i in texts if i}
            return ys, svg
        else:
            # <textPath> nodes: the second token of each <path> "d" attribute
            # keys the corresponding <textPath> string.
            path = text('path')
            for idx, i in enumerate(path):
                d = i['d']
                num = int(d.split(' ')[1].strip())
                string = text_path[idx].text.strip()
                res[num] = string
            return res, svg
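The optional svg argument lets a caller reuse an already downloaded SVG instead of requesting it again. A minimal usage sketch; the URL below is a placeholder, not one taken from the project:

# Hypothetical usage of _get_comment_svg; the URL is a placeholder.
svg_url = 'https://example.com/comment-sprite.svg'
mapping, raw_svg = _get_comment_svg(svg_url)               # downloads and decodes the SVG
mapping, raw_svg = _get_comment_svg(svg_url, svg=raw_svg)  # decodes the cached copy, no new request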
Example #2
File: city.py Project: LXF-DX3906/DPspider
def fetch_map_page(self, data):
    result = send_http(self.session,
                       'post',
                       API_MAP_SEARCH,
                       retries=MAX_RETRY,
                       proxy=self.proxy,
                       headers=self.map_headers,
                       timeout=TIMEOUT,
                       data=data,
                       kind='JSON')
    if result:
        # send_http returns a (response, proxy, headers) tuple on success.
        response, self.proxy, self.map_headers = result
        page_data = response.json()
        return page_data
Example #3
def get(self, headers=HEADERS, proxy=None):
    result = send_http(self.session,
                       'get',
                       self.url,
                       retries=MAX_RETRY,
                       headers=headers,
                       proxy=proxy,
                       timeout=TIMEOUT,
                       kind='SHOP',
                       )
    if result:
        response, self.proxy, self.headers = result
        self.homepage = response.text
        self._fetched = True
        logger.info(f'Successfully fetched homepage of shop {self.id}.')
Example #4
def get_map(self, headers=HEADERS, proxy=None):
    url = API_CITY_MAP.format(id=self.id)
    result = send_http(self.session,
                       'get',
                       url,
                       retries=MAX_RETRY,
                       headers=headers,
                       proxy=proxy,
                       timeout=TIMEOUT,
                       kind='MAP',
                       )
    if result:
        response, self.proxy, self.headers = result
        self.map_page = response.text
        logger.info(f'Successfully fetched the map search page for "{self.city}".')
Example #5
File: comment.py Project: mwsssxu/DPspider
def get(self, url=None, headers=LOGIN_HEADERS, proxy=None):
    _url = url if url else self.home_url
    result = send_http(self.session,
                       'get',
                       _url,
                       retries=MAX_RETRY,
                       headers=headers,
                       proxy=proxy,
                       timeout=TIMEOUT,
                       kind='SHOP')
    if result:
        response, self.proxy, self.headers = result
        self.homepage = response.text
        logger.info(f'Successfully fetched review pages for shop {self.id}.')
    else:
        self.homepage = None
Example #6
def get_shop_css(self, reget=False):
    src = from_pattern(PATTERN_CSS, self.homepage)
    if src:
        url = '//'.join([CSS_URL_PREFIX, src])
        result = send_http(self.session,
                           'get',
                           url,
                           retries=MAX_RETRY,
                           headers=self.css_headers,
                           proxy=self.css_proxy,
                           timeout=TIMEOUT,
                           kind='CSS',
                           )
        if result:
            response, self.css_proxy, self.css_headers = result
            self.css = response.text
            return self.css
Example #7
def fetch_map_page(self, data):
    self.map_headers['Referer'] = SEARCH_MAP_POST_REFERER \
        .format(data["cityId"], data["regionId"], quote(data["keyword"]))
    result = send_http(self.session,
                       'post',
                       API_MAP_SEARCH,
                       retries=MAX_RETRY,
                       proxy=self.proxy,
                       headers=self.map_headers,
                       timeout=TIMEOUT,
                       data=data,
                       kind='JSON'
                       )
    if result:
        response, self.proxy, self.map_headers = result
        page_data = response.json()
        return page_data
Example #8
File: city.py Project: LXF-DX3906/DPspider
def get_hot(self):
    """
    Fetch the top-ten hot search keywords for the current city.
    :return: [{'子标签': '8', '索引': '0', '主分类id': '', '数据类型': '3000', 'id_': '587192', '关键词': '三里屯'}, ...]
    """
    url = API_CITY_HOT.format(id=self.id)
    result = send_http(self.session,
                       'get',
                       url,
                       headers=self.headers,
                       retries=MAX_RETRY,
                       kind='JSON',
                       proxy=self.proxy)
    if result:
        response, self.proxy, _ = result
        data = response.json()
        self._hot = [i['valueMap'] for i in data['recordList']]
        return self._hot
Example #9
def start_request(self):
    headers = SHOP_INFO_HEADERS
    headers['Referer'] = "http://www.dianping.com/shop/{}".format(
        self.shopId)
    result = send_http(
        self.session,
        'get',
        self.url,
        retries=MAX_RETRY,
        headers=headers,
        timeout=TIMEOUT,
        _token=self.token.new(),
        kind='SHOP',
    )
    if result:
        response, _, _ = result
        self.homepage = response.json()
        self._fetched = True
        logger.info(f'Successfully fetched details for shop {self.shopId}.')
Example #10
def get(self, headers=HEADERS, proxy=None):
    """
    Fetch the homepage of the current city.
    :param proxy: proxy to use
    :param headers: spoofed request headers
    """
    result = send_http(self.session,
                       'get',
                       self.url,
                       retries=MAX_RETRY,
                       headers=headers,
                       proxy=proxy,
                       timeout=TIMEOUT,
                       kind='CITY',
                       )
    if result:
        response, self.proxy, self.headers = result
        self.homepage = response.text
        logger.info(f'Successfully fetched the homepage of "{self.city}".')
Example #11
def get_relative(self, keyword):
    """
    Return the search results related to a keyword, with their result counts.
    :param keyword: the keyword
    :return: {related result: count, ...}
    """
    url = API_KEY_RELATIVE.format(id=self.id, key=keyword)
    result = send_http(self.session,
                       'get',
                       url,
                       headers=self.headers,
                       retries=MAX_RETRY,
                       kind='JSON',
                       proxy=self.proxy)
    if result:
        response, self.proxy, _ = result
        data = response.json()
        res = {i.split('|')[0]: from_pattern(PATTERN_NUMS, i.split('|')[-2])
               for i in data['msg']['shop']}
        return res
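A hedged usage sketch of get_relative, assuming city is an already initialised instance of the class this method belongs to; the keyword is chosen purely for illustration:

# Hypothetical usage; `city` is assumed to be an existing object exposing get_relative().
related = city.get_relative('hotpot')         # illustrative keyword
if related:
    for result, count in related.items():     # {related result: count, ...} per the docstring
        print(result, count)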
Example #12
def _get_num_svg(url):
    resp = send_http(requests.Session(),
                     'get',
                     url,
                     retries=-1,
                     headers=CSS_HEADERS)
    if resp:
        text = bs(resp[0].text, 'lxml')
        texts = text('text')
        if not texts:
            # <textPath> variant: key each string by the offset taken from
            # the second token of the matching <path> "d" attribute.
            res = {}
            text_path = text('textpath')
            path = text('path')
            for idx, i in enumerate(path):
                d = i['d']
                num = int(d.split(' ')[1].strip())
                string = text_path[idx].text.strip()
                res[num] = string
            return res
        else:
            # Plain <text> variant: map each node's y offset to its string.
            ys = {i['y']: i.text for i in texts if i}
            return ys
Example #13
def get_city_list(url, headers=HEADERS, proxy=None):
    result = send_http(requests.Session(),
                       'get',
                       url,
                       retries=-1,
                       proxy=proxy,
                       headers=headers,
                       timeout=TIMEOUT,
                       kind='CITY_LIST',
                       )
    if result:
        text = result[0].text
        ul = get_sub_tag(text, 'city_list')
        if ul:
            # Write the {city name: city URL} map to the local JSON cache file.
            with open(CITY_LIST_FILE_PATH, 'w') as f:
                res = {}
                lis = ul('li')
                for li in lis:
                    _as = li('a')
                    for a in _as:
                        res[a.text] = CITY_URL_PREFIX + a['href']
                f.write(json.dumps(res))
            return res
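Every example above calls the same send_http helper and, on success, unpacks a three-element result. The stub below is a hedged reconstruction inferred only from these call sites; the parameter defaults, the retry semantics (including retries=-1 meaning retry indefinitely), and the error handling are assumptions, not the project's actual implementation:

# Inferred sketch only: parameter names come from the call sites above; everything
# else (defaults, retry behaviour, error handling) is an assumption. kind and _token
# are accepted but ignored here.
import requests

def send_http(session, method, url, retries=1, headers=None, proxy=None,
              timeout=None, data=None, kind=None, _token=None):
    """On success return a (response, proxy, headers) tuple, otherwise None."""
    attempt = 0
    while retries == -1 or attempt < retries:
        attempt += 1
        try:
            response = session.request(
                method, url,
                headers=headers,
                data=data,
                timeout=timeout,
                proxies={'http': proxy, 'https': proxy} if proxy else None,
            )
            if response.status_code == 200:
                # Callers unpack exactly three values: response, proxy, headers.
                return response, proxy, headers
        except requests.RequestException:
            pass
    return None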