def http_emitter(message, logger, agentConfig):
    logger.debug('http_emitter: start')

    # Post back the data
    postBackData = format_body(message)

    logger.debug('http_emitter: attempting postback to ' + agentConfig['dd_url'])

    # Build the request handler
    apiKey = message.get('apiKey', None)
    if not apiKey:
        raise Exception("The http emitter requires an api key")

    url = "%s/intake?api_key=%s" % (agentConfig['dd_url'], apiKey)
    headers = post_headers(agentConfig, postBackData)

    proxy_settings = get_proxy(agentConfig)
    urllib2 = get_http_library(proxy_settings, agentConfig['use_forwarder'])

    try:
        request = urllib2.Request(url, postBackData, headers)
        # Do the request, log any errors
        opener = get_opener(logger, proxy_settings, agentConfig['use_forwarder'], urllib2)
        if opener is not None:
            urllib2.install_opener(opener)
        response = urllib2.urlopen(request)
        try:
            logger.debug('http_emitter: postback response: ' + str(response.read()))
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # The intake endpoint answers a successful payload with 202 Accepted
        if e.code == 202:
            logger.debug("http payload accepted")
        else:
            raise
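
# A minimal usage sketch for http_emitter (illustrative only: the agentConfig
# values below are assumptions, and format_body, post_headers, get_proxy,
# get_http_library and get_opener must come from the surrounding codebase).
# Wrapped in a function so importing the module does not fire a network call.
def demo_http_emitter():
    import logging
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger('emitter-demo')
    agentConfig = {
        'dd_url': 'https://example.invalid',  # placeholder endpoint
        'use_forwarder': False,
    }
    http_emitter({'apiKey': 'dummy-key'}, logger, agentConfig)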
def get_sub_area_logic(each):
    data = []
    url = 'http://you.ctrip.com/restaurantlist/%s.html' % each[-1]
    retry = 5
    html = ''
    # Fetch the page, retrying up to five times on any error
    while retry > 0:
        try:
            html = requests.get(
                url, headers=headers,
                proxies=config.get_proxy()).content.decode(config.ENCODING)
            break
        except Exception:
            pass
        retry -= 1
    selector = etree.HTML(html)
    cons = selector.xpath('//div[@id="locationDiv"]/p/a')
    for i in cons:
        info = each.copy()
        area = i.xpath('text()')[0] if i.xpath('text()') else ''
        co = i.xpath('@onclick')[0] if i.xpath('@onclick') else ''
        # The sub-area code is embedded in an OnRegion(...) onclick handler
        codes = re.findall(r'OnRegion\((.*?)\)', co)
        code = codes[0] if codes else ''
        info[5] = area
        info.append(code)
        data.append(info)
    return data
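
# The retry loop above is a recurring pattern; a reusable sketch of it using
# only the requests library (fetch_with_retries is a hypothetical name, not
# part of the original code):
def fetch_with_retries(url, headers=None, proxies=None, retries=5, encoding='utf8'):
    """Fetch url and decode it, retrying on errors; return '' if all attempts fail."""
    import requests
    for _ in range(retries):
        try:
            return requests.get(url, headers=headers, proxies=proxies).content.decode(encoding)
        except (requests.RequestException, UnicodeDecodeError):
            continue
    return ''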
def get_cars(self, start_year, end_year, maker, model, seller_type,
             condition, keywords=''):
    links = []
    item_on_page = 1000
    url = f'{self.base_url}/{maker.lower().replace(" ","-")}/{model.lower().replace(" ","-")}/on/milton/'
    payload = {
        'rcp': f'{item_on_page}',
        'rsc': str(0),  # page number
        'srt': str(3),
        'yRng': f'{start_year},{end_year}',
        'prx': f'{self.search_radius}',
        'prv': 'Ontario',
        'loc': f'{self.post_code}',
        'hprc': True,
        'wcp': True,
        'sts': f'{condition}',
        'adtype': f'{seller_type}',
        'showcpo': str(1),
        'inMarket': 'advancedSearchs'
    }
    if keywords != '':
        payload['kwd'] = keywords
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=20)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    session.proxies.update(get_proxy())
    response = session.get(url, headers=HEADERS, params=payload)
    logging.debug('response status code: {}'.format(response.status_code))
    logging.debug('response url: {}'.format(response.url))
    logging.debug('\n Maker:{} \n Model:{} \n Keyword: {}'.format(maker, model, keywords))
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    cars_count = soup.find('div', {'class': 'results-count-wrapper'}).find('span', {'id': 'sbCount'}).text
    if int(cars_count) != 0:
        logging.debug('cars url: {}'.format(response.url))
        logging.debug('cars found: {}'.format(cars_count))
        page_count = math.ceil(int(cars_count) / 15)
        # Listing data is embedded in the page as JSON-LD; strip the <script>
        # wrapper and parse the payload
        j_data = soup.find('div', {'class': 'col-xs-12 disable-on-search'}).find(
            'script', {'type': 'application/ld+json'})
        json_data = json.loads(
            str(j_data).replace('</script>', '').replace(
                '<script type="application/ld+json">', '').strip())
        for data in json_data['offers']['offers']:
            links.append('https://www.autotrader.ca' + data['url'])
    return links
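
# The Session/HTTPAdapter boilerplate above recurs in several scrapers in this
# file; a small factory sketch that captures it (make_retrying_session is a
# hypothetical name; get_proxy is the helper already used above):
def make_retrying_session(max_retries=20):
    import requests
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=max_retries)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    session.proxies.update(get_proxy())  # proxy settings from the shared helper
    return session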
def __init__(self, event=None):
    self.event = event
    self._session = Session()
    self.github_proxy = get_proxy()
    try:
        self._session.headers.update({"Authorization": f"token {get_token()}"})
    except NoGitHubTokenException:
        # No token configured; fall back to basic-auth credentials
        self._session.auth = get_credentials()
    self._session.headers.update({"User-Agent": APP_NAME})
    if event is not None:
        self.issue_url = event['pull_request']['issue_url']
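
# A usage sketch (commented out because the enclosing class is not shown here;
# GitHubClient is a hypothetical name for it, and the event shape mirrors a
# GitHub pull_request webhook payload):
# client = GitHubClient(event={
#     'pull_request': {'issue_url': 'https://api.github.com/repos/owner/repo/issues/1'}
# })
# client._session.get(client.issue_url)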
def kj_scrape_models():
    models_dict = {}
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=20)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    session.proxies.update(get_proxy())
    for key, val in kj_get_makers().items():
        models_dict[key] = []
        url = 'https://www.kijijiautos.ca/cars/{}/#ms={}&od=down&sb=relv3'.format(
            key.lower().replace(' ', '-'), val)
        response = session.get(url, proxies=get_proxy())
        if response.status_code == 503:
            # Blocked (503): rebuild the session with a fresh proxy, wait,
            # and retry once
            session = requests.Session()
            adapter = requests.adapters.HTTPAdapter(max_retries=20)
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            session.proxies.update(get_proxy())
            time.sleep(30)
            response = session.get(url, proxies=get_proxy())
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        models_block = soup.find_all('div', {'class': 'b3Ood7 dpzS6u'})
        try:
            models = models_block[1].find('select').find_all('option')
            for opt in models:
                model_name = opt.text
                model_id = opt.get('value')
                models_dict[key].append({model_name: model_id})
        except IndexError:
            print('no model list found for maker: {}'.format(key))
    with open('kj_models.yaml', 'w') as outfile:
        yaml.dump(models_dict, outfile, default_flow_style=False)
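
# Reading the dumped model map back is symmetric; a short sketch using
# yaml.safe_load on the file written above (load_kj_models is a hypothetical
# name):
def load_kj_models(path='kj_models.yaml'):
    import yaml
    with open(path) as f:
        return yaml.safe_load(f)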
def kj_scrape_makers():
    url = 'https://www.kijijiautos.ca/'
    makers_dict = {}
    response = requests.session().get(url, headers=KJ_HEADERS, proxies=get_proxy())
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    makers_block = soup.find('div', {'class': 'bpzS6u'})
    makers_id = makers_block.find_all('input')
    # Each maker is rendered as a checkbox whose id ends with "makes-<id>"
    for shildik in makers_id:
        name = shildik.get('name')
        _id = shildik.get('id').split(' ')[1].replace('makes-', '')
        makers_dict[name] = _id
    with open('kj_makers.yaml', 'w') as outfile:
        yaml.dump(makers_dict, outfile, default_flow_style=False)
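
# The id parsing above assumes checkbox markup roughly like
# <input name="Audi" id="checkbox makes-1">; a tiny runnable illustration of
# the extraction on a literal snippet (the markup shape is an assumption):
def demo_maker_id_parse():
    import bs4
    html = '<div class="bpzS6u"><input name="Audi" id="checkbox makes-1"></div>'
    tag = bs4.BeautifulSoup(html, 'html.parser').find('input')
    return tag.get('name'), tag.get('id').split(' ')[1].replace('makes-', '')  # ('Audi', '1')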
def get_cars(self, start_year, end_year, maker, model, seller_type,
             condition, keywords=''):
    links = []
    # Map the seller type onto the site's query values
    if seller_type == 'Private':
        transform_seller_type = 'FSBO'
    elif seller_type == 'Diller':
        transform_seller_type = 'DILLER'
    else:
        transform_seller_type = ''
    model_qs = kj_get_models(maker, model)
    url = 'https://www.kijijiautos.ca/consumer/srp/by-params'
    payload = {
        'sb': 'relv3',
        'od': 'down',
        'ms': model_qs,
        'yc': f'{start_year}:{end_year}',
        'st': transform_seller_type,
        'ps': '0',
        'psz': '500',
        'vc': 'Car',
        # 'con': f'{str(condition).upper()}',  # condition filter currently disabled
        'll': '43.52318260000001,-79.8547073',
        'rd': self.search_radius
    }
    if keywords != '':
        payload['q'] = keywords
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=20)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    session.proxies.update(get_proxy())
    resp = session.get(url, headers=KJ_HEADERS, params=payload)
    logging.debug('response status code: {}'.format(resp.status_code))
    logging.debug('response url: {}'.format(resp.url))
    logging.debug('\n Maker:{} \n Model:{} \n Keyword: {}'.format(maker, model, keywords))
    j_data = json.loads(resp.text)
    for i in j_data['listings']['items']:
        link = f'{self.base_url}/{maker.lower().replace(" ","-")}/{model.lower().replace(" ","-")}/{condition.lower()}/#vip={i["id"]}'
        links.append(link)
    return links
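
# The listing links are rebuilt client-side from the template in the loop
# above; a standalone illustration with placeholder values (demo_vip_link and
# every value in it are assumptions, mirroring the f-string above):
def demo_vip_link(base_url='https://www.kijijiautos.ca/cars'):
    maker, model, condition, listing_id = 'Land Rover', 'Range Rover', 'Used', '12345'
    return (f'{base_url}/{maker.lower().replace(" ", "-")}'
            f'/{model.lower().replace(" ", "-")}/{condition.lower()}/#vip={listing_id}')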
def get_city_list():
    prov_list = [
        i.strip().split('\u0001')
        for i in open(config.PROVS_LIST, 'r', encoding='utf8')
    ]
    all_citys = [
        i.strip().split(config.BLANK)
        for i in open(config.ALL_CITY_LIST, 'r', encoding=config.ENCODING)
    ]
    data = []
    for each in prov_list:
        print(each)
        url = 'http://you.ctrip.com/sitemap/place/c%s' % each[2]
        html = requests.get(url, headers=headers,
                            proxies=config.get_proxy()).content.decode('utf8')
        selector = etree.HTML(html)
        cons = selector.xpath('//div[@class="sitemap_block"]/ul[1]/li')
        for i in cons:
            # Strip the '旅游攻略' ("travel guide") suffix from the city name
            city = i.xpath('a/text()')[0].replace('旅游攻略', '')
            url = i.xpath('a/@href')[0].split('/')[-1].replace('.html', '')
            data.append([each[0], city, url])
    text = ''
    # Join the scraped cities against the known city list on province and name
    for i in data:
        for t in all_citys:
            if i[0] in t and i[1] in t:
                count = t.index(i[1])
                if count == 2 or count == 3:
                    t[4], t[5] = '', ''
                    t[-1] = re.findall(r'\d\d\d\d', t[-1])[0] + '00'
                text += config.BLANK.join(t) + config.BLANK + i[-1] + '\n'
                break
    with open(config.CITY_LIST, 'a', encoding=config.ENCODING) as f:
        f.write(text)
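
# The area-code normalization above keeps the first four digits and zero-fills
# the last two; a tiny runnable illustration (demo_code_normalize and the
# sample value are assumptions):
def demo_code_normalize(code='110105'):
    import re
    return re.findall(r'\d\d\d\d', code)[0] + '00'  # '110105' -> '110100'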