Example #1
def http_emitter(message, logger, agentConfig):
    logger.debug('http_emitter: start')

    # Post back the data
    postBackData = format_body(message)

    logger.debug('http_emitter: attempting postback to ' + agentConfig['dd_url'])

    # Build the request handler
    apiKey = message.get('apiKey', None)
    if not apiKey:
        raise Exception("The http emitter requires an api key")

    url = "%s/intake?api_key=%s" % (agentConfig['dd_url'], apiKey)
    headers = post_headers(agentConfig, postBackData)

    proxy_settings = get_proxy(agentConfig)
    urllib2 = get_http_library(proxy_settings, agentConfig['use_forwarder'])

    try:
        request = urllib2.Request(url, postBackData, headers)
        # Do the request, log any errors
        opener = get_opener(logger, proxy_settings, agentConfig['use_forwarder'], urllib2)
        if opener is not None:
            urllib2.install_opener(opener)
        response = urllib2.urlopen(request)
        try:
            logger.debug('http_emitter: postback response: ' + str(response.read()))
        finally:
            response.close()
    except urllib2.HTTPError as e:
        if e.code == 202:
            logger.debug("http payload accepted")
        else:
            raise
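The get_opener helper isn't shown on this page, but for context, a proxy-aware opener built with the standard library typically looks something like the sketch below. This is only an illustration under the assumption that proxy_settings carries host and port entries (key names hypothetical), not the agent's actual implementation.

import urllib.request

def build_proxy_opener(proxy_settings):
    # Assumed shape: {"host": "proxy.example.com", "port": 3128}; return None when no proxy is configured.
    if not proxy_settings:
        return None
    proxy_url = "http://%s:%s" % (proxy_settings["host"], proxy_settings["port"])
    handler = urllib.request.ProxyHandler({"http": proxy_url, "https": proxy_url})
    return urllib.request.build_opener(handler)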
Example #2
def get_sub_area_logic(each):
    data = []
    url = 'http://you.ctrip.com/restaurantlist/%s.html' % each[-1]
    retry = 5
    html = ''
    while retry > 0:
        try:
            html = requests.get(url,
                                headers=headers,
                                proxies=config.get_proxy()).content.decode(
                                    config.ENCODING)
            break
        except Exception:
            # swallow the error and retry with (possibly) another proxy
            pass
        retry -= 1
    selector = etree.HTML(html)
    cons = selector.xpath('//div[@id="locationDiv"]/p/a')
    for i in cons:
        info = each.copy()
        area = i.xpath('text()')[0] if i.xpath('text()') else ''
        co = i.xpath('@onclick')[0] if i.xpath('@onclick') else ''
        codes = re.findall(r'OnRegion\((.*?)\)', co)
        code = codes[0] if codes else ''
        info[5] = area
        info.append(code)
        data.append(info)
    return data
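The scraper examples on this page hand the result of get_proxy() / config.get_proxy() directly to requests, so the helper presumably returns a proxies mapping in the format requests expects. A minimal sketch of such a helper, with a placeholder address that is not part of the original code:

import random

PROXY_POOL = ['http://127.0.0.1:8080']  # placeholder proxy addresses

def get_proxy():
    # requests expects a mapping of URL scheme to proxy URL
    proxy = random.choice(PROXY_POOL)
    return {'http': proxy, 'https': proxy}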
Example #3
    def get_cars(self,
                 start_year,
                 end_year,
                 maker,
                 model,
                 seller_type,
                 condition,
                 keywords=''):
        links = []
        item_on_page = 1000
        url = f'{self.base_url}/{maker.lower().replace(" ","-")}/{model.lower().replace(" ","-")}/on/milton/'
        payload = {
            'rcp': f'{item_on_page}',
            'rsc': str(0),  # page number
            'srt': str(3),
            'yRng': f'{start_year},{end_year}',
            'prx': f'{self.search_radius}',
            'prv': 'Ontario',
            'loc': f'{self.post_code}',
            'hprc': True,
            'wcp': True,
            'sts': f'{condition}',
            'adtype': f'{seller_type}',
            'showcpo': str(1),
            'inMarket': 'advancedSearchs'
        }
        if keywords != '':
            payload['kwd'] = keywords

        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=20)
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        session.proxies.update(get_proxy())
        response = session.get(url, headers=HEADERS, params=payload)
        logging.debug('response status code: {}'.format(response.status_code))
        logging.debug("response url: {}".format(response.url))
        logging.debug("\n Maker:{} \n Model:{} \n Keyword: {}".format(
            maker, model, keywords))
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        cars_count = soup.find('div', {
            'class': 'results-count-wrapper'
        }).find('span', {
            'id': 'sbCount'
        }).text
        if int(cars_count) != 0:
            logging.debug('cars url: {}'.format(response.url))
            logging.debug('cars found: {}'.format(cars_count))
            page_count = math.ceil(int(cars_count) / 15)  # computed but not used below
            j_data = soup.find('div', {
                'class': 'col-xs-12 disable-on-search'
            }).find('script', {'type': 'application/ld+json'})
            json_data = json.loads(
                str(j_data).replace('</script>', '').replace(
                    '<script type="application/ld+json">', '').strip())
            for data in json_data['offers']['offers']:
                links.append('https://www.autotrader.ca' + data['url'])

        return links
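One detail worth flagging in the session setup above: when HTTPAdapter(max_retries=...) is given a plain integer, requests only retries failed connections (DNS errors, connection timeouts); it does not retry on HTTP status codes such as 503. If status-based retries with backoff are wanted, urllib3's Retry object can be mounted instead, roughly as sketched here (not part of the original code):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=5, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
session.mount('https://', adapter)
session.mount('http://', adapter)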
Example #4
    def __init__(self, event=None):
        self.event = event
        self._session = Session()
        self.github_proxy = get_proxy()
        try:
            self._session.headers.update(
                {"Authorization": f"token {get_token()}"})
        except NoGitHubTokenException:
            self._session.auth = get_credentials()
        self._session.headers.update({"User-Agent": APP_NAME})
        if event is not None:
            self.issue_url = event['pull_request']['issue_url']
Example #5
def kj_scrape_models():
    models_dict = {}
    session = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=20)
    session.mount('https://', adapter)
    session.mount('http://', adapter)
    session.proxies.update(get_proxy())
    # url = 'https://www.kijijiautos.ca/cars/audi/#ms=1900&od=down&sb=relv3'
    for key, val in kj_get_makers().items():
        models_dict[key] = []
        url = 'https://www.kijijiautos.ca/cars/{}/#ms={}&od=down&sb=relv3'.format(
            key.lower().replace(' ', '-'), val)
        response = session.get(url, proxies=get_proxy())
        if response.status_code == 503:
            session = requests.Session()
            adapter = requests.adapters.HTTPAdapter(max_retries=20)
            session.mount('https://', adapter)
            session.mount('http://', adapter)
            session.proxies.update(get_proxy())
            # print('scraper was block')
            # print('waiting for response after 503')
            time.sleep(30)
            response = session.get(url, proxies=get_proxy())
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        models_block = soup.find_all('div', {'class': 'b3Ood7 dpzS6u'})
        try:
            models = models_block[1].find('select').find_all('option')
            for opt in models:
                model_name = opt.text
                model_id = opt.get('value')
                # print(key, model_name, model_id)
                models_dict[key].append({model_name: model_id})
        except IndexError:
            # some maker pages (e.g. the "Other" category) have no models dropdown
            print('no models block found for {}'.format(key))

    with open('kj_models.yaml', 'w') as outfile:
        yaml.dump(models_dict, outfile, default_flow_style=False)
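Example #5 sets proxies both on the session (session.proxies.update(...)) and per request (proxies=get_proxy()). In requests, the per-call proxies argument takes precedence over session.proxies for that request, so the two calls may well go through different proxies. A short illustration with placeholder addresses:

import requests

session = requests.Session()
session.proxies.update({'https': 'http://session-proxy.local:8080'})  # placeholder

# The per-request proxies override the session entry for the same scheme on this
# call only; later calls without the argument fall back to session.proxies.
resp = session.get('https://example.com',
                   proxies={'https': 'http://request-proxy.local:8080'})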
Example #6
def kj_scrape_makers():
    url = 'https://www.kijijiautos.ca/'
    makers_dict = {}
    response = requests.session().get(url,
                                      headers=KJ_HEADERS,
                                      proxies=get_proxy())
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    makers_block = soup.find('div', {'class': 'bpzS6u'})
    makers_id = makers_block.find_all('input')
    for shildik in makers_id:
        name = shildik.get('name')
        _id = shildik.get('id').split(' ')[1].replace('makes-', '')
        makers_dict[name] = _id
    with open('kj_makers.yaml', 'w') as outfile:
        yaml.dump(makers_dict, outfile, default_flow_style=False)
Example #7
    def get_cars(self, start_year, end_year, maker, model, seller_type, condition, keywords=''):
        links = []
        if seller_type == 'Private':
            transform_seller_type = 'FSBO'
        elif seller_type == 'Diller':
            transform_seller_type = 'DILLER'
        else:
            transform_seller_type = ''

        model_qs = kj_get_models(maker, model)
        url = 'https://www.kijijiautos.ca/consumer/srp/by-params'
        payload = {
            'sb': 'relv3',
            'od': 'down',
            'ms': model_qs,
            'yc': f'{start_year}:{end_year}',
            'st': transform_seller_type,
            'ps': '0',
            'psz': '500',
            'vc': 'Car',
            # 'con': f'{str(condition).upper()}',
            'll': '43.52318260000001,-79.8547073',
            'rd': self.search_radius
        }
        if keywords != '':
            payload['q'] = keywords
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=20)
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        session.proxies.update(get_proxy())
        resp = session.get(url, headers=KJ_HEADERS, params=payload)
        logging.debug('response status code: {}'.format(resp.status_code))
        logging.debug("response url: {}".format(resp.url))
        logging.debug("\n Maker:{} \n Model:{} \n Keyword: {}".format(maker, model, keywords))
        j_data = json.loads(resp.text)
        for i in j_data['listings']['items']:
            link = f'{self.base_url}/{maker.lower().replace(" ","-")}/{model.lower().replace(" ","-")}/{condition.lower()}/#vip={i["id"]}'
            links.append(link)

        return links
Example #9
def get_city_list():
    prov_list = [
        i.strip().split('\u0001')
        for i in open(config.PROVS_LIST, 'r', encoding='utf8')
    ]
    all_citys = [
        i.strip().split(config.BLANK)
        for i in open(config.ALL_CITY_LIST, 'r', encoding=config.ENCODING)
    ]
    data = []
    for each in prov_list:
        print(each)
        url = 'http://you.ctrip.com/sitemap/place/c%s' % each[2]
        html = requests.get(url, headers=headers,
                            proxies=config.get_proxy()).content.decode('utf8')
        selector = etree.HTML(html)
        cons = selector.xpath('//div[@class="sitemap_block"]/ul[1]/li')
        for i in cons:
            city = i.xpath('a/text()')[0].replace('旅游攻略', '')
            url = i.xpath('a/@href')[0].split('/')[-1].replace('.html', '')
            data.append([each[0], city, url])

    text = ''
    for i in data:
        for t in all_citys:
            if i[0] in t and i[1] in t:
                count = t.index(i[1])
                if count == 2 or count == 3:
                    t[4], t[5] = '', ''
                    t[-1] = str(re.findall(r'\d\d\d\d', t[-1])[0]) + '00'
                text += config.BLANK.join(t) + config.BLANK + i[-1] + '\n'
                break

    with open(config.CITY_LIST, 'a', encoding=config.ENCODING) as f:
        f.write(text)