Example #1
def get_poi_list(url, crawler_list, kind):
    pois = []
    page = get_page(url)
    if kind == 'things-to-do':
        # attraction cards on a "things to do" listing page
        items = page.find_all('div', class_='attraction_clarity_cell')
    else:
        # restaurant cards carry ids of the form "eatery_<n>"
        items = page.find_all('div',
                              id=lambda x: x and x.startswith('eatery_'))

    for item in items:
        poi = {
            'href': item.find('a').get('href'),
            'name': item.find('a').get_text().strip()
        }
        poi_url = base_url + poi['href']  # avoid shadowing the url parameter
        if is_exists(poi_url):
            continue
        poi_data = ''
        if poi_url.endswith('html.html'):
            printer('yellow', 'Not download', poi_url)
            continue
        try:
            poi_data = get_poi(poi_url, poi, crawler_list, kind)
        except Exception as e:
            msg = '%s - %s' % (poi_url, e)
            printer('red', 'Error', msg)
        if poi_data:
            set_data(poi_data)
            pois.append(poi_data)  # collect too, so the return value is not always empty
    return pois
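A minimal driver sketch for this variant. The listing path is a hypothetical placeholder, and crawler_list mirrors the country/state fields the function reads; base_url, is_exists, and printer are the crawler's own helpers.

crawler_list = {'country': 'Portugal', 'state': 'Lisbon'}  # illustrative values
list_url = base_url + '/some-listing-page.html'            # hypothetical path
get_poi_list(list_url, crawler_list, 'things-to-do')       # saves each new POI via set_data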
Example #2
import requests
from time import sleep

from bs4 import BeautifulSoup


def get_page(url):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0'
    }
    printer('blue', 'Get', url)
    # the original wrapped this in a bare try/except and passed, which left
    # `html` undefined on failure and crashed with a NameError two lines
    # later; letting the error propagate to the caller's handler is safer
    html = requests.get(url, headers=headers)
    sleep(1)  # throttle: at most one request per second
    soup = BeautifulSoup(html.content, 'html.parser')
    return soup
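get_page couples fetching, a one-second throttle, and parsing, so callers work directly with the returned BeautifulSoup object. A usage sketch; the URL and selector here are illustrative only, not from the crawler:

soup = get_page('https://example.com/listing')  # illustrative URL
heading = soup.find('h1')
if heading is not None:
    print(heading.get_text().strip())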
from datetime import datetime


def set_data(data):
    # if is_exists(data['url']):
    #     return False
    data['create_date'] = datetime.now()
    data['updated'] = False
    satl = Satl(data['url'], data=data)  # the URL doubles as the primary key
    printer('magenta', 'Save', " %s - %s" % (satl.pk, satl.get('name')))
    satl.save()

    # this part was written to support updating images:
    # else:
    #     satl = Satl(data['url']).load()
    return False
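set_data expects the dict to already carry a url key, since that becomes the Satl primary key; it then stamps the bookkeeping fields before saving. A sketch with placeholder values:

data = {'url': base_url + '/some-poi.html',  # hypothetical URL
        'name': 'Example POI'}
set_data(data)  # adds create_date and updated, then persists via Satl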
Example #4
def get_images(satl_obj):
    # skip objects that already have files attached, so re-runs are cheap
    count = satl_obj.count_files()
    if count != 0:
        return False
    index = 1
    for url in satl_obj.get('images'):
        printer('cyan', 'Download', url)
        try:
            img = requests.get(url)
            satl_obj.attach_file_object(img.content, '%s.jpg' % index)
        except Exception as e:
            # log the failure instead of silently swallowing it
            printer('red', 'Error', '%s - %s' % (url, e))
        # advance even on failure so file names stay aligned with the list
        index += 1
    return True
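Because get_images returns early when files are already attached, it is safe to re-run. A usage sketch, assuming the Satl(url).load() pattern from the commented-out code in Example #2; poi_url is a placeholder:

satl = Satl(poi_url).load()  # poi_url is a placeholder
if get_images(satl):
    printer('green', 'Images', 'downloaded for %s' % satl.pk)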
Example #5
def make_pages_and_normalize_input(loop, keys):
    # listing pages are paginated in steps of 30: '', 'oa30-', 'oa60-', ...
    if loop == 0:
        page = ''
    else:
        page = 'oa%s-' % (loop * 30)

    # fall back to the country when state or name is missing
    state = keys.get('state', keys['country'])

    printer('green', 'Country', keys['country'])

    name = keys.get('name', keys['country'])

    return page, state, name
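The returned page prefix steps the listing offset by 30 per iteration. A quick illustrative call with placeholder values:

keys = {'country': 'Portugal', 'state': 'Lisbon'}  # illustrative values
for loop in range(3):
    page, state, name = make_pages_and_normalize_input(loop, keys)
    print(repr(page))  # '', 'oa30-', 'oa60-'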
Example #6
def get_poi(url, data_dict, crawler_list, kind):
    page = get_page(url)

    popularity = get_text(page.find('span', class_='header_popularity'))

    # the coordinates are embedded in an inline script; grab the text node
    # that mentions "lat:" and extract lat/lng with the regexes below
    location = page.find(string=re.compile(r"lat:(.*?)"))
    images_obj = page.find('div', class_='page_images').find_all(
        'img', class_='centeredImg noscript')
    images = []
    for image in images_obj:
        images.append(image.get('src'))
    lat_pattern = re.compile(r"lat: (.*?),", re.MULTILINE | re.DOTALL)
    long_pattern = re.compile(r"lng: (.*?),", re.MULTILINE | re.DOTALL)
    lat = lat_pattern.search(location).group(1)
    lng = long_pattern.search(location).group(1)

    description = get_text(page.find('div', class_='description overflow'))
    hours = get_text(page.find('div', class_='section hours'))
    address = get_text(page.find('div', class_='section location'))
    phone = get_text(page.find('div', class_='detail_section phone'))

    # 'resturant' (sic) is the kind value this crawler passes for restaurants
    if kind == 'resturant':
        data_dict['name'] = get_text(page.find('h1', class_='heading_title'))
    data_dict['comment'] = get_text(
        page.find('div', class_='prw_reviews_text_summary_hsx'))
    data_dict['popularity'] = popularity
    data_dict['location'] = {'lat': float(lat), 'long': float(lng)}
    data_dict['images'] = images
    data_dict['url'] = url
    data_dict['description'] = description
    data_dict['hours'] = hours
    data_dict['address'] = address
    data_dict['phone'] = phone
    data_dict['type'] = kind
    data_dict['country'] = crawler_list['country']
    data_dict['state'] = crawler_list['state']
    printer('yellow', kind,
            '%s - %s' % (data_dict['country'], data_dict['state']))
    return data_dict
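The coordinate extraction depends on the page embedding lat/lng in an inline script. A self-contained check of the two patterns against a made-up snippet:

import re

location = "map: { lat: 38.7223, lng: -9.1393, zoom: 15 }"  # made-up sample
lat = re.search(r"lat: (.*?),", location).group(1)
lng = re.search(r"lng: (.*?),", location).group(1)
print(float(lat), float(lng))  # 38.7223 -9.1393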
import json


def get_poi_list(url, crawler_list, kind):
    pois = []
    page = get_page(url)
    items = []
    if kind == 'things-to-do':
        # attraction listings expose their items as JSON-LD; pick out the
        # ItemList payload among the ld+json script tags
        jsons = page.find_all("script", type="application/ld+json")
        for j in jsons:
            dataj = json.loads(j.string)
            if dataj['@type'] == 'ItemList':
                items = dataj['itemListElement']
                break
    else:
        # restaurant cards carry ids of the form "eatery_<n>"
        items = page.find_all('div',
                              id=lambda x: x and x.startswith('eatery_'))

    for item in items:
        # note: this dict-style access fits the JSON-LD items; eatery divs
        # from the else branch would need the anchor-based extraction of
        # Example #1
        poi = {'name': item['name']}
        poi_url = base_url + item['url']  # avoid shadowing the url parameter

        if is_exists(poi_url):
            continue
        poi_data = ''
        if poi_url.endswith('html.html'):
            printer('yellow', 'Not download', poi_url)
            continue
        try:
            poi_data = get_poi(poi_url, poi, crawler_list, kind)
        except Exception as e:
            msg = '%s - %s' % (poi_url, e)
            printer('red', 'Error', msg)
        if poi_data:
            pois.append(poi_data)

    return pois
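Unlike Example #1, this variant only collects the POIs and leaves persistence to the caller, matching the set_data helper above; list_url is the same placeholder as in the earlier sketch:

pois = get_poi_list(list_url, crawler_list, 'things-to-do')
for poi in pois:
    set_data(poi)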