Example #1
def parse_details_html(url):
    try:
        soup = to_soup(url)

        # Core offer attributes come from the "szczegoly-oferty" (offer details) table.
        basic_data = as_dict(
                table_into_json(soup.find_all('section', {'id': 'szczegoly-oferty'})[0].table))
        # Extra fees live in an optional "dodatkowe-oplaty" (additional charges) section.
        extended_data_sections = soup.find_all('section', {'id': 'dodatkowe-oplaty'})
        extended_data = {}
        if extended_data_sections:
            extended_data = as_dict(table_into_json(extended_data_sections[0].table))

        return {
            u"Link": url,
            u"Region": retrieve_meta(soup, 'dimension-region'),
            u"Ulica": retrieve_meta(soup, "streetAddress", "itemprop").lower().strip("ul.").strip(),
            u"Cena mieszkania": as_int(retrieve_meta(soup, 'dimension-price')),
            u"Cena za metr": as_int(retrieve_meta(soup, 'dimension-price-m2')),
            u"Powierzchnia": as_float(retrieve_meta(soup, 'dimension-area')),
            u"Pokoje": as_int(retrieve_meta(soup, 'dimension-rooms')),
            u'Cena parkingu': parse_parking_places(basic_data['Miejsca postojowe:'][1]),
            u'Piętro': as_int(retrieve_meta(soup, 'dimension-floor')),
            u"Koszty dodatkowe": sum([as_int(value) for value in extended_data.values()]) or None,
            u"Długosć geograficzna": as_float(retrieve_meta(soup, 'longitude', 'itemprop')),
            u"Szerokość geograficzna": as_float(retrieve_meta(soup, 'latitude', 'itemprop')),
            u"Termin": as_date(
                    one_of(basic_data, ['Realizacja inwestycji:', u'Realizacja nieruchomości:'])[
                    -16:-6])
        }
    except requests.exceptions.ConnectionError:
        return url
    except Exception as e:
        raise Exception("Failed to fetch %s; %s" % (url, traceback.format_exc()), e)
Example #2
def scrap_content(url):
    res = get_request(url)
    soup = to_soup(res)

    # The article body lives inside the "inner-block" container; tag attributes
    # are stripped before the markup is handed to clear_content().
    content = soup.find("div", {"id": "inner-block"})
    content = remove_attributes(content)
    return clear_content(content)
Example #3
def parse_search_page(page, region=11158):
    url = ("https://rynekpierwotny.pl/oferty/?type=&region={region}"
           "&distance=0&price_0=&price_1=&area_0=&area_1=&rooms_0=&rooms_1="
           "&construction_end_date=&price_m2_0=&price_m2_1=&floor_0=&floor_1="
           "&offer_size=&keywords=&is_luxury=&page={page}&is_mdm=&is_holiday=&lat=&lng=&sort=").format(
            region=region,
            page=page
    )
    soup = to_soup(url)
    links = []
    for result in soup.find_all('h2', {'class': 'offer-item-name'}):
        links.append(make_link(result.a))
    return links
Example #4
def parse_invest_html(url):
    try:
        soup = to_soup(url)
        body = soup.find_all('div', {"data-config-table-container": "propertyListFull"})

        links = []
        if body:
            for row in body[0].tbody.find_all('tr'):
                # A row sometimes carries two anchors; prefer the second one when present.
                potential_links = row.find_all('a', href=True)
                if len(potential_links) == 2:
                    links.append(make_link(potential_links[1]))
                else:
                    links.append(make_link(potential_links[0]))
        return links
    except Exception as e:
        raise Exception("Failed to fetch %s; %s" % (url, traceback.format_exc()), e)
Example #5
def get_public_transport_time(gps_from, gps_to, time=None):
    # Default to "now" at call time; a datetime.now() default in the signature
    # would be evaluated only once, when the module is imported.
    if time is None:
        time = datetime.datetime.now()
    # Shift Python's %w weekday (0 = Sunday) into the day code used in the query below.
    weekday = (int(time.strftime("%w")) + 5) % 6

    url = (
        "http://www.m.rozkladzik.pl/krakow/wyszukiwarka_polaczen.html?"
        "from={from_x};{from_y}|c|{from_x}|{from_y}&"
        "to={to_x};{to_y}|c|{to_x}|{to_y}&profile=opt&maxWalkChange=400&minChangeTime=2&time={time}&day={day}".format(
            from_x=gps_from[0],
            from_y=gps_from[1],
            to_x=gps_to[0],
            to_y=gps_to[1],
            time=time.strftime("%H:%M"),
            day=weekday,
        )
    )

    soup = to_soup(url)

    # Each route summary row carries a total travel time; the fastest connection wins.
    times = []
    for sum_row in soup.find_all("div", {"class": "route_sum_row"}):
        time_td = sum_row.find_all("td", {"class": "time"})[0]
        times.append(as_int(stringify_child(time_td)[1]))
    return min(times)
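The coordinates produced by parse_details_html can be fed straight into this function. A small sketch, assuming the tuples are ordered (latitude, longitude) and using a placeholder destination (neither detail is confirmed by the examples above):

# Hypothetical usage: commute time from a parsed offer to a fixed destination.
def commute_minutes(offer, destination=(50.0647, 19.9450)):
    # Coordinate order and the default destination point are assumptions.
    origin = (offer[u"Szerokość geograficzna"], offer[u"Długosć geograficzna"])
    return get_public_transport_time(origin, destination)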
Example #6
def crawl_articles(url):
    res = get_request(url)
    soup = to_soup(res)
    divs = soup.find_all("div", {"class": "vest_container"})
    news_data = []

    for div in divs:
        # Each "vest_container" block holds one article teaser.
        date = div.find("div", {"class": "meta"}).text
        picture = div.find("img")["src"]
        description = div.find("p").text
        title = div.find("h2").text
        tip = div.find("div", {"class": "img_desc"}).text
        # Use a separate name so the article link does not shadow the listing-page url argument.
        article_url = div.find("a")["href"]
        content = scrap_content(article_url)
        news_data.append({
            "title": title,
            "dateInfo": date,
            "picture": picture,
            "description": description,
            "type": tip,
            "url": url,
            "content": content
        })
    return news_data
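Since crawl_articles returns plain dicts, persisting a run is straightforward. A minimal sketch; listing_url stands in for the news index page and the output path is arbitrary:

# Hypothetical usage: dump one listing page's articles to a JSON file.
import json

articles = crawl_articles(listing_url)  # listing_url is a placeholder
with open("articles.json", "w") as fh:
    json.dump(articles, fh, indent=2)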