Exemplo n.º 1
0
def parse_post(post, craigslist_area_name):
    """Build a ``RegularSearchPost`` from one search-result element.

    Args:
        post: Element for a single search result. It is read through
            ``.get()`` (``data-pid`` / ``data-repost-of`` attributes) and
            ``.cssselect()`` (time, link, price and housing nodes).
        craigslist_area_name: Key into the mapping returned by
            ``get_areas()``; also passed to ``get_url_base()`` when the
            result link does not already contain ``"http"``.

    Returns:
        A ``RegularSearchPost`` populated with ``id``, ``title``, ``url``,
        ``repost_id``, ``price``, ``bedrooms``, ``date`` and ``area``.
        Optional fields are ``None`` when the corresponding markup is absent.

    Raises:
        IndexError: If the ``time`` element or the result link is missing.
        ValueError: If a numeric field (pid, price, bedrooms, area) cannot
            be converted with ``int()``.
    """
    area_timezone = get_areas()[craigslist_area_name]['timezone']

    pid = int(post.get('data-pid'))
    # Read the attribute once; it is absent/empty for non-reposts.
    repost_of = post.get('data-repost-of')
    repost_pid = int(repost_of) if repost_of else None

    # The listing timestamp is treated as local to the area, then
    # normalised to UTC before being serialised.
    date_orig = post.cssselect('time')[0].get('datetime')
    date = arrow.get(date_orig).replace(
        tzinfo=area_timezone).to('utc').isoformat()

    # The same anchor carries both the href and the title text.
    link_el = post.cssselect("p.result-info > a")[0]
    url_orig = link_el.get('href')
    if "http" not in url_orig:
        url_orig = get_url_base(craigslist_area_name) + url_orig
    url = http_to_https(url_orig)
    title = link_el.text

    price_el = get_only_first_or_none(
        post.cssselect("span.result-meta > span.result-price"))
    price_raw = price_el.text if price_el is not None else None
    # Strip the thousands separator too: "$1,200" must parse as 1200
    # instead of raising ValueError.
    price = (int(price_raw.replace("$", "").replace(",", ""))
             if price_raw else None)

    housing_el = get_only_first_or_none(
        post.cssselect("p.result-info > span > span.housing"))
    # ``.text`` may be None when the span has no leading text; treat that
    # like a missing housing element.
    housing_text = (housing_el.text or "") if housing_el is not None else ""
    housing = [
        part.strip() for part in housing_text.split("-\n") if part.strip()
    ]

    bedrooms_raw = get_only_first_or_none([x for x in housing if "br" in x])
    num_bedrooms = int(bedrooms_raw.replace("br", "")) if bedrooms_raw else None

    area_raw = get_only_first_or_none([x for x in housing if "ft" in x])
    area = int(area_raw.replace("ft", "")) if area_raw else None

    return RegularSearchPost(
        **{
            "id": pid,
            "title": title,
            "url": url,
            "repost_id": repost_pid,
            "price": price,
            "bedrooms": num_bedrooms,
            "date": date,
            "area": area,
        })
Exemplo n.º 2
0
def process_post_url_output(body):
    """Parse the HTML of a single posting page into a ``DetailPost``.

    Args:
        body: The page HTML as a string.

    Returns:
        A ``DetailPost`` with the post id, repost id, canonical URL, titles,
        neighbourhood, bedroom count, square footage, price, address and the
        posting body as both HTML and plain text. Optional fields are
        ``None`` when the corresponding markup is absent.

    Raises:
        CraigslistException: If the page is a "not found" page, or if no
            ``pID`` variable is present in the page source.
        IndexError: If the canonical link, the posting title or the
            ``#postingbody`` element is missing.
    """
    if ("<title>craigslist | post not found</title>" in body
            or '<title>craigslist | Page Not Found</title>' in body):
        raise CraigslistException("post not found")

    # re.search returns None on no match, so test the match object
    # explicitly instead of relying on an exception from ``.groups()``.
    id_match = re.search(r'var pID = "(\d+)";', body)
    if id_match is None:
        raise CraigslistException("post id not found on page")
    id_ = int(id_match.group(1))

    # NOTE(review): repost_id is kept as the matched string (unlike id_,
    # which is an int) to preserve what callers currently receive.
    repost_match = re.search(r'var repost_of = (\d+);', body)
    repost_id = repost_match.group(1) if repost_match is not None else None

    doc = lxml.html.fromstring(body)
    url = http_to_https(doc.cssselect("link[rel=canonical]")[0].get('href'))

    # The full title joins every child of the title span except the last.
    title_children = doc.cssselect(
        "h2.postingtitle span.postingtitletext")[0].getchildren()[:-1]
    full_title = " ".join([x.text_content() for x in title_children])
    short_title = doc.cssselect(
        "h2.postingtitle span.postingtitletext #titletextonly")[0].text

    try:
        # TODO: deal with international prices
        price = doc.cssselect(
            "h2.postingtitle span.postingtitletext .price")[0].text.replace(
                '$', '')
    except (IndexError, AttributeError):
        # No price element, or the element has no text.
        price = None

    housing_els = doc.cssselect(
        "h2.postingtitle span.postingtitletext .housing")
    housing_el = housing_els[0] if housing_els else None

    num_bedrooms, area = None, None
    if housing_el is not None:
        try:
            num_bedrooms, area = parse_housing_el(
                housing_el.text.replace('/ ', ''))
        except Exception:
            # Best effort: housing details are optional, so any parsing
            # failure leaves both values as None.
            num_bedrooms, area = None, None

    try:
        hood = doc.cssselect(
            "h2.postingtitle span.postingtitletext #titletextonly + small"
        )[0].text.strip().lstrip('(').rstrip(')')
    except (IndexError, AttributeError):
        # No neighbourhood element, or the element has no text.
        hood = None

    try:
        address = doc.cssselect("div.mapaddress")[0].text
    except IndexError:
        address = None

    body_el = doc.cssselect("#postingbody")[0]
    # Strip the print-only QR code container(s) when present. drop_tree()
    # is used instead of remove() because remove() also discards the
    # element's tail text, which can be the start of the posting body.
    for qr_el in body_el.cssselect('div.print-qrcode-container'):
        qr_el.drop_tree()
    body_html = lxml.html.tostring(body_el).decode('utf-8')
    body_text = body_el.text_content().strip()
    # doc.cssselect("div.mapAndAttrs p.attrgroup") ????
    # [a.get('href') for a in doc.cssselect("#thumbs a")]
    return DetailPost(id=id_,
                      repost_id=repost_id,
                      url=url,
                      full_title=full_title,
                      short_title=short_title,
                      hood=hood,
                      num_bedrooms=num_bedrooms,
                      sqftage=area,
                      price=price,
                      body_html=body_html,
                      body_text=body_text,
                      address=address)