Example #1
def crawl_state():
    """Create all state info.
    """
    logger = create_zillow_crawler_logger()
    driver = create_webdriver()
    input("Press Enter when your browser is ready... ")

    logger.info("Crawl all state info ...")
    with driver as driver:
        url = urlencoder.browse_home_listpage_url()
        try:
            html = driver.get_html(url)
            while True:
                if "http://www.google.com/recaptcha/api.js" in html:
                    logger.info("Captcha Warning!", 1)
                    input("Please Solve the Captcha! Then Press Enter ...")
                    html = driver.get_html(url)
                else:
                    break

            data = list()
            try:
                for link, name in htmlparser.get_items(html, url):
                    state = State(_id=link, name=name, status=StatusCode.todo)
                    state.key = state.get_key()
                    data.append(state)
            except Exception as e1:  # HtmlParseError or CaptchaError, most likely Zillow blocks me
                logger.error("%s" % e1, 1)

            if len(data):
                State.smart_insert(data)
        except Exception as e2:
            logger.error("failed to crawl %s: %s" % (url, e2), 1)
Example #2
def crawl_county():
    """Create all county info.
    """
    logger = create_zillow_crawler_logger()
    driver = create_webdriver()
    input("Press Enter when your browser is ready... ")

    # select todo list
    filters = {"status": {"$ne": StatusCode.finished}}
    state_list = State.by_filter(filters).all()
    logger.info("Crawl county from %s state ..." % len(state_list))

    with driver as driver:
        for state in state_list:
            url = state.url
            logger.info("Crawl %s ..." % url, 1)

            try:
                # get html
                html = driver.get_html(url)
                while True:
                    if "http://www.google.com/recaptcha/api.js" in html:
                        logger.info("Captcha Warning!", 1)
                        input("Please Solve the Captcha! Then Press Enter ...")
                        html = driver.get_html(url)
                    else:
                        break

                # parse data
                try:
                    data = list()
                    for link, name in htmlparser.get_items(html, url):
                        county = County(
                            _id=link,
                            state=state.key,
                            name=name,
                            status=StatusCode.todo,
                        )
                        county.key = county.get_key()
                        data.append(county)

                    # page has many items
                    if len(data):
                        County.smart_insert(data)
                        state.n_children = len(data)
                        state.status = StatusCode.finished
                        logger.info("Success", 2)
                    # most likely this listpage has no items, there's no error
                    else:
                        state.status = StatusCode.crawled_but_has_error
                        logger.info("No data", 2)

                except Exception as e1:  # HtmlParseError or CaptchaError, most likely Zillow blocks me
                    state.status = StatusCode.crawled_but_has_error
                    logger.error("%r" % e1, 2)

            except Exception as e2:  # HttpError
                logger.error("Http error: %s" % e2, 2)
                state.status = StatusCode.failed_to_crawl

            state.save()
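
Each document advances through a small status machine: it starts at StatusCode.todo, becomes finished once its children are extracted, and is marked crawled_but_has_error or failed_to_crawl so a later run can pick it up again. A sketch of the enum with illustrative integer values; only the four attribute names are confirmed by these examples (the real zillowdb.model.StatusCode is not shown):

class StatusCode:
    # illustrative values; only the attribute names appear in the examples
    todo = 0                    # not crawled yet
    failed_to_crawl = 10        # HTTP request failed, retry on the next run
    crawled_but_has_error = 40  # page fetched, but parsing failed or was empty
    finished = 50               # children extracted and inserted
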
Example #3
def crawl_address():
    """Create all county info.
    """
    logger = create_zillow_crawler_logger()
    driver = create_webdriver()
    input("Press Enter when your browser is ready... ")

    # select todo list
    filters = {
        "state": "md",
        "status": {
            "$ne": StatusCode.finished
        },
    }
    street_list = list(Street.by_filter(filters))
    logger.info("Crawl address from %s street ..." % len(street_list))

    counter = len(street_list)
    with driver as driver:
        for street in street_list:
            counter -= 1
            url = street.url
            logger.info("Crawl %s, %s left ..." % (url, counter), 1)

            col = address_col_mapper[street.state]
            try:
                # get html
                html = driver.get_html(url)
                while True:
                    if "http://www.google.com/recaptcha/api.js" in html:
                        logger.info("Captcha Warning!", 1)
                        input("Please Solve the Captcha! Then Press Enter ...")
                        html = driver.get_html(url)
                    else:
                        break

                # parse data
                try:
                    data = list()
                    for link, name in htmlparser.get_items(html, url):
                        address = Address(
                            _id=link,
                            state=street.state,
                            county=street.county,
                            zipcode=street.zipcode,
                            street=street.key,
                            name=name,
                            status=StatusCode.todo,
                        )
                        address.key = address.get_key()
                        doc = address.to_dict()
                        data.append(doc)

                    # page has many items
                    if len(data):
                        # addresses are sharded into one collection per state,
                        # so use the generic pymongo_mate.smart_insert helper
                        pymongo_mate.smart_insert(col, data)
                        street.n_children = len(data)
                        street.status = StatusCode.finished
                        logger.info("Success", 2)
                    # most likely this listpage has no items, there's no error
                    else:
                        street.status = StatusCode.crawled_but_has_error
                        logger.info("No data", 2)

                # HtmlParseError or CaptchaError, most likely Zillow blocks me
                except Exception as e1:
                    street.status = StatusCode.crawled_but_has_error
                    logger.error("%r" % e1, 2)

            # HttpError
            except Exception as e2:
                logger.error("Http error: %s" % e2, 2)
                street.status = StatusCode.failed_to_crawl

            street.save()
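
Addresses are sharded into one MongoDB collection per state, which is why this example looks up address_col_mapper[street.state] instead of using a single Address model. A minimal sketch of how such a mapper could be built, assuming a local MongoDB; the database and collection names are assumptions:

from pymongo import MongoClient

client = MongoClient(host="localhost", port=27017)
db = client["zillowdb"]  # the database name is an assumption

# one address collection per two-letter state code,
# e.g. address_col_mapper["md"] -> db["address_md"]
address_col_mapper = {
    state: db["address_%s" % state]
    for state in ["md", "va", "dc"]  # illustrative subset of states
}
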
Example #4
def crawl_house_detail_from_zillow():
    """Crawl house detail from zillow. Zillow has more address available than
    Trulia.
    """

    def select_address():
        filters = {
            "status_zillow": StatusCode.todo,
            "county": "montgomery-county",
        }
        wanted = {
            "_id": True,
            # "name": True,
            # "county": True,
            "state": True,
            # "zipcode": True,
        }
        data = list()
        for state in [
                "md",
        ]:
            col = address_col_mapper[state]
            for doc in col.find(filters, wanted):
                data.append(doc)
        return data

    logger = create_zillow_crawler_logger()
    address_list = select_address()
    counter = len(address_list)
    logger.info("Crawl %s address detail ..." % counter)

    driver = create_webdriver()
    input("Press Enter when your browser is ready... ")

    for doc in address_list:
        counter -= 1
        url = urlencoder.url_join(doc["_id"])
        logger.info("Crawl %s, %s left ..." % (url, counter))

        set_doc = dict()
        try:
            # get html
            html = driver.get_html(url)
            while True:
                if "http://www.google.com/recaptcha/api.js" in html:
                    logger.info("Captcha Warning!", 1)
                    input("Please Solve the Captcha! Then Press Enter ...")
                    html = driver.get_html(url)
                else:
                    break

            try:
                data = htmlparser.get_house_detail(html)
                if data is None:
                    set_doc["status_zillow"] = StatusCode.crawled_but_has_error
                    logger.info(exc.ParseError(url), 1)
                else:
                    set_doc["zillow_detail"] = data
                    set_doc["status_zillow"] = StatusCode.finished
                    logger.info("Success!", 1)

            except Exception as e1:
                set_doc["status_zillow"] = StatusCode.crawled_but_has_error
                logger.info(exc.ParseError(str(e1)), 1)

        except Exception as e2:
            set_doc["status_zillow"] = StatusCode.failed_to_crawl
            logger.info("http request error: %s" % url, 1)

        col = address_col_mapper[doc["state"]]
        col.update_one({"_id": doc["_id"]}, {"$set": set_doc})

    logger.info("Complete!")
Example #5
    port=27017,
)


def read_all_state():
    all_state = [doc["key"] for doc in state_col.find()]
    all_state.sort()
    return all_state
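
For example, read_all_state() can drive a pass over the per-state address collections from the earlier examples:

for state in read_all_state():
    col = address_col_mapper[state]
    # count_documents assumes pymongo >= 3.7; older versions would use
    # col.find(filters).count() instead
    n_todo = col.count_documents({"status_zillow": StatusCode.todo})
    print("%s: %s addresses left to crawl" % (state, n_todo))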


#--- Unittest ---
if __name__ == "__main__":
    from zillowdb.logger import create_zillow_crawler_logger
    from zillowdb.model import StatusCode

    logger = create_zillow_crawler_logger()

    def fix_dc():
        """
        1. find dc state in state_col, change _id, key to "dc", status to 0, 
        n_children to None
        2. find all county {"state": "district-of-columbia-county"}
        """
        county_col.remove({"state": "district-of-columbia-county"})
        zipcode_col.remove({"state": "district-of-columbia-county"})

#     fix_dc()

    def browse_status():
        import prettytable