Пример #1
0
def cleanlistings(listings):
    """Clean a cursor of listings and collect the resulting units.

    Each document is wrapped in a Listing and cleaned. Processing stops at
    the first listing whose cleaning raises: the failure is logged and the
    results gathered up to that point are returned.

    Arguments:
        listings: pymongo cursor containing all listings; every document is
            read for its 'content', '_id' and 'zipcode' keys (and 'link'
            when a failure is reported)

    Returns:
        attrs: set of boolean attributes found in all the listings
        units: list of Unit objects representing the cleaned listings
    """
    logger = get_configured_logger('DEBUG', __name__)
    units = []
    attrs = set()
    # NOTE(review): Cursor.count() is deprecated in PyMongo 3.7 and removed
    # in 4.0; switching to Collection.count_documents() needs the query
    # filter, which is not available inside this function.
    count = listings.count()
    logger.info('Cleaning {} listings.'.format(count))
    for i, listing in enumerate(listings):
        logger.info('Cleaning listing {} of {}.'.format(i + 1, count))
        parsed = Listing(
            listing['content'], listing['_id'], listing['zipcode'])
        try:
            unit = parsed.clean()
            units.append(unit)
            # Fetch the attributes once and reuse them for both the log
            # line and the running set.
            found = parsed.getattrs()
            logger.debug('Attributes: {}'.format(found))
            attrs.update(found)
        except Exception as e:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning('Caught exception while cleaning.')
            logger.warning('Listing URL: {}'.format(listing['link']))
            logger.warning(e)
            # Abort the whole run on the first failure (original behavior);
            # the listings after this one are left uncleaned.
            break

    return attrs, units
Пример #2
0
def main(argv):
    """Main entry-point for batchclean.

    Queries the database for listings, optionally deletes the ones reported
    by findremoved, then cleans whatever is left and writes the resulting
    units and attributes back to the database.

    Arguments:
        argv: argument list; when it has a second element equal to
            'remove', the removal pass runs before cleaning
    """
    purge = len(argv) > 1 and argv[1] == 'remove'

    logger = get_configured_logger('DEBUG', __name__)

    mongoclient = get_mongoclient()
    logger.info('Retrieved mongoclient.')

    listings = queryforlistings(mongoclient)
    logger.info('Found {} listings.'.format(listings.count()))

    if purge:
        stale = findremoved(listings)
        message = '{} listings have been removed, deleting from DB.'
        logger.info(message.format(len(stale)))
        removelistings(mongoclient, stale)

    # Query again so the cleaning pass starts from a fresh cursor.
    listings = queryforlistings(mongoclient)
    attrs, units = cleanlistings(listings)
    attr_total = len(attrs)
    logger.info(
        'Observed {} unique attributes while cleaning.'.format(attr_total))
    unit_total = len(units)
    logger.info('Processed {} units'.format(unit_total))
    writeunitstomongo(mongoclient, units)
    writeattrstomongo(mongoclient, attrs)
Пример #3
0
 def __init__(self, content, listing_id, zipcode):
     """Set up a Listing from raw HTML content.

     Arguments:
         content: raw HTML of the listing; parsed with BeautifulSoup's
             'html.parser' and stored as self.soup
         listing_id: identifier stored on the listing and passed to Unit
         zipcode: zip code stored on the listing and passed to Unit
     """
     # Plain state copied from the arguments.
     self.content = content
     self.listing_id = listing_id
     self.zipcode = zipcode
     self.attrset = set()

     # Helper objects built from that state.
     self.logger = get_configured_logger('DEBUG', __name__)
     self.soup = BeautifulSoup(content, 'html.parser')
     self.unit = Unit(listing_id, zipcode)
Пример #4
0
def run_both(city, zipcodes, mongoclient):
    """Run a listing search and then a content scrape for each zip code.

    Arguments:
        city: passed to ZipCodeSearch together with each zip code
        zipcodes: iterable of zip codes, handled one at a time
        mongoclient: passed to ZipCodeSearch and to ContentScraper.execute
    """
    log = get_configured_logger('DEBUG', __name__)
    for code in zipcodes:
        # Search first, then scrape, each with a fresh object per zip code.
        ZipCodeSearch(city, code, mongoclient).execute()
        log.info('Compiled listings for zip code {}'.format(code))
        ContentScraper().execute(code, mongoclient)
        log.info('Gathered listings for zip code {}'.format(code))
Пример #5
0
def scrape_content(zipcodes, mongoclient):
    """Scrape listing content for each zip code in turn.

    Arguments:
        zipcodes: iterable of zip codes to scrape listings for
        mongoclient: MongoClient object passed to ContentScraper.execute
    """
    log = get_configured_logger('DEBUG', __name__)
    for code in zipcodes:
        # A new scraper is created for every zip code.
        ContentScraper().execute(code, mongoclient)
        log.info('Gathered listings for zip code {}'.format(code))
Пример #6
0
def run_search(city, zipcodes, mongoclient):
    """Run a listing search for each zip code in turn.

    City is required as well because it is part of the base URL for searching.

    Arguments:
        city: str, the city to use in the URL
        zipcodes: iterable of zip codes to search in for listings
        mongoclient: MongoClient object handed to each ZipCodeSearch
    """
    log = get_configured_logger('DEBUG', __name__)
    for code in zipcodes:
        # A new search object is created for every zip code.
        ZipCodeSearch(city, code, mongoclient).execute()
        log.info('Compiled listings for zip code {}'.format(code))
Пример #7
0
def removelistings(mongoclient, removed):
    """Delete the given listings from the scraper.listing collection.

    Arguments:
        mongoclient: database client for the database containing the listings.
        removed: list of listing documents to delete; each one is matched
            by its '_id'
    """
    log = get_configured_logger('DEBUG', __name__)
    total = len(removed)
    log.info('Removing {} listings.'.format(total))
    collection = mongoclient.scraper.listing
    for position, doc in enumerate(removed, start=1):
        # The progress line is emitted before the delete is issued.
        log.debug('  {} of {} removed from DB.'.format(position, total))
        collection.delete_one({"_id": doc['_id']})
Пример #8
0
    def __init__(self):
        """Init ContentScraper.

        proxy is set to value of HTTP_PROXY environment variable
        logger is retrieved from get_configured_logger function
        mongoclient and zipcode start as None; nothing in this method
        assigns them a real value.

        Raises:
            KeyError: if the HTTP_PROXY environment variable is not set.
        """
        self.logger = get_configured_logger('DEBUG', __name__)
        self.mongoclient = None
        self.proxy = os.environ['HTTP_PROXY']
        # Presumably delays in seconds between requests -- TODO confirm
        # against the code that reads them.
        self.sleeplong = 2
        self.sleepshort = 0.5
        self.ua = UserAgent()
        self.zipcode = None

        # The zip code is always None at this point, so it is left out of
        # the message; the message also now names this class rather than
        # "ListingScraper".
        self.logger.info('ContentScraper initialized')
    def __init__(self, city, state):
        """Set up a ZipCodeRequest for the given city and state.

        apikey is read from the ZIP_KEY environment variable
        base is the hardcoded Zip Code API URL template
        form is hardcoded to 'json'
        logger is retrieved from get_configured_logger function
        """
        # Read the key first so a missing ZIP_KEY fails before anything else.
        self.apikey = os.environ['ZIP_KEY']

        self.city, self.state = city, state
        self.form = 'json'
        self.base = 'https://www.zipcodeapi.com/rest/{}/city-zips.{}/{}/{}'

        self.logger = get_configured_logger('DEBUG', __name__)
        self.logger.info('ZipCodeRequest initialized')
Пример #10
0
    def __init__(self, city, zipcode, mongoclient):
        """Set up a ZipCodeSearch for one city and zip code.

        base is read from the BASE_URL environment variable
        city is stored lower-cased
        proxy is read from the HTTP_PROXY environment variable
        logger is retrieved from get_configured_logger function
        """
        # Calls that can fail or touch the outside world stay in their
        # original relative order.
        self.base = os.environ['BASE_URL']
        self.city = city.lower()
        self.logger = get_configured_logger('DEBUG', __name__)
        self.proxy = os.environ['HTTP_PROXY']
        self.ua = UserAgent()

        # Plain values.
        self.mongoclient = mongoclient
        self.zipcode = zipcode
        self.sleeplong, self.sleepshort = 2, 0.5

        message = 'ZipCodeSearch initialized for zip code {}'
        self.logger.info(message.format(self.zipcode))
Пример #11
0
def findremoved(listings):
    """Get a list of the listings that should be deleted.

    A listing is selected when Listing.isremoved() is true for it or when
    Listing.hasprice() is false. Each listing appears at most once in the
    result, even if both conditions hold.

    Arguments:
        listings: pymongo cursor with listings to check; every document is
            read for its 'content', '_id' and 'zipcode' keys (and 'link'
            when it is reported as removed)

    Returns:
        removed: list of listing documents selected for removal.
    """
    logger = get_configured_logger('DEBUG', __name__)
    removed = []
    count = listings.count()
    # Number listings from 1 so the progress lines read "1 of N" .. "N of N".
    for i, listing in enumerate(listings, start=1):
        logger.debug('Checking listing {} of {}.'.format(i, count))
        parsed = Listing(
            listing['content'], listing['_id'], listing['zipcode'])
        if parsed.isremoved():
            logger.debug('Listing {} of {} not there anymore.'.format(
                i, count))
            logger.debug('Adding {} to the removal list.'.format(
                listing['link']))
            removed.append(listing)
        elif not parsed.hasprice():
            # elif, not a second if: a removed listing with no price must
            # not be appended twice.
            removed.append(listing)
    return removed