def cleanlistings(listings):
    """Clean every listing in a cursor and collect the resulting units.

    Cleaning stops at the first listing whose clean-up raises: the failure
    is logged as a warning (with the listing's link) and whatever was
    gathered up to that point is returned.

    Arguments:
        listings: pymongo cursor containing all listings; each document is
            read for its 'content', '_id' and 'zipcode' keys (and 'link'
            when a failure is reported)

    Returns:
        attrs: set of boolean attributes found in all the listings
        units: list of Unit objects representing the cleaned listings
    """
    logger = get_configured_logger('DEBUG', __name__)
    units = []
    attrs = set()
    count = listings.count()
    logger.info('Cleaning {} listings.'.format(count))
    for i, listing in enumerate(listings):
        logger.info('Cleaning listing {} of {}.'.format(i + 1, count))
        parsed = Listing(listing['content'], listing['_id'],
                         listing['zipcode'])
        try:
            unit = parsed.clean()
            units.append(unit)
            # Fetch the attributes once and reuse them for logging and
            # for the running union.
            found = parsed.getattrs()
            logger.debug('Attributes: {}'.format(found))
            attrs.update(found)
        except Exception as e:
            # Logger.warn is a deprecated alias; warning() is the
            # supported spelling.
            logger.warning('Caught exception while cleaning.')
            logger.warning('Listing URL: {}'.format(listing['link']))
            logger.warning(e)
            # NOTE(review): this aborts the remaining listings rather than
            # skipping the bad one -- confirm that is intended.
            break
    return attrs, units
def main(argv):
    """Main entry-point for batchclean.

    When the first argument after the program name is 'remove', listings
    reported by findremoved are deleted from the database and the listings
    are queried again before cleaning. The cleaned units and the observed
    attributes are then written to mongo.

    Arguments:
        argv: list of command line arguments, program name first
    """
    remove = len(argv) > 1 and argv[1] == 'remove'

    logger = get_configured_logger('DEBUG', __name__)
    mongoclient = get_mongoclient()
    logger.info('Retrieved mongoclient.')

    listings = queryforlistings(mongoclient)
    found_message = 'Found {} listings.'.format(listings.count())
    logger.info(found_message)

    if remove:
        removed = findremoved(listings)
        removed_message = (
            '{} listings have been removed, deleting from DB.'.format(
                len(removed)))
        logger.info(removed_message)
        removelistings(mongoclient, removed)
        # Query again so the cleaning pass starts from a fresh cursor.
        listings = queryforlistings(mongoclient)

    attrs, units = cleanlistings(listings)
    attr_total = len(attrs)
    logger.info(
        'Observed {} unique attributes while cleaning.'.format(attr_total))
    unit_total = len(units)
    logger.info('Processed {} units'.format(unit_total))

    writeunitstomongo(mongoclient, units)
    writeattrstomongo(mongoclient, attrs)
def __init__(self, content, listing_id, zipcode):
    """Inits a Listing with the raw HTML content.

    The content is parsed with BeautifulSoup's 'html.parser' and an empty
    Unit is created for the given listing id and zip code.
    """
    # Plain inputs first.
    self.content = content
    self.listing_id = listing_id
    self.zipcode = zipcode
    self.attrset = set()

    # Derived helpers, built in the same order as before.
    self.logger = get_configured_logger('DEBUG', __name__)
    self.soup = BeautifulSoup(content, 'html.parser')
    self.unit = Unit(listing_id, zipcode)
def run_both(city, zipcodes, mongoclient):
    """Run the listing search and then the content scrape per zip code.

    Arguments:
        city: value passed to ZipCodeSearch together with each zip code
        zipcodes: iterable of zip codes to process, one after another
        mongoclient: client handed to both the search and the scraper
    """
    logger = get_configured_logger('DEBUG', __name__)
    for code in zipcodes:
        # Search step for this zip code.
        ZipCodeSearch(city, code, mongoclient).execute()
        logger.info('Compiled listings for zip code {}'.format(code))

        # Content step; a new scraper is built for every zip code.
        ContentScraper().execute(code, mongoclient)
        logger.info('Gathered listings for zip code {}'.format(code))
def scrape_content(zipcodes, mongoclient):
    """Scrape listing content for each zip code and write to mongo.

    Arguments:
        zipcodes: iterable of zip codes to scrape listings for
        mongoclient: MongoClient object in which to write results
    """
    logger = get_configured_logger('DEBUG', __name__)
    for code in zipcodes:
        # A new scraper is built for every zip code.
        ContentScraper().execute(code, mongoclient)
        logger.info('Gathered listings for zip code {}'.format(code))
def run_search(city, zipcodes, mongoclient):
    """Run a search for listings in each zip code and write to mongoDB.

    City is required as well because it is part of the base URL for
    searching.

    Arguments:
        city: str, the city to use in the URL
        zipcodes: iterable of zip codes to search in for listings
        mongoclient: MongoClient object in which to write results
    """
    logger = get_configured_logger('DEBUG', __name__)
    for code in zipcodes:
        ZipCodeSearch(city, code, mongoclient).execute()
        logger.info('Compiled listings for zip code {}'.format(code))
def removelistings(mongoclient, removed):
    """Delete the given listings from the scraper.listing collection.

    Arguments:
        mongoclient: database client for the database containing the
            listings.
        removed: list of listing documents to delete; each is matched by
            its '_id'
    """
    logger = get_configured_logger('DEBUG', __name__)
    total = len(removed)
    logger.info('Removing {} listings.'.format(total))

    collection = mongoclient.scraper.listing
    for position, doc in enumerate(removed, start=1):
        # Progress is logged just before the delete is issued.
        logger.debug(' {} of {} removed from DB.'.format(position, total))
        collection.delete_one({"_id": doc['_id']})
def __init__(self):
    """Init ContentScraper.

    proxy is set to value of HTTP_PROXY environment variable (a KeyError
    is raised when the variable is not set)
    logger is retrieved from get_configured_logger function
    mongoclient and zipcode start out as None
    """
    self.logger = get_configured_logger('DEBUG', __name__)
    self.mongoclient = None
    self.proxy = os.environ['HTTP_PROXY']
    # presumably delays in seconds between requests -- TODO confirm
    self.sleeplong = 2
    self.sleepshort = 0.5
    self.ua = UserAgent()
    self.zipcode = None
    # The zip code is still None here, so it is left out of the message,
    # and the message names this class instead of 'ListingScraper'.
    self.logger.info('ContentScraper initialized')
def __init__(self, city, state):
    """Inits ZipCodeRequest with city and state.

    apikey is read from the ZIP_KEY environment variable
    base is the hardcoded Zip Code API URL template
    form is hardcoded to 'json'
    """
    # Read the key first so a missing ZIP_KEY fails before anything else.
    self.apikey = os.environ['ZIP_KEY']

    self.city = city
    self.state = state
    self.form = 'json'
    self.base = 'https://www.zipcodeapi.com/rest/{}/city-zips.{}/{}/{}'

    self.logger = get_configured_logger('DEBUG', __name__)
    self.logger.info('ZipCodeRequest initialized')
def __init__(self, city, zipcode, mongoclient):
    """Init ZipCodeSearch object with city, zipcode, and mongoclient.

    base is read from the BASE_URL environment variable
    city is stored lower-cased
    proxy is read from the HTTP_PROXY environment variable
    logger is retrieved from get_configured_logger function
    """
    self.base = os.environ['BASE_URL']
    self.city = city.lower()
    self.zipcode = zipcode
    self.mongoclient = mongoclient

    self.logger = get_configured_logger('DEBUG', __name__)
    self.proxy = os.environ['HTTP_PROXY']
    self.ua = UserAgent()

    self.sleepshort = 0.5
    self.sleeplong = 2

    message = 'ZipCodeSearch initialized for zip code {}'.format(zipcode)
    self.logger.info(message)
def findremoved(listings):
    """Get a list of the listings that should be dropped from the DB.

    A listing is selected when Listing.isremoved() is true, or otherwise
    when Listing.hasprice() is false. Each listing appears at most once in
    the result, so the length of the result is the number of listings to
    delete.

    Arguments:
        listings: pymongo cursor with listings to check; each document is
            read for its 'content', '_id', 'zipcode' and 'link' keys.

    Returns:
        removed: list of listing documents selected for removal.
    """
    logger = get_configured_logger('DEBUG', __name__)
    removed = []
    count = listings.count()
    # Count from 1 so progress reads "1 of N" ... "N of N".
    for i, listing in enumerate(listings, start=1):
        logger.debug('Checking listing {} of {}.'.format(i, count))
        parsed = Listing(listing['content'], listing['_id'],
                         listing['zipcode'])
        if parsed.isremoved():
            logger.debug('Listing {} of {} not there anymore.'.format(
                i, count))
            logger.debug('Adding {} to the removal list.'.format(
                listing['link']))
            removed.append(listing)
        elif not parsed.hasprice():
            # elif: a listing already selected above must not be appended
            # a second time.
            removed.append(listing)
    return removed