import inspect
import logging
import os
import random
from logging import getLogger

from googlemaps import Client

from mongotable.mongo_dict import MongoDict, COLLECTION


class DummyClass:
    """Anchor class used to locate this module's directory on disk."""
    pass


class GoogleMap:
    # The five counties that make up New York City.
    nyc = {
        'new york county', 'bronx county', 'kings county', 'queens county',
        'richmond county'
    }

    def __init__(self):
        self.keys = {}
        self.clients = []
        self.mongo = MongoDict()
        self.logger = getLogger("GoogleMap")
        getLogger('requests.packages.urllib3.connectionpool').setLevel(
            logging.ERROR)
        file = os.path.join(
            os.path.dirname(os.path.abspath(
                inspect.getsourcefile(DummyClass))), 'api.key')
        # Dynamically load "<app_name> <app_key>" pairs from api.key and
        # register one googlemaps client per key.
        with open(file) as f:
            for line in f:
                [app_name, app_key] = line.strip().split(" ")
                self.logger.info("Found api key:" + app_key)
                self.keys[app_name] = app_key
                self.clients.append(
                    Client(key=app_key, timeout=None, retry_timeout=40))
                self.logger.info("Registered api key:" + app_key)

    def get_client(self) -> Client:
        # Spread requests across the registered API keys at random.
        return random.choice(self.clients)

    def check_if_nyc(self, address) -> str:
        if address is None:
            return "None"
        return str(address.lower() in self.nyc)

    def geocode(self, address):
        # Cache-through lookup: only hit the Google API on a cache miss,
        # otherwise serve the cached result from MongoDB.
        if [COLLECTION.GEO_CACHE, address] not in self.mongo:
            value = self.get_client().geocode(address)
            self.mongo.put(COLLECTION.GEO_CACHE, address, value)
            return value
        else:
            return self.mongo.get(COLLECTION.GEO_CACHE, address)
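# Minimal usage sketch (assumes an api.key file of "<app_name> <app_key>"
# lines next to this module and a reachable MongoDB; the sample address is
# illustrative, not from this repo):
#
#   gm = GoogleMap()
#   hits = gm.geocode("350 5th Ave, New York, NY")  # cache miss: queries the API, stores in GEO_CACHE
#   hits = gm.geocode("350 5th Ave, New York, NY")  # cache hit: served from MongoDB
#   print(gm.check_if_nyc("New York County"))       # -> "True"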
from mongotable.mongo_dict import MongoDict, COLLECTION


class WaybackTimePipeline(object):
    def __init__(self):
        self.mongo_dict = MongoDict()

    def process_item(self, item: TimeItem, spider):
        self.spider = spider
        self.add_item_to_db(item)
        return item

    def add_item_to_db(self, item: TimeItem) -> None:
        # self.mongo_dict.insert_item(COLLECTION.TEST, item)
        # Store each catalog version once, keyed by its datetime.
        if [COLLECTION.OT_CATALOG,
                item['version_datetime']] not in self.mongo_dict:
            self.spider.logger.debug(item['version_datetime'] +
                                     " stored to DB")
            self.mongo_dict.put(COLLECTION.OT_CATALOG,
                                item['version_datetime'], dict(item))
        else:
            self.spider.logger.debug(item['version_datetime'] +
                                     " exists. Skipped")
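# Illustrative wiring (a sketch, not taken from this repo): Scrapy enables
# pipelines through ITEM_PIPELINES in settings.py; the module path below is
# an assumption about where this class lives.
#
#   ITEM_PIPELINES = {
#       'wayback.pipelines.WaybackTimePipeline': 300,
#   }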
import tablib

from mongotable.mongo_dict import MongoDict

mongo = MongoDict()
data = tablib.Dataset()
data.headers = [
    "camis", "place_id", 'formatted_address', 'lat', 'lng', 'county'
]
for entry in mongo.client["db"]['doh_geo'].find({}):
    # Skip entries whose geocode lookup returned no results.
    if len(entry['value']) >= 1:
        entry_dict = entry['value'][0]
    else:
        continue
    row = []
    row.append(entry["key"])
    row.append(entry_dict['place_id'])
    row.append(entry_dict['formatted_address'])
    row.append(entry_dict['geometry']['location']['lat'])
    row.append(entry_dict['geometry']['location']['lng'])
    for component in entry_dict['address_components']:
        if 'administrative_area_level_2' in component['types']:
            row.append(component['long_name'])
            break  # only one county column is expected
    if len(row) == 5:
        row.append("county not found")
    data.append(row)

print(data.csv)
# Output filename assumed by analogy with the catalog export below.
with open("doh_geo.csv", "w") as file:
    file.write(data.csv)
import tablib

from mongotable.mongo_dict import MongoDict, COLLECTION

mongo = MongoDict()
data = tablib.Dataset()
data.headers = [
    "version_datetime", "version_datetime_string", 'entry_number', 'url'
]
for entry in mongo.get_collection_iterator(COLLECTION.OT_CATALOG):
    entry_dict = entry['value']
    row = []
    for key in data.headers:
        row.append(entry_dict[key])
    data.append(row)

print(data.csv)
with open("ot_catalog.csv", "w") as file:
    file.write(data.csv)
    def __init__(self):
        self.mongo_dict = MongoDict()
        self.gm = GoogleMap()
from mongotable.mongo_dict import MongoDict, COLLECTION

mh = MongoDict()
test = COLLECTION.TEST
print([test, "doesnotexist"] in mh)  # membership test for a missing key
mh.put(test, "a", "value1")
mh.put(test, "a", "value2")
mh.put(test, "b", {"ad": 123})
mh.put(test, "list", [{"ad": 123}])
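# Retrieval sketch: get() mirrors put(), as used by GoogleMap.geocode.
# Whether a repeated put() overwrites or duplicates the first value is an
# assumption here; overwrite is what the upsert-style usage elsewhere
# suggests.
print([test, "a"] in mh)  # True after the puts above
print(mh.get(test, "a"))  # expected: "value2" if put() upserts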
from urllib.parse import urljoin

import pymongo
from scrapy import Request, Selector, Spider
from scrapy.exceptions import CloseSpider
from scrapy.http import Response

from mongotable.mongo_dict import MongoDict, COLLECTION


class OTRestaurantsSpider(Spider):
    name = 'ot_restaurants_spider.py'
    allowed_domains = ['web.archive.org']
    base_url = 'http://web.archive.org/'

    def __init__(self, **kwargs):
        super(OTRestaurantsSpider, self).__init__(name=None, **kwargs)
        self.limit = 0
        self.processed = 0
        self.mongo = MongoDict()
        self.gm = GoogleMap()

    def start_requests(self):
        limit = self.settings.get(
            'DO_FIRST') if self.settings.get('DO_FIRST') >= 1 else 9999999
        for entry in self.mongo.get_collection_iterator(
                COLLECTION.OT_CATALOG).sort('key',
                                            pymongo.ASCENDING).limit(limit):
            entry_dict = entry['value']
            request = Request(url=entry_dict['url'],
                              callback=self.parse_restaurant_page)
            request.meta['ot_catalog_key'] = entry['key'] + "_" + entry[
                'value']['entry_number']
            # if request.meta['ot_catalog_key'] in self.mongo.client[self.settings.get("OUTPUT_DB")].collection_names():
            #     self.logger.critical(entry['key'] + " skipped")
            #     continue
            # self.limit += 1
            # if self.limit >= self.settings.get('LIMIT_CATALOG'):
            #     return
            # Debug filter: only crawl this one catalog snapshot for now.
            if entry['key'] != "20110720053652":
                self.logger.debug(entry['key'] + " skipped")
                continue
            self.logger.critical(entry['key'] + " REQUESTED")
            yield request

    def parse_restaurant_page(self, response: Response):
        self.logger.debug(response.meta['ot_catalog_key'] + " is received")
        for request in self.try_parse(response):
            yield request

    def try_parse(self, response: Response):
        selector = Selector(response)
        # The archived pages come in two layouts; try the older row markup
        # first, then the newer "ResultRow" markup.
        data_rows = selector.xpath('//tr[@class = "a" or @class = "r"]')
        if len(data_rows) == 0:
            data_rows = selector.xpath('//tr[contains(@class, "ResultRow")]')
        if len(data_rows) == 0:
            self.logger.error(response.url + " no data!")
            raise CloseSpider("data row is empty: " + response.url)
        self.logger.debug("Found: " + str(len(data_rows)))
        for row in data_rows:
            yield self.try_parse_row(row, response)

    def try_parse_row(self, row: Selector, response: Response):
        item = OTItem()
        item['ot_catalog_key'] = response.meta['ot_catalog_key']
        # extract name
        item['name'] = row.xpath('.//a[@href]/text()').extract_first()
        # extract neighborhood
        neighborhood = row.xpath('.//div[@class="nn"]/text()').extract_first()
        if neighborhood is not None:
            item['neighborhood'] = neighborhood.strip()
        if 'neighborhood' not in item:
            neighborhood = row.xpath(
                './/div[@class="d"]/text()').extract_first()
            if neighborhood is not None:
                item['neighborhood'] = neighborhood.strip().split("|")[0]
        # extract type
        type_r = row.xpath('.//div[@class="nf"]/text()').extract_first()
        if type_r is not None:
            item['type'] = type_r.strip()
        if 'type' not in item:
            type_r = row.xpath('.//div[@class="d"]/text()').extract_first()
            if type_r is not None:
                item['type'] = type_r.strip().split("|")[1]
        # extract price (the number of "$" glyphs)
        price = row.xpath('.//td[@class="p"]/text()').extract_first()
        if price is not None:
            item["price"] = len(price)
        if 'price' not in item:
            price = row.xpath('.//td[@class="PrCol"]/text()').extract_first()
            if price is not None:
                item["price"] = len(price)
        # extract url
        url = row.xpath('.//a[@class="r"]/@href').extract_first()
        if url is not None:
            item['url'] = urljoin(response.url, url)
        if 'url' not in item:
            url = row.xpath('.//a[@href]/@href').extract_first()
            if url is not None:
                item['url'] = urljoin(response.url, url)
        # extract stars
        stars = row.xpath(
            './/div[@class="Ratings"]/div/@title').extract_first()
        if stars is not None:
            item['stars'] = [float(s) for s in stars.split()
                             if is_float(s)][0] if stars else -1
        else:
            item['stars'] = -1
        # extract reviews
        reviews = row.xpath(
            './/span[@class="reviews"]/preceding-sibling::text()'
        ).extract_first()
        if reviews is not None:
            item['reviews'] = int(reviews)
        else:
            item['reviews'] = -1
        # Follow the restaurant detail page to pick up address and geocode.
        request = Request(item['url'],
                          callback=self.extract_geo_fields,
                          dont_filter=True,
                          errback=self.err_yield_item)
        request.meta['item'] = item
        return request

    def err_yield_item(self, failure):
        # Scrapy calls errbacks with a Failure, not a Response; recover the
        # item from the originating request and emit it as-is.
        item = failure.request.meta['item']
        yield item

    def extract_geo_fields(self, response: Response):
        item = response.meta['item']
        selector = Selector(response)
        try:
            # The address markup changed across snapshots; fall through a
            # series of selectors until one matches.
            address = selector.xpath(
                '//li[@class="RestProfileAddressItem"]/text()').extract()
            if len(address) == 0:
                address = selector.xpath(
                    '//span[@id="RestSearch_lblFullAddress"]/text()').extract(
                    )
            if len(address) == 0:
                address = selector.xpath(
                    '//div[@class="RestProfileAddress"]/text()').extract()
                if len("".join(address).strip()) == 0:
                    address = ""
            if len(address) == 0:
                address = selector.xpath(
                    '//span[@id="ProfileOverview_lblAddressText"]/text()'
                ).extract()
            if len(address) == 0:
                address = selector.xpath(
                    '//span[@itemprop="streetAddress"]/text()').extract()
            if len(address) != 0:
                address = ",".join(
                    [str(line).strip().replace('\"', '') for line in address])
                item['address'] = address
            if len(address) == 0:
                raise KeyError
            # Clean up the address to remove things in brackets,
            # e.g. "714 Seventh Avenue (inside Renaissance Hotel)".
            start = item['address'].find('(')
            end = item['address'].find(')')
            if start != -1 and end != -1:
                item['address'] = item['address'][:start -
                                                  1] + item['address'][end +
                                                                       1:]
            # extract geocode
            item['geocode'] = self.gm.geocode(item['address'])
            if len(item['geocode']) == 0:
                self.logger.error("geocode empty: " + item['address'])
                raise KeyError
            item['geocode'] = item['geocode'][0]
            # extract county
            item['county'] = self.extract_county(
                item['geocode']['address_components'], item)
            # extract place_id
            item['place_id'] = item['geocode']['place_id']
            # set is_nyc
            item['is_nyc'] = self.gm.check_if_nyc(item['county'])
            item['extract_success'] = True
        except (KeyError, IndexError):
            item['extract_success'] = False
            # self.logger.error("Extract failed. Saved anyway: " + str(item))
        yield item

    def verify(self, item: OTItem, field: str, response: Response):
        if field not in item or item[field] is None:
            raise CloseSpider("extract field failed: " + field + " " +
                              response.url)
        else:
            self.logger.debug("Success: " + str(item[field]))

    def extract_county(self, geocode, item):
        for entry in geocode:
            if 'administrative_area_level_2' in entry['types']:
                return entry['long_name']
        self.logger.critical("County Not found: " + str(item))
# print(similar_score)
(similar_score, (dist, key)) = pq2.get()
value["name_similarity"] = similar_score
value["match_camis"] = key
value["match_distance"] = dist
value["match_name"] = client['db']['doh_raw'].find(
    {'key': key})[0]['value'].strip().split(",")[1].lower()
save_result(collection, ot_key, value)


def save_result(c, key, value):
    # Upsert the matched record into the output database.
    output_db[c].update({'key': key}, to_dict(key, value), upsert=True)


client = MongoDict().client
doh_dict = init_doh()
print(len(doh_dict))
db_name = "ot_db_try7_fulll_stepped"
db = client[db_name]
output_db = client[db_name + "_matched"]
for c in db.collection_names():
    # Debug filter: only process this one snapshot collection.
    if c != "20101206120344_1833":
        continue
import inspect
import os
from concurrent import futures

from mongotable.mongo_dict import MongoDict

OUTPUT_FILE = os.path.join(
    os.path.dirname(os.path.abspath(inspect.getsourcefile(DummyClass))),
    '3_address_dict.json')

if __name__ == "__main__":
    gm = GoogleMap()
    progress = 0
    query_counter = AtomicCounter()
    skipped_counter = AtomicCounter()
    total_counter = AtomicCounter()
    mongo_map = MongoDict()
    with open(
            "2_with_yearCol_Only_Useful_Col_DOHMH_New_York_City_Restaurant_Inspection_Results.csv",
            mode='r',
            encoding='utf-8') as originFile:
        with futures.ThreadPoolExecutor(max_workers=None) as executor:
            header = True
            for text in originFile:
                # Skip the CSV header row, then process each record.
                if header:
                    header = False
                    continue
                address = generate_address(text)
                camis = text.strip().split(",")[0]