def __init__(self, db_name='fsqexp'):
    """Build the API client stack and open the MongoDB-backed cache.

    db_name -- database name handed to MongoDBCache (default 'fsqexp').
    """
    # NOTE(review): the meaning of 500 and 5000 is defined by APIGateway,
    # which is not visible here -- confirm before changing them.
    api_gateway = APIGateway(access_token, 500,
                             [client_id, client_secret], 5000)
    self.gateway = api_gateway
    self.wrapper = APIWrapper(api_gateway)
    # base request parameters kept on the instance
    self.params = dict(v=20140713)
    self.cache = MongoDBCache(db=db_name)
class VenueExtractor(object):
    """
    Class to dump a minimal view of every cached venue to 'min_venues.csv'
    """

    def __init__(self, db_name='fsqexp'):
        # access to the database
        self.cache = MongoDBCache(db=db_name)

    @staticmethod
    def _minimal_venue(v):
        """Reduce one venue document to the columns written to the CSV.

        Documents that carry a truthy 'response' key are unwrapped to
        v['response']['venue'] first. Missing or empty url / contact fields
        become "", missing categories become []. Text fields are UTF-8
        encoded because that is what the (Python 2) csv writer needs.
        """
        # just work with venue information instead of whole response
        if v.get('response'):
            v = v['response']['venue']

        def _utf8_or_blank(value):
            return value.encode('utf-8') if value else ""

        contact = v.get('contact') or {}
        return {
            'id': v['id'],
            'name': v['name'].encode('utf-8'),
            'url': _utf8_or_blank(v.get('url')),
            'contact-twitter': _utf8_or_blank(contact.get('twitter')),
            'contact-facebook': _utf8_or_blank(contact.get('facebook')),
            # only the category ids are kept
            'categories': [c['id'] for c in v.get('categories') or []],
        }

    def extract_venues(self):
        """Write one CSV row per document of the 'venues' collection.

        The output file 'min_venues.csv' is created in the current working
        directory and overwritten if it exists. Progress (the running row
        index) is printed every 1000 documents. Returns None.
        """
        # Iterate a cursor rather than the collection object itself; the
        # other users of get_collection() in this file go through .find().
        venues = self.cache.get_collection('venues').find()
        labels = ['name', 'id', 'url', 'contact-twitter',
                  'contact-facebook', 'categories']
        with open('min_venues.csv', 'w') as outfile:
            csv_writer = csv.DictWriter(outfile, labels)
            csv_writer.writeheader()
            for i, v in enumerate(venues):
                if i % 1000 == 0:
                    print(i)
                csv_writer.writerow(self._minimal_venue(v))
class VenueSearcher(object):
    """
    Runs venue searches through the API wrapper and caches the raw results
    """

    def __init__(self, db_name='fsqexp'):
        """Create the gateway, the wrapper around it and the cache.

        db_name -- database name handed to MongoDBCache (default 'fsqexp').
        """
        self.gateway = APIGateway(access_token, 500,
                                  [client_id, client_secret], 5000)
        self.wrapper = APIWrapper(self.gateway)
        self.params = {'v': 20140713}
        self.cache = MongoDBCache(db=db_name)

    def venue_has_chain_property(self, venue):
        """Return True only when venue['page']['user']['type'] == 'chain'.

        Any missing (or None) level along that path yields False.
        """
        page = venue.get('page')
        if page is None:
            return False
        user = page.get('user')
        if user is None:
            return False
        return user.get('type') == 'chain'

    def global_search(self, query, check_fresh=False):
        """Search venues globally for ``query``, going through the cache.

        query       -- search text sent as the 'query' parameter.
        check_fresh -- forwarded unchanged to the cache lookups.

        Returns the value stored under ['response']['venues'] of the cached
        or freshly fetched result. Returns None when the request fails with
        a urllib2 HTTPError/URLError or when the wrapper returns None.
        """
        params = {
            'v': self.params['v'],
            'intent': 'global',
            'limit': 50,
            'query': query,
        }
        lookup = {'params': params}

        if self.cache.document_exists('global_searches', lookup, check_fresh):
            cached = self.cache.get_document('global_searches', lookup,
                                             check_fresh)
            return cached['response']['venues']

        try:
            results = self.wrapper.query_routine('venues', 'search',
                                                 params, True)
        except (urllib2.HTTPError, urllib2.URLError) as err:
            # Best effort: a failed request must not abort the caller, but
            # it should not vanish without a trace either.
            import logging
            logging.getLogger(__name__).warning(
                "global search for %r failed: %s", query, err)
            return None

        if results is None:
            return None

        # remember which parameters produced this result so that the cache
        # lookup above can find it again
        results['params'] = params
        self.cache.put_document('global_searches', results)
        return results['response']['venues']
def __init__(self, db_name='fsqexp'):
    """Set up the API gateway, its wrapper, request defaults and the cache.

    db_name -- database name handed to MongoDBCache (default 'fsqexp').
    """
    credentials = [client_id, client_secret]
    # NOTE(review): 500 and 5000 are interpreted by APIGateway (not shown
    # here) -- confirm their meaning before tuning them.
    gateway = APIGateway(access_token, 500, credentials, 5000)
    wrapper = APIWrapper(gateway)
    self.gateway = gateway
    self.wrapper = wrapper
    self.params = {'v': 20140713}
    self.cache = MongoDBCache(db=db_name)
class VenueSearcher(object):
    """
    Runs venue searches through the API wrapper and caches the raw results
    """

    def __init__(self, db_name='fsqexp'):
        """Create the gateway, the wrapper around it and the cache.

        db_name -- database name handed to MongoDBCache (default 'fsqexp').
        """
        self.gateway = APIGateway(access_token, 500,
                                  [client_id, client_secret], 5000)
        self.wrapper = APIWrapper(self.gateway)
        self.params = {'v': 20140713}
        self.cache = MongoDBCache(db=db_name)

    def venue_has_chain_property(self, venue):
        """Tell whether the venue's page user is typed as a chain.

        Returns True only when venue['page']['user']['type'] == 'chain';
        a missing (or None) 'page', 'user' or 'type' yields False.
        """
        page = venue.get('page')
        user = page.get('user') if page is not None else None
        if user is None:
            return False
        return user.get('type') == 'chain'

    def global_search(self, query, check_fresh=False):
        """Search venues globally for ``query``, going through the cache.

        query       -- search text sent as the 'query' parameter.
        check_fresh -- forwarded unchanged to the cache lookups.

        Returns the value stored under ['response']['venues'] of the cached
        or freshly fetched result. Returns None when the request fails with
        a urllib2 HTTPError/URLError or when the wrapper returns None.
        """
        params = {
            'v': self.params['v'],
            'intent': 'global',
            'limit': 50,
            'query': query,
        }
        lookup = {'params': params}

        if self.cache.document_exists('global_searches', lookup, check_fresh):
            hit = self.cache.get_document('global_searches', lookup,
                                          check_fresh)
            return hit['response']['venues']

        try:
            results = self.wrapper.query_routine('venues', 'search',
                                                 params, True)
        except (urllib2.HTTPError, urllib2.URLError) as err:
            # Keep the best-effort contract (no exception escapes), but
            # leave a trace of the failure instead of discarding it.
            import logging
            logging.getLogger(__name__).warning(
                "global search for %r failed: %s", query, err)
            return None

        if results is None:
            return None

        # store the parameters alongside the result; the cache lookup above
        # queries on them
        results['params'] = params
        self.cache.put_document('global_searches', results)
        return results['response']['venues']
def __init__(self, db_name='fsqexp', required_chain_confidence=0.9,
             required_venue_confidence=0.95):
    """Prepare the cache, the venue CSV reader and the matching helpers.

    db_name                   -- database name used for the cache and the
                                 ChainManager (default 'fsqexp').
    required_chain_confidence -- threshold for adding a venue to a chain.
    required_venue_confidence -- threshold for matching two venues.
    """
    # thresholds: venue-to-venue match and venue-to-chain membership
    self.required_venue_confidence = required_venue_confidence
    self.required_chain_confidence = required_chain_confidence

    # access to the database
    self.cache = MongoDBCache(db=db_name)

    # venues are read from the CSV file; the handle stays open because the
    # reader is consumed lazily after construction
    venue_file = codecs.open('min_venues.csv', 'r', 'utf-8')
    self.csv_reader = csv.DictReader(venue_file)

    # chain operations and category tools
    self.cm = ChainManager(db_name=db_name)
    self.ct = CategoryTree()
class ChainManager:
    """
    ChainManager creates, extends, deletes, merges and loads chains that
    live in the cache.
    """

    def __init__(self, db_name='fsqexp'):
        self.cache = MongoDBCache(db=db_name)

    def _fill_and_save(self, chain, venues):
        """Add every venue to ``chain``, save it once, and hand it back."""
        for member in venues:
            chain.add_venue(member)
        chain.save()
        return chain

    def create_chain(self, venues):
        """Build a new CachedChain holding ``venues``, save and return it."""
        return self._fill_and_save(CachedChain(self.cache), venues)

    def add_to_chain(self, chain_id, venues):
        """Load the chain ``chain_id``, add ``venues``, save and return it."""
        return self._fill_and_save(self.load_chain(chain_id), venues)

    def delete_chain(self, chain):
        """Remove every venue from ``chain`` and drop its cache document."""
        # iterate over a snapshot: remove_venue is called while we loop
        for member in list(chain.venues):
            chain.remove_venue(member)
        self.cache.remove_document('chains', {"_id": chain.id})

    def merge_chains(self, chain1, chain2):
        """Replace two chains by one new chain containing all their venues."""
        # collect the venues before the chains are emptied by delete_chain
        combined = list(chain1.venues)
        combined.extend(chain2.venues)
        for old_chain in (chain1, chain2):
            self.delete_chain(old_chain)
        return self.create_chain(combined)

    def load_chain(self, chain_id):
        """Fetch the 'chains' document ``chain_id`` as a CachedChain."""
        document = self.cache.get_document('chains', {"_id": chain_id})
        loaded = CachedChain(self.cache)
        loaded._from_dict(document)
        return loaded
def __init__(self, db_name='fsqexp', required_chain_confidence=0.9,
             required_venue_confidence=0.95):
    """Wire up everything the matcher needs.

    db_name                   -- database name used for the cache and the
                                 ChainManager (default 'fsqexp').
    required_chain_confidence -- minimum confidence to put a venue in a
                                 chain.
    required_venue_confidence -- minimum confidence to match two venues.
    """
    # database access
    cache = MongoDBCache(db=db_name)
    self.cache = cache

    # CSV-backed venue source; left open on purpose, the reader is lazy
    source = codecs.open('min_venues.csv', 'r', 'utf-8')
    self.csv_reader = csv.DictReader(source)

    # helper objects: chain operations, category tools
    self.cm = ChainManager(db_name=db_name)
    self.ct = CategoryTree()

    # decision thresholds (venue-to-venue, venue-to-chain)
    self.required_venue_confidence = required_venue_confidence
    self.required_chain_confidence = required_chain_confidence
class LocalComparison(object):
    """
    Splits the alternates found around a venue into chain and independent
    venues
    """

    def __init__(self):
        self.vs = VenueSearcher()
        self.cd = ChainDecider()
        self.db = MongoDBCache(db='fsqexp')

    def get_venue_ids(self):
        """Return the 'id' of every document in the 'venues' collection.

        Documents carrying a truthy 'response' key are unwrapped to
        doc['response']['venue'] before the id is read.
        """
        venue_ids = []
        # get all the venues from the database
        # NOTE(review): timeout=False presumably disables the cursor
        # timeout (pymongo 2.x API) -- confirm against the driver in use.
        db_venues = self.db.get_collection('venues').find(timeout=False)
        for doc in db_venues:
            # just work with venue information instead of whole response
            if doc.get('response'):
                doc = doc['response']['venue']
            venue_ids.append(doc['id'])
        return venue_ids

    # NOTE(review): venue_response is defined outside this block; it
    # presumably normalises the ``venue`` argument -- confirm.
    @venue_response
    def local_comparison(self, venue, radius):
        """Classify the alternates of ``venue`` found within ``radius``.

        Each alternate returned by the searcher is resolved to its venue
        JSON; unresolved alternates and the venue itself are skipped.

        Returns a tuple (chain_alternates, indie_alternates): the venues
        for which the chain decider returned a chain id, and the rest.
        """
        chain_alternates = []
        indie_alternates = []

        for alternate in self.vs.search_alternates(venue, radius):
            v = self.vs.get_venue_json(alternate['id'])
            if v is None or v['id'] == venue['id']:
                continue
            if self.cd.is_chain(v) is not None:
                chain_alternates.append(v)
            else:
                indie_alternates.append(v)

        return chain_alternates, indie_alternates
class CacheChainMatcher(object):
    """
    Class to match venues to chains or other venues in a cache
    """

    def __init__(self, db_name='fsqexp', required_chain_confidence=0.9,
                 required_venue_confidence=0.95):
        """Prepare the cache, the venue CSV reader and the helpers.

        db_name                   -- database name used for the cache and
                                     the ChainManager.
        required_chain_confidence -- threshold for adding a venue to a chain.
        required_venue_confidence -- threshold for matching two venues.
        """
        # access to the database
        self.cache = MongoDBCache(db=db_name)
        # read venues from file; the handle stays open because the reader
        # is only consumed later, by do_matching()
        self.csv_reader = csv.DictReader(
            codecs.open('min_venues.csv', 'r', 'utf-8'))
        # ChainManager handles chain operations
        self.cm = ChainManager(db_name=db_name)
        # category tools
        self.ct = CategoryTree()
        # value we use to decide if two venues should be matched together
        self.required_venue_confidence = required_venue_confidence
        # value we use to decide if a venue should be part of a chain
        self.required_chain_confidence = required_chain_confidence

    @venue_response
    def check_chain_lookup(self, venue):
        """
        Checks for a venue lookup document to see if the venue has
        already been assigned to a chain. Returns the chain id or None.
        """
        lookup = {'_id': venue['id']}
        if self.cache.document_exists('chain_id_lookup', lookup):
            return self.cache.get_document('chain_id_lookup',
                                           lookup)['chain_id']
        return None

    @venue_response
    def check_existing_chains(self, venue):
        """
        Check all existing chains to see if this venue should be added to
        one of them. Returns the id of the chain it was added to, or None.
        """
        # get all existing chains
        chains = self.cache.get_collection('chains').find()
        # find the best match
        best_match, confidence = find_best_chain_match(venue, chains)
        if confidence >= self.required_chain_confidence:
            self.cm.add_to_chain(best_match['_id'], [venue])
            return best_match['_id']
        return None

    @venue_response
    def fuzzy_compare_to_cache(self, venue):
        """
        Compare ``venue`` with the venues that follow row ``self.i`` in
        'min_venues.csv' and group the matches into a chain.

        Returns the id of the chain that was created or extended, or None
        when no other venue matched (or, with several candidate chains,
        when no venue was confident enough to be added to any of them).
        """
        venue_matches = [venue]

        # look at all the other venues that haven't already been compared
        print("starting at %d" % self.i)
        # the file is reopened for every call, so make sure it is closed
        with codecs.open('min_venues.csv', 'r', 'utf-8') as venue_file:
            for count, csv_v in enumerate(csv.DictReader(venue_file)):
                v = get_min_venue_from_csv(csv_v)
                if count <= self.i or venue['id'] == v['id']:
                    continue
                # calculate match with this venue
                nd, um, sm, cm = calc_venue_match_confidence(venue, v)
                if sum([nd, um, sm, cm]) > self.required_venue_confidence:
                    venue_matches.append(v)

        # have we found any matches?
        if len(venue_matches) <= 1:
            return None

        # are any matches already in a chain?
        # Build a separate list of the unchained venues: removing items
        # from venue_matches while iterating it skipped the element that
        # followed every removal.
        chains = set()
        unchained = []
        for v in venue_matches:
            existing_id = self.check_chain_lookup(v)
            if existing_id is None:
                unchained.append(v)
            else:
                chains.add(existing_id)

        chain_id = None
        if not chains:
            # creating a new chain
            chain_id = self.cm.create_chain(unchained).id
        elif len(chains) == 1:
            # adding to an existing chain
            chain_id = next(iter(chains))
            self.cm.add_to_chain(chain_id, unchained)
        else:
            # find best match out of many chains
            candidate_chains = [
                self.cache.get_document('chains', {"_id": candidate_id})
                for candidate_id in chains
            ]
            for v in unchained:
                best, confidence = find_best_chain_match(v, candidate_chains)
                if confidence > self.required_chain_confidence:
                    chain_id = best['_id']
                    self.cm.add_to_chain(chain_id, [v])

        return chain_id

    def do_matching(self):
        """
        Walk the venue CSV once and try, for every venue, the chain lookup,
        then the existing chains, then the fuzzy comparison against the
        remaining venues. ``self.i`` tracks the current row index and is
        read by fuzzy_compare_to_cache().
        """
        self.i = 0
        for v in self.csv_reader:
            print(v)
            venue = get_min_venue_from_csv(v)
            print(self.i)

            # check if the venue is already in a chain
            chain_id = self.check_chain_lookup(venue)
            if chain_id is None:
                # compare the venue against existing chains
                chain_id = self.check_existing_chains(venue)
                if chain_id is None:
                    # check the rest of the venues in the cache
                    chain_id = self.fuzzy_compare_to_cache(venue)
            self.i += 1
class CacheChainMatcher(object):
    """
    Class to match venues to chains or other venues in a cache
    """

    def __init__(self, db_name='fsqexp', required_chain_confidence=0.9,
                 required_venue_confidence=0.95):
        """Prepare the cache, the venue CSV reader and the helpers.

        db_name                   -- database name used for the cache and
                                     the ChainManager.
        required_chain_confidence -- threshold for adding a venue to a chain.
        required_venue_confidence -- threshold for matching two venues.
        """
        # access to the database
        self.cache = MongoDBCache(db=db_name)
        # read venues from file; the handle is intentionally left open, the
        # reader is consumed lazily by do_matching()
        self.csv_reader = csv.DictReader(
            codecs.open('min_venues.csv', 'r', 'utf-8'))
        # ChainManager handles chain operations
        self.cm = ChainManager(db_name=db_name)
        # category tools
        self.ct = CategoryTree()
        # value we use to decide if two venues should be matched together
        self.required_venue_confidence = required_venue_confidence
        # value we use to decide if a venue should be part of a chain
        self.required_chain_confidence = required_chain_confidence

    @venue_response
    def check_chain_lookup(self, venue):
        """
        Checks for a venue lookup document to see if the venue has
        already been assigned to a chain. Returns the chain id or None.
        """
        lookup = {'_id': venue['id']}
        if self.cache.document_exists('chain_id_lookup', lookup):
            return self.cache.get_document('chain_id_lookup',
                                           lookup)['chain_id']
        return None

    @venue_response
    def check_existing_chains(self, venue):
        """
        Check all existing chains to see if this venue should be added to
        one of them. Returns the id of the chain it was added to, or None.
        """
        # get all existing chains
        chains = self.cache.get_collection('chains').find()
        # find the best match
        best_match, confidence = find_best_chain_match(venue, chains)
        if confidence >= self.required_chain_confidence:
            self.cm.add_to_chain(best_match['_id'], [venue])
            return best_match['_id']
        return None

    @venue_response
    def fuzzy_compare_to_cache(self, venue):
        """
        Compare ``venue`` with the venues that follow row ``self.i`` in
        'min_venues.csv' and group the matches into a chain.

        Returns the id of the chain that was created or extended, or None
        when no other venue matched (or, with several candidate chains,
        when no venue was confident enough to be added to any of them).
        """
        venue_matches = [venue]

        # look at all the other venues that haven't already been compared
        print("starting at %d" % self.i)
        # reopened on every call: close it deterministically
        with codecs.open('min_venues.csv', 'r', 'utf-8') as venue_file:
            for count, csv_v in enumerate(csv.DictReader(venue_file)):
                v = get_min_venue_from_csv(csv_v)
                if count <= self.i or venue['id'] == v['id']:
                    continue
                # calculate match with this venue
                nd, um, sm, cm = calc_venue_match_confidence(venue, v)
                if sum([nd, um, sm, cm]) > self.required_venue_confidence:
                    venue_matches.append(v)

        # have we found any matches?
        if len(venue_matches) <= 1:
            return None

        # are any matches already in a chain?
        # The venues are partitioned into a fresh list instead of being
        # removed from venue_matches during iteration, which made the loop
        # skip the venue following each removed one.
        chains = set()
        unchained = []
        for v in venue_matches:
            existing_id = self.check_chain_lookup(v)
            if existing_id is None:
                unchained.append(v)
            else:
                chains.add(existing_id)

        chain_id = None
        if not chains:
            # creating a new chain
            chain_id = self.cm.create_chain(unchained).id
        elif len(chains) == 1:
            # adding to an existing chain
            chain_id = next(iter(chains))
            self.cm.add_to_chain(chain_id, unchained)
        else:
            # find best match out of many chains
            candidate_chains = [
                self.cache.get_document('chains', {"_id": candidate_id})
                for candidate_id in chains
            ]
            for v in unchained:
                best, confidence = find_best_chain_match(v, candidate_chains)
                if confidence > self.required_chain_confidence:
                    chain_id = best['_id']
                    self.cm.add_to_chain(chain_id, [v])

        return chain_id

    def do_matching(self):
        """
        Walk the venue CSV once and try, for every venue, the chain lookup,
        then the existing chains, then the fuzzy comparison against the
        remaining venues. ``self.i`` tracks the current row index and is
        read by fuzzy_compare_to_cache().
        """
        self.i = 0
        for v in self.csv_reader:
            print(v)
            venue = get_min_venue_from_csv(v)
            print(self.i)

            # check if the venue is already in a chain
            chain_id = self.check_chain_lookup(venue)
            if chain_id is None:
                # compare the venue against existing chains
                chain_id = self.check_existing_chains(venue)
                if chain_id is None:
                    # check the rest of the venues in the cache
                    chain_id = self.fuzzy_compare_to_cache(venue)
            self.i += 1
def __init__(self, db_name='fsqexp'):
    """Open the MongoDB-backed cache used by this object.

    db_name -- database name handed to MongoDBCache (default 'fsqexp').
    """
    cache = MongoDBCache(db=db_name)
    self.cache = cache
def __init__(self):
    """Create the venue searcher, the chain decider and the database cache.

    The cache is always opened on the 'fsqexp' database.
    """
    searcher = VenueSearcher()
    decider = ChainDecider()
    self.vs, self.cd = searcher, decider
    self.db = MongoDBCache(db='fsqexp')
def __init__(self, db_name='fsqexp'):
    """Connect this object to its database cache.

    db_name -- database name handed to MongoDBCache (default 'fsqexp').
    """
    # the cache is the only access path to the database
    database_cache = MongoDBCache(db=db_name)
    self.cache = database_cache
class VenueExtractor(object):
    """
    Class to dump a minimal view of every cached venue to 'min_venues.csv'
    """

    def __init__(self, db_name='fsqexp'):
        # access to the database
        self.cache = MongoDBCache(db=db_name)

    @staticmethod
    def _minimal_venue(v):
        """Reduce one venue document to the columns written to the CSV.

        Documents that carry a truthy 'response' key are unwrapped to
        v['response']['venue'] first. Missing or empty url / contact fields
        become "", missing categories become []. Text fields are UTF-8
        encoded because that is what the (Python 2) csv writer needs.
        """
        # just work with venue information instead of whole response
        if v.get('response'):
            v = v['response']['venue']

        def _utf8_or_blank(value):
            return value.encode('utf-8') if value else ""

        contact = v.get('contact') or {}
        category_ids = [c['id'] for c in v.get('categories') or []]
        return {
            'id': v['id'],
            'name': v['name'].encode('utf-8'),
            'url': _utf8_or_blank(v.get('url')),
            'contact-twitter': _utf8_or_blank(contact.get('twitter')),
            'contact-facebook': _utf8_or_blank(contact.get('facebook')),
            'categories': category_ids,
        }

    def extract_venues(self):
        """Write one CSV row per document of the 'venues' collection.

        The output file 'min_venues.csv' is created in the current working
        directory and overwritten if it exists. Progress (the running row
        index) is printed every 1000 documents. Returns None.
        """
        # Go through a cursor, as the other users of get_collection() in
        # this file do, instead of iterating the collection object itself.
        venues = self.cache.get_collection('venues').find()
        labels = [
            'name', 'id', 'url', 'contact-twitter', 'contact-facebook',
            'categories'
        ]
        with open('min_venues.csv', 'w') as outfile:
            csv_writer = csv.DictWriter(outfile, labels)
            csv_writer.writeheader()
            for i, v in enumerate(venues):
                if i % 1000 == 0:
                    print(i)
                csv_writer.writerow(self._minimal_venue(v))