Пример #1
0
    def __init__(self, db_name='fsqexp'):
        """
        Build the API gateway and its wrapper, store the base request
        parameters and create the cache for database ``db_name``.
        """
        gateway = APIGateway(access_token, 500,
                             [client_id, client_secret], 5000)
        self.gateway = gateway
        self.wrapper = APIWrapper(gateway)

        # base request parameters kept on the instance
        self.params = dict(v=20140713)

        self.cache = MongoDBCache(db=db_name)
Пример #2
0
class VenueExtractor():
    """
    Class to dump a minimal view of every venue in the cache to the file
    'min_venues.csv'
    """
    def __init__(self, db_name='fsqexp'):

        # access to the database
        self.cache = MongoDBCache(db=db_name)

    def _encode(self, value):
        """
        Return `value` encoded as UTF-8, or "" when it is missing or empty.
        """
        return value.encode('utf-8') if value else ""

    def _minimal_venue(self, v):
        """
        Reduce a venue document to the columns written by extract_venues.

        Missing url/contact entries become "" and missing categories
        become an empty list; 'id' and 'name' are required.
        """
        # a document may hold the whole response; work on the venue only
        if v.get('response'):
            v = v['response']['venue']

        contact = v.get('contact') or {}
        return {
            'id': v['id'],
            'name': v['name'].encode('utf-8'),
            'url': self._encode(v.get('url')),
            'contact-twitter': self._encode(contact.get('twitter')),
            'contact-facebook': self._encode(contact.get('facebook')),
            # only the category ids are kept
            'categories': [c['id'] for c in v.get('categories') or []],
        }

    def extract_venues(self):
        """
        Write one row per venue of the 'venues' collection to
        'min_venues.csv' (overwriting it), printing the running count
        every 1000 venues.
        """
        # the collection has to be queried; the other users of
        # get_collection() in this code base call find() on it as well
        venues = self.cache.get_collection('venues').find()
        labels = ['name', 'id', 'url', 'contact-twitter', 'contact-facebook', 'categories']

        with open('min_venues.csv', 'w') as outfile:
            csv_writer = csv.DictWriter(outfile, labels)
            csv_writer.writeheader()

            for i, v in enumerate(venues):
                # progress indicator
                if i % 1000 == 0:
                    print(i)
                csv_writer.writerow(self._minimal_venue(v))
Пример #3
0
class VenueSearcher:
    """
    Runs venue searches through the API wrapper and keeps the responses
    in the cache so repeated queries do not hit the API again.
    """

    def __init__(self, db_name='fsqexp'):

        self.gateway = APIGateway(access_token, 500, [client_id, client_secret], 5000)
        self.wrapper = APIWrapper(self.gateway)

        # 'v' is copied into the parameters of every search
        self.params = {
            'v' : 20140713
        }

        self.cache = MongoDBCache(db=db_name)

    def venue_has_chain_property(self, venue):
        """
        Return True when venue['page']['user']['type'] is 'chain'.

        Any missing (or None) level of that path yields False.
        """
        page = venue.get('page')
        if page is None:
            return False
        user = page.get('user')
        if user is None:
            return False
        return user.get('type') == 'chain'

    def global_search(self, query, check_fresh=False):
        """
        Search venues matching `query` with intent 'global' (limit 50).

        A cached response for the same parameters is used when the cache
        reports one (`check_fresh` is handed to the cache unchanged);
        otherwise the API is queried and the response is cached.

        Returns the list found under response/venues, or None when the
        request fails with an HTTP/URL error or yields no result.
        """
        params = {
            'v': self.params['v'],
            'intent': 'global',
            'limit': 50,
            'query': query,
        }

        if self.cache.document_exists('global_searches', {'params': params}, check_fresh):
            results = self.cache.get_document('global_searches', {'params': params}, check_fresh)
            return results['response']['venues']

        try:
            results = self.wrapper.query_routine('venues', 'search', params, True)
            # nothing came back: nothing to cache and nothing to index into
            if results is None:
                return None
            # remember the parameters so the cache lookup above can match
            results['params'] = params
            self.cache.put_document('global_searches', results)
            return results['response']['venues']
        except (urllib2.HTTPError, urllib2.URLError):
            # best effort: a failed request simply yields no result
            return None
Пример #4
0
    def __init__(self, db_name='fsqexp'):
        """
        Build the API gateway and its wrapper, store the base request
        parameters and create the cache for database ``db_name``.
        """
        credentials = [client_id, client_secret]
        api_gateway = APIGateway(access_token, 500, credentials, 5000)
        self.gateway = api_gateway
        self.wrapper = APIWrapper(api_gateway)

        # base request parameters kept on the instance
        self.params = dict(v=20140713)

        self.cache = MongoDBCache(db=db_name)
Пример #5
0
class VenueSearcher:
    """
    Runs venue searches through the API wrapper and keeps the responses
    in the cache so repeated queries do not hit the API again.
    """
    def __init__(self, db_name='fsqexp'):

        self.gateway = APIGateway(access_token, 500,
                                  [client_id, client_secret], 5000)
        self.wrapper = APIWrapper(self.gateway)

        # 'v' is copied into the parameters of every search
        self.params = {'v': 20140713}

        self.cache = MongoDBCache(db=db_name)

    def venue_has_chain_property(self, venue):
        """
        Return True when venue['page']['user']['type'] is 'chain'.

        Any missing (or None) level of that path yields False.
        """
        page = venue.get('page')
        if page is None:
            return False
        user = page.get('user')
        if user is None:
            return False
        return user.get('type') == 'chain'

    def global_search(self, query, check_fresh=False):
        """
        Search venues matching `query` with intent 'global' (limit 50).

        A cached response for the same parameters is used when the cache
        reports one (`check_fresh` is handed to the cache unchanged);
        otherwise the API is queried and the response is cached.

        Returns the list found under response/venues, or None when the
        request fails with an HTTP/URL error or yields no result.
        """
        params = {
            'v': self.params['v'],
            'intent': 'global',
            'limit': 50,
            'query': query,
        }

        if self.cache.document_exists('global_searches', {'params': params},
                                      check_fresh):
            results = self.cache.get_document('global_searches',
                                              {'params': params}, check_fresh)
            return results['response']['venues']

        try:
            results = self.wrapper.query_routine('venues', 'search',
                                                 params, True)
            # nothing came back: nothing to cache and nothing to index into
            if results is None:
                return None
            # remember the parameters so the cache lookup above can match
            results['params'] = params
            self.cache.put_document('global_searches', results)
            return results['response']['venues']
        except (urllib2.HTTPError, urllib2.URLError):
            # best effort: a failed request simply yields no result
            return None
    def __init__(self, db_name='fsqexp', required_chain_confidence=0.9,
                 required_venue_confidence=0.95):
        """
        Open the cache and the venue file and store the two thresholds.

        db_name -- database name handed to MongoDBCache and ChainManager
        required_chain_confidence -- threshold for adding a venue to a chain
        required_venue_confidence -- threshold for matching two venues
        """
        # database access
        self.cache = MongoDBCache(db=db_name)

        # venues come from the CSV file, decoded as UTF-8
        venue_file = codecs.open('min_venues.csv', 'r', 'utf-8')
        self.csv_reader = csv.DictReader(venue_file)

        # chain operations are delegated to the ChainManager
        self.cm = ChainManager(db_name=db_name)
        # category tools
        self.ct = CategoryTree()

        # thresholds: venue-to-venue match, then venue-to-chain membership
        self.required_venue_confidence = required_venue_confidence
        self.required_chain_confidence = required_chain_confidence
Пример #7
0
class ChainManager:
    """
    Creates, extends, deletes, merges and loads the chains kept in the
    cache.
    """
    def __init__(self, db_name='fsqexp'):
        self.cache = MongoDBCache(db=db_name)

    def _fill_and_save(self, chain, venues):
        """Add every venue to `chain`, save the chain and return it."""
        for member in venues:
            chain.add_venue(member)
        chain.save()
        return chain

    def create_chain(self, venues):
        """Return a new, saved chain holding `venues`."""
        return self._fill_and_save(CachedChain(self.cache), venues)

    def add_to_chain(self, chain_id, venues):
        """Load chain `chain_id`, add `venues` to it, save and return it."""
        return self._fill_and_save(self.load_chain(chain_id), venues)

    def delete_chain(self, chain):
        """Remove every venue from `chain`, then drop its document."""
        # iterate over a copy: removing venues may change chain.venues
        for member in list(chain.venues):
            chain.remove_venue(member)
        self.cache.remove_document('chains', {"_id": chain.id})

    def merge_chains(self, chain1, chain2):
        """Replace both chains by one new chain holding all their venues."""
        # collect the venues first, deleting empties the chains
        combined = chain1.venues[:] + chain2.venues[:]
        for obsolete in (chain1, chain2):
            self.delete_chain(obsolete)
        return self.create_chain(combined)

    def load_chain(self, chain_id):
        """Return the chain whose document has ``_id == chain_id``."""
        stored = self.cache.get_document('chains', {"_id": chain_id})
        loaded = CachedChain(self.cache)
        loaded._from_dict(stored)
        return loaded
    def __init__(self,
                 db_name='fsqexp',
                 required_chain_confidence=0.9,
                 required_venue_confidence=0.95):
        """
        Open the cache and the venue file and store the two thresholds.

        db_name -- database name handed to MongoDBCache and ChainManager
        required_chain_confidence -- threshold for adding a venue to a chain
        required_venue_confidence -- threshold for matching two venues
        """
        # database access
        self.cache = MongoDBCache(db=db_name)

        # venues come from the CSV file, decoded as UTF-8
        venue_file = codecs.open('min_venues.csv', 'r', 'utf-8')
        self.csv_reader = csv.DictReader(venue_file)

        # chain operations are delegated to the ChainManager
        self.cm = ChainManager(db_name=db_name)
        # category tools
        self.ct = CategoryTree()

        # thresholds: venue-to-venue match, then venue-to-chain membership
        self.required_venue_confidence = required_venue_confidence
        self.required_chain_confidence = required_chain_confidence
Пример #9
0
class LocalComparison():
    """
    Splits the alternates found for a venue into chain venues and
    independent venues.
    """

    def __init__(self):

        self.vs = VenueSearcher()
        self.cd = ChainDecider()
        self.db = MongoDBCache(db='fsqexp')

    def get_venue_ids(self):
        """
        Return the ids of all documents in the 'venues' collection.
        """
        # get all the venues from the database
        db_venues = self.db.get_collection('venues').find(timeout=False)

        venue_ids = []
        for v in db_venues:
            # just work with venue information instead of whole response
            if v.get('response'):
                v = v['response']['venue']
            venue_ids.append(v['id'])
        return venue_ids

    @venue_response
    def local_comparison(self, venue, radius):
        """
        Classify the alternates of `venue` found within `radius`.

        Returns a tuple (chain_alternates, indie_alternates) of venue
        documents; the venue itself and alternates whose document cannot
        be fetched are left out.
        """
        chain_alternates = []
        indie_alternates = []

        # NOTE(review): search_alternates is not visible here; a None
        # result (e.g. after a failed request) is treated as "no alternates"
        alternates = self.vs.search_alternates(venue, radius) or []
        for alternate in alternates:
            v = self.vs.get_venue_json(alternate['id'])
            if v is None or v['id'] == venue['id']:
                continue
            # is_chain yields a chain id, or None for an independent venue
            if self.cd.is_chain(v) is not None:
                chain_alternates.append(v)
            else:
                indie_alternates.append(v)

        return chain_alternates, indie_alternates
Пример #10
0
class LocalComparison():
    """
    Splits the alternates found for a venue into chain venues and
    independent venues.
    """
    def __init__(self):

        self.vs = VenueSearcher()
        self.cd = ChainDecider()
        self.db = MongoDBCache(db='fsqexp')

    def get_venue_ids(self):
        """
        Return the ids of all documents in the 'venues' collection.
        """
        # get all the venues from the database
        db_venues = self.db.get_collection('venues').find(timeout=False)

        venue_ids = []
        for v in db_venues:
            # just work with venue information instead of whole response
            if v.get('response'):
                v = v['response']['venue']
            venue_ids.append(v['id'])
        return venue_ids

    @venue_response
    def local_comparison(self, venue, radius):
        """
        Classify the alternates of `venue` found within `radius`.

        Returns a tuple (chain_alternates, indie_alternates) of venue
        documents; the venue itself and alternates whose document cannot
        be fetched are left out.
        """
        chain_alternates = []
        indie_alternates = []

        # NOTE(review): search_alternates is not visible here; a None
        # result (e.g. after a failed request) is treated as "no alternates"
        alternates = self.vs.search_alternates(venue, radius) or []
        for alternate in alternates:
            v = self.vs.get_venue_json(alternate['id'])
            if v is None or v['id'] == venue['id']:
                continue
            # is_chain yields a chain id, or None for an independent venue
            if self.cd.is_chain(v) is not None:
                chain_alternates.append(v)
            else:
                indie_alternates.append(v)

        return chain_alternates, indie_alternates
class CacheChainMatcher():
    """
    Class to match venues to chains or other venues in a cache
    """
    def __init__(self,
                 db_name='fsqexp',
                 required_chain_confidence=0.9,
                 required_venue_confidence=0.95):

        # access to the database
        self.cache = MongoDBCache(db=db_name)

        # read venues from file; this reader is consumed by do_matching
        self.csv_reader = csv.DictReader(
            codecs.open('min_venues.csv', 'r', 'utf-8'))

        # ChainManager handles chain operations
        self.cm = ChainManager(db_name=db_name)
        # category tools
        self.ct = CategoryTree()

        # value we use to decide if two venues should be matched together
        self.required_venue_confidence = required_venue_confidence
        # value we use to decide if a venue should be part of a chain
        self.required_chain_confidence = required_chain_confidence

    @venue_response
    def check_chain_lookup(self, venue):
        """
        Checks for a venue lookup document to see if the venue has already
        been assigned to a chain

        Returns the stored chain id, or None when there is no lookup
        document for the venue.
        """
        lookup = {'_id': venue['id']}
        if not self.cache.document_exists('chain_id_lookup', lookup):
            return None
        return self.cache.get_document('chain_id_lookup', lookup)['chain_id']

    @venue_response
    def check_existing_chains(self, venue):
        """
        Check all existing chains to see if this venue should be added to
        one of them

        Returns the id of the chain the venue was added to, or None when
        the best match stays below required_chain_confidence.
        """
        # get all existing chains
        chains = self.cache.get_collection('chains').find()
        # find the best match
        best_match, confidence = find_best_chain_match(venue, chains)

        if confidence < self.required_chain_confidence:
            return None

        self.cm.add_to_chain(best_match['_id'], [venue])
        return best_match['_id']

    @venue_response
    def fuzzy_compare_to_cache(self, venue):
        """
        Compare `venue` with the venues that follow it in 'min_venues.csv'
        and put it into a chain together with its matches.

        Relies on self.i (set by do_matching) being the row index of
        `venue`; rows up to and including that index are skipped.

        Returns the id of the chain that was created or extended, or None
        when no other venue matched or no chain was confident enough.
        """
        venue_matches = [venue]

        print("starting at %d" % self.i)

        # look at all the other venues that haven't already been compared;
        # the file is re-read on every call, so close it when done
        with codecs.open('min_venues.csv', 'r', 'utf-8') as venue_file:
            for count, csv_v in enumerate(csv.DictReader(venue_file)):
                v = get_min_venue_from_csv(csv_v)

                if count <= self.i or venue['id'] == v['id']:
                    continue

                # calculate match with this venue
                nd, um, sm, cm = calc_venue_match_confidence(venue, v)
                if sum([nd, um, sm, cm]) > self.required_venue_confidence:
                    venue_matches.append(v)

        # have we found any matches?
        if len(venue_matches) <= 1:
            return None

        # are any matches already in a chain?  Build a new list instead of
        # removing from venue_matches while iterating over it, which would
        # skip the venue following every removed one.
        chains = set()
        unassigned = []
        for v in venue_matches:
            existing_chain_id = self.check_chain_lookup(v)
            if existing_chain_id is None:
                unassigned.append(v)
            else:
                chains.add(existing_chain_id)

        # stays None unless a chain is actually created or extended below
        chain_id = None

        # creating a new chain
        if len(chains) == 0:
            chain_id = self.cm.create_chain(unassigned).id
        # adding to an existing chain
        elif len(chains) == 1:
            chain_id = list(chains)[0]
            self.cm.add_to_chain(chain_id, unassigned)
        # find best match out of many chains
        else:
            candidate_chains = [
                self.cache.get_document('chains', {"_id": candidate})
                for candidate in chains
            ]
            for v in unassigned:
                chain, confidence = find_best_chain_match(v, candidate_chains)
                if confidence > self.required_chain_confidence:
                    chain_id = chain['_id']
                    self.cm.add_to_chain(chain_id, [v])
        return chain_id

    def do_matching(self):
        """
        Try to assign every venue read from self.csv_reader to a chain.

        For each venue the checks run in this order, stopping at the first
        one that yields a chain id: existing lookup document, existing
        chains, fuzzy comparison with the remaining venues of the file.
        """
        # row index of the venue being processed; read by
        # fuzzy_compare_to_cache to skip rows that were already handled
        self.i = 0
        for v in self.csv_reader:

            print(v)

            venue = get_min_venue_from_csv(v)

            print(self.i)

            # check if the venue is already in a chain
            chain_id = self.check_chain_lookup(venue)
            if chain_id is None:
                # compare the venue against existing chains
                chain_id = self.check_existing_chains(venue)
            if chain_id is None:
                # check the rest of the venues in the cache
                chain_id = self.fuzzy_compare_to_cache(venue)
            self.i += 1
class CacheChainMatcher():
    """
    Class to match venues to chains or other venues in a cache
    """
    def __init__(self, db_name='fsqexp', required_chain_confidence=0.9, required_venue_confidence=0.95):

        # access to the database
        self.cache = MongoDBCache(db=db_name)

        # read venues from file; this reader is consumed by do_matching
        self.csv_reader = csv.DictReader(codecs.open('min_venues.csv', 'r', 'utf-8'))

        # ChainManager handles chain operations
        self.cm = ChainManager(db_name=db_name)
        # category tools
        self.ct = CategoryTree()

        # value we use to decide if two venues should be matched together
        self.required_venue_confidence = required_venue_confidence
        # value we use to decide if a venue should be part of a chain
        self.required_chain_confidence = required_chain_confidence

    @venue_response
    def check_chain_lookup(self, venue):
        """
        Checks for a venue lookup document to see if the venue has already
        been assigned to a chain

        Returns the stored chain id, or None when there is no lookup
        document for the venue.
        """
        lookup = {'_id': venue['id']}
        if not self.cache.document_exists('chain_id_lookup', lookup):
            return None
        return self.cache.get_document('chain_id_lookup', lookup)['chain_id']

    @venue_response
    def check_existing_chains(self, venue):
        """
        Check all existing chains to see if this venue should be added to
        one of them

        Returns the id of the chain the venue was added to, or None when
        the best match stays below required_chain_confidence.
        """
        # get all existing chains
        chains = self.cache.get_collection('chains').find()
        # find the best match
        best_match, confidence = find_best_chain_match(venue, chains)

        if confidence < self.required_chain_confidence:
            return None

        self.cm.add_to_chain(best_match['_id'], [venue])
        return best_match['_id']

    @venue_response
    def fuzzy_compare_to_cache(self, venue):
        """
        Compare `venue` with the venues that follow it in 'min_venues.csv'
        and put it into a chain together with its matches.

        Relies on self.i (set by do_matching) being the row index of
        `venue`; rows up to and including that index are skipped.

        Returns the id of the chain that was created or extended, or None
        when no other venue matched or no chain was confident enough.
        """
        venue_matches = [venue]

        print("starting at %d" % self.i)

        # look at all the other venues that haven't already been compared;
        # the file is re-read on every call, so close it when done
        with codecs.open('min_venues.csv', 'r', 'utf-8') as venue_file:
            for count, csv_v in enumerate(csv.DictReader(venue_file)):
                v = get_min_venue_from_csv(csv_v)

                if count <= self.i or venue['id'] == v['id']:
                    continue

                # calculate match with this venue
                nd, um, sm, cm = calc_venue_match_confidence(venue, v)
                if sum([nd, um, sm, cm]) > self.required_venue_confidence:
                    venue_matches.append(v)

        # have we found any matches?
        if len(venue_matches) <= 1:
            return None

        # are any matches already in a chain?  Build a new list instead of
        # removing from venue_matches while iterating over it, which would
        # skip the venue following every removed one.
        chains = set()
        unassigned = []
        for v in venue_matches:
            existing_chain_id = self.check_chain_lookup(v)
            if existing_chain_id is None:
                unassigned.append(v)
            else:
                chains.add(existing_chain_id)

        # stays None unless a chain is actually created or extended below
        chain_id = None

        # creating a new chain
        if len(chains) == 0:
            chain_id = self.cm.create_chain(unassigned).id
        # adding to an existing chain
        elif len(chains) == 1:
            chain_id = list(chains)[0]
            self.cm.add_to_chain(chain_id, unassigned)
        # find best match out of many chains
        else:
            candidate_chains = [self.cache.get_document('chains', {"_id": candidate}) for candidate in chains]
            for v in unassigned:
                chain, confidence = find_best_chain_match(v, candidate_chains)
                if confidence > self.required_chain_confidence:
                    chain_id = chain['_id']
                    self.cm.add_to_chain(chain_id, [v])
        return chain_id

    def do_matching(self):
        """
        Try to assign every venue read from self.csv_reader to a chain.

        For each venue the checks run in this order, stopping at the first
        one that yields a chain id: existing lookup document, existing
        chains, fuzzy comparison with the remaining venues of the file.
        """
        # row index of the venue being processed; read by
        # fuzzy_compare_to_cache to skip rows that were already handled
        self.i = 0
        for v in self.csv_reader:

            print(v)

            venue = get_min_venue_from_csv(v)

            print(self.i)

            # check if the venue is already in a chain
            chain_id = self.check_chain_lookup(venue)
            if chain_id is None:
                # compare the venue against existing chains
                chain_id = self.check_existing_chains(venue)
            if chain_id is None:
                # check the rest of the venues in the cache
                chain_id = self.fuzzy_compare_to_cache(venue)
            self.i += 1
Пример #13
0
    def __init__(self, db_name='fsqexp'):
        """Create the ``MongoDBCache`` for database ``db_name``."""
        mongo_cache = MongoDBCache(db=db_name)
        self.cache = mongo_cache
Пример #14
0
    def __init__(self):
        """
        Create the collaborators: a venue searcher, a chain decider and
        the cache for database 'fsqexp'.
        """
        searcher = VenueSearcher()
        decider = ChainDecider()
        database = MongoDBCache(db='fsqexp')

        self.vs, self.cd, self.db = searcher, decider, database
Пример #15
0
    def __init__(self, db_name='fsqexp'):
        """Create the ``MongoDBCache`` for database ``db_name``."""
        # all database access goes through this cache object
        mongo_cache = MongoDBCache(db=db_name)
        self.cache = mongo_cache
Пример #16
0
class VenueExtractor():
    """
    Class to dump a minimal view of every venue in the cache to the file
    'min_venues.csv'
    """
    def __init__(self, db_name='fsqexp'):

        # access to the database
        self.cache = MongoDBCache(db=db_name)

    def _encode(self, value):
        """
        Return `value` encoded as UTF-8, or "" when it is missing or empty.
        """
        return value.encode('utf-8') if value else ""

    def _minimal_venue(self, v):
        """
        Reduce a venue document to the columns written by extract_venues.

        Missing url/contact entries become "" and missing categories
        become an empty list; 'id' and 'name' are required.
        """
        # a document may hold the whole response; work on the venue only
        if v.get('response'):
            v = v['response']['venue']

        contact = v.get('contact') or {}
        return {
            'id': v['id'],
            'name': v['name'].encode('utf-8'),
            'url': self._encode(v.get('url')),
            'contact-twitter': self._encode(contact.get('twitter')),
            'contact-facebook': self._encode(contact.get('facebook')),
            # only the category ids are kept
            'categories': [c['id'] for c in v.get('categories') or []],
        }

    def extract_venues(self):
        """
        Write one row per venue of the 'venues' collection to
        'min_venues.csv' (overwriting it), printing the running count
        every 1000 venues.
        """
        # the collection has to be queried; the other users of
        # get_collection() in this code base call find() on it as well
        venues = self.cache.get_collection('venues').find()
        labels = [
            'name', 'id', 'url', 'contact-twitter', 'contact-facebook',
            'categories'
        ]

        with open('min_venues.csv', 'w') as outfile:
            csv_writer = csv.DictWriter(outfile, labels)
            csv_writer.writeheader()

            for i, v in enumerate(venues):
                # progress indicator
                if i % 1000 == 0:
                    print(i)
                csv_writer.writerow(self._minimal_venue(v))
Пример #17
0
    def __init__(self, db_name='fsqexp'):
        """Create the ``MongoDBCache`` for database ``db_name``."""
        # all database access goes through this cache object
        mongo_cache = MongoDBCache(db=db_name)
        self.cache = mongo_cache
Пример #18
0
    def __init__(self):
        """
        Create the collaborators: a venue searcher, a chain decider and
        the cache for database 'fsqexp'.
        """
        searcher = VenueSearcher()
        decider = ChainDecider()
        database = MongoDBCache(db='fsqexp')

        self.vs, self.cd, self.db = searcher, decider, database