def setAliasesIfNeeded(idstring, data=None, reset=False):
    """Return freebase aliases for *idstring*, using an on-disk pickle cache.

    idstring -- "<type>:<key>" where <type> is one of 'food', 'band',
        'bandcollection' or 'person'; <key> may itself contain ':'.
    data -- optional extra keyword arguments forwarded to fbu.fetch_type.
        Defaults to a fresh empty dict each call (a shared mutable default
        would leak state between calls); the caller's dict is never mutated.
    reset -- when True, refetch from freebase even if <key> is cached.

    Returns a dict: {'count': <number of aliases>,
                     'written': <True if the cache file was (re)written>,
                     'aliases': <list of aliases>}.
    """
    # Parse "<type>:<key>"; only the first ':' splits, the key keeps the rest.
    fb_type, key = idstring.split(':')[0], ':'.join(idstring.split(':')[1:])
    # Explicit error instead of `assert`, which is stripped under `python -O`.
    if fb_type not in ('food', 'band', 'bandcollection', 'person'):
        raise ValueError('unsupported idstring type: {0}'.format(fb_type))

    # One pickle file per type under econtains_data_root holds the cache.
    path = os.path.join(econtains_data_root, '{0}.pickle'.format(fb_type))
    if not os.path.isdir(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    if os.path.isfile(path):
        # Binary mode: pickle data is bytes; context manager closes the handle.
        with open(path, 'rb') as fh:
            content = pickle.load(fh)
    else:
        content = {}

    # Serve from the cache unless a refetch was explicitly requested.
    if not reset and key in content:
        return {'count': len(content[key]),
                'written': False,
                'aliases': content[key]}

    # Fetch via freebase; copy `data` so the caller's dict is left untouched.
    params = dict(data or {})
    params['name'] = key
    aliases = fbu.fetch_type(fb_type, **params)
    content[key] = aliases
    with open(path, 'wb') as fh:
        pickle.dump(content, fh)
    return {'count': len(aliases),
            'written': True,
            'aliases': content[key]}
def run(alltweets, freebase_type='band', **kwargs):
    """Return statistically significant tweet terms cross-checked against freebase.

    Tokenizes and counts *alltweets*, keeps tokens of length >= 5, fetches the
    alias list for *freebase_type* via fbu.fetch_type (kwargs are passed through
    mem.sr first), and returns the lowercased aliases that also appear among
    the frequent long tokens.

    alltweets -- iterable/corpus of tweets accepted by tokenize().
    freebase_type -- freebase type name handed to fbu.fetch_type.
    kwargs -- extra arguments for fbu.fetch_type, filtered through mem.sr.

    Returns a list of lowercased matched alias strings.
    """
    counts = count(tokenize(alltweets))
    # Only consider reasonably long tokens; short ones are too noisy to match.
    long_keys = set(k for k in counts.keys() if len(k) >= 5)
    freebase_aliases = fbu.fetch_type(freebase_type, **mem.sr(kwargs))
    matched = [a.lower() for a in freebase_aliases if a.lower() in long_keys]
    # Was `raise Exception()` — a debug leftover that threw away the result.
    return matched