from urllib.parse import urlparse  # used by Freebase.parse_out_domain below

# DB (the database connection wrapper) and unserialize (a deserializer for the
# serialized dump payloads, e.g. phpserialize.unserialize) are assumed to be
# provided elsewhere in this module.


class FreebaseDump:
    """ Functions to mine the Freebase dump """

    NUMBER_ROWS_TO_FETCH = 100
    UPPER_LIMIT_M_0 = 'm.1'
    UPPER_LIMIT_N_0 = 'n.1'
    UPPER_LIMIT_G_0 = 'g.1'

    def __init__(self, fb_id):
        self.fb_id = fb_id
        self.DB = DB().connect()

    def run_save_script(self, id_last, upper_limit):
        """ Run batch save script
            @return [str] last_id """
        fb_ids = self.get_batch_ids(id_last, upper_limit)
        if not fb_ids:
            return False

        for fb_id in fb_ids:
            single_data = self.get_single_data(fb_id)
            good_data = self.parse_out_wanted_data(single_data)
            if good_data:
                self.save_new_data(good_data)

        # return the last id of the batch so the caller can resume from it
        return fb_ids[-1]

    def get_single_data(self, fb_id):
        """ Get data from a single fb_id
            @return [tuple] """
        if not fb_id:
            return False
        sql = 'SELECT * FROM freebase_dump WHERE fb_id = %s LIMIT 1;'
        self.DB.execute(sql, (fb_id,))
        return self.DB.fetchone()

    def get_batch_ids(self, id_last, upper_limit):
        """ Get a batch of ids
            @return [list] """
        if not id_last:
            return False
        if not upper_limit:
            upper_limit = 'z'
        sql = ('SELECT fb_id FROM freebase_dump'
               ' WHERE fb_id > %s AND fb_id < %s'
               ' ORDER BY fb_id ASC LIMIT ' + str(self.NUMBER_ROWS_TO_FETCH) + ';')
        self.DB.execute(sql, (id_last, upper_limit))
        data = self.DB.fetchall()
        fb_ids = [d[0] for d in data]
        return fb_ids if fb_ids else None

    def parse_out_wanted_data(self, data):
        """ Parse out wanted data: company_name and domain
            company_name = "rdf-schema#label.en", "rdf-schema#label"
            website      = "common.topic.official_website"
            @return [dict] """
        if not data:
            return False
        if not isinstance(data, tuple):
            return False

        fb_id = data[0]
        wants = ('business', 'company', 'companies', 'corporation', 'corporated')
        if any(needle in data[1].lower() for needle in wants):
            try:
                udata = unserialize(data[1])
            except Exception:
                return False

            good_data = {}
            if 'rdf-schema#label.en' in udata:
                good_data['company_name'] = udata['rdf-schema#label.en']
            elif 'rdf-schema#label' in udata:
                good_data['company_name'] = udata['rdf-schema#label']
            if 'common.topic.official_website' in udata:
                good_data['website'] = udata['common.topic.official_website'][0]

            # only keep rows that actually carry a website
            if 'website' in good_data:
                good_data['fb_id'] = fb_id
                return good_data
        return False

    def save_new_data(self, data_dict):
        """ Saves new data
            @return [bool] """
        if not data_dict:
            return False
        if not isinstance(data_dict, dict):
            return False

        insert_sql = ('INSERT INTO freebase_companies'
                      ' (fbc_id, fbc_company_name, fbc_domain)'
                      ' VALUES (%s, %s, %s);')
        if all(key in data_dict for key in ('fb_id', 'company_name', 'website')):
            params = (data_dict['fb_id'], data_dict['company_name'], data_dict['website'])
            try:
                self.DB.execute(insert_sql, params)
                print(self.DB.mogrify(insert_sql, params))
                print(self.DB.statusmessage)
                return True
            except Exception:
                print('Failed insert, already exists: ' + data_dict['fb_id'])
        else:
            print('Failed insert, not enough data.')
            print(data_dict)
        return False
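
# Usage sketch (not part of the original module): a minimal driver for
# FreebaseDump.run_save_script that keeps going until a batch comes back empty.
# It assumes the freebase_dump table is loaded and DB/unserialize are configured
# as noted above; the start id 'm' and the UPPER_LIMIT_N_0 cutoff are
# illustrative defaults only.
def _save_all_companies(start_id='m', upper_limit=FreebaseDump.UPPER_LIMIT_N_0):
    dump = FreebaseDump(start_id)
    last_id = start_id
    while last_id:
        # run_save_script returns the last id of the batch, or False when done
        last_id = dump.run_save_script(last_id, upper_limit)
    return last_id
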
class Freebase:

    def __init__(self, fbc_id):
        self.fbc_id = fbc_id
        self.DB = DB().connect()

    def get_single_data(self, fbc_id):
        """ Get data from a single fbc_id
            @return [tuple] """
        if not fbc_id:
            return False
        sql = 'SELECT * FROM freebase_companies WHERE fbc_id = %s LIMIT 1;'
        self.DB.execute(sql, (fbc_id,))
        return self.DB.fetchone()

    def get_next_data(self, last_fbc_id):
        """ Get data from the fbc_id after current
            @return [tuple] """
        if not last_fbc_id:
            return False
        sql = 'SELECT * FROM freebase_companies WHERE fbc_id > %s ORDER BY fbc_id ASC LIMIT 1;'
        self.DB.execute(sql, (last_fbc_id,))
        return self.DB.fetchone()

    def parse_out_domain(self, full_url):
        """ Parse out http:// and return clean domain
            @return [str] """
        if not full_url:
            return False
        try:
            parsed_url = urlparse(full_url)
        except Exception:
            return False
        if parsed_url.netloc:
            # strip a leading "www." if present
            if parsed_url.netloc.startswith('www.'):
                return parsed_url.netloc[4:]
            return parsed_url.netloc
        return False

    def run_parse_all_domains(self, last_fbc_id):
        """ Loop through all fbc_ids and parse out domain with script:
                Fb = Freebase('')
                last_id = 'm'
                for x in range(1, 300000):
                    last_id = Fb.run_parse_all_domains(last_id)
            @return [str] last_fbc_id """
        if not last_fbc_id:
            return False

        update_sql = 'UPDATE freebase_companies SET fbc_domain = %s WHERE fbc_id = %s;'
        # rows are expected to be dict-like (e.g. a DictCursor) so they can be
        # indexed by column name
        data = self.get_next_data(last_fbc_id)
        if data:
            domain = self.parse_out_domain(data['fbc_full_url'])
            if domain:
                try:
                    # self.DB.execute(update_sql, (domain, data['fbc_id']))
                    print(self.DB.mogrify(update_sql, (domain, data['fbc_id'])))
                    print(self.DB.statusmessage)
                except Exception:
                    print('Failed insert, already exists: ' + data['fbc_id'])
            return data['fbc_id']
        return False

    def run_clean_all_domains(self, last_fbc_id):
        """ Loop through all fbc_ids and set domain = '' with script:
                Fb = Freebase('')
                last_id = '0'
                for x in range(1, 300000):
                    last_id = Fb.run_clean_all_domains(last_id)
            @return [str] last_fbc_id """
        if not last_fbc_id:
            return False

        update_sql = "UPDATE freebase_companies SET fbc_domain = '' WHERE fbc_id = %s;"
        data = self.get_next_data(last_fbc_id)
        if data and data['fbc_full_url']:
            try:
                parsed_url = urlparse(data['fbc_full_url'])
            except Exception:
                return data['fbc_id']
            # only blank out rows whose stored URL carries a path component
            if len(parsed_url.path.strip()) > 1:
                try:
                    # self.DB.execute(update_sql, (data['fbc_id'],))
                    print(self.DB.mogrify(update_sql, (data['fbc_id'],)))
                    print(self.DB.statusmessage)
                except Exception:
                    print(self.DB.mogrify(update_sql, (data['fbc_id'],)))
                    print('Failed update, for: ' + data['fbc_id'])
            return data['fbc_id']
        return False
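
# Usage sketch (not part of the original module): the docstrings above drive
# run_parse_all_domains / run_clean_all_domains with a fixed-size for loop;
# this variant loops until the method reports the end of the table by
# returning False. The start id 'm' is illustrative only, and the helper name
# is hypothetical.
def _parse_all_domains(start_id='m'):
    fb = Freebase(start_id)
    last_id = start_id
    while last_id:
        # returns the fbc_id just processed, or False when no rows remain
        last_id = fb.run_parse_all_domains(last_id)
    return last_id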