from urlparse import urlparse          # Python 2; on Python 3 use: from urllib.parse import urlparse
from phpserialize import unserialize   # assumed source of unserialize(); swap in your own deserializer
from db import DB                      # project-specific connection wrapper (import path assumed)


class FreebaseDump:
	""" Functions to mine the Freebase dump
	"""

	NUMBER_ROWS_TO_FETCH = 100   # batch size per run_save_script() call

	# Lexicographic upper bounds for walking the m.0*, n.0* and g.0* id namespaces
	UPPER_LIMIT_M_0 = 'm.1'
	UPPER_LIMIT_N_0 = 'n.1'
	UPPER_LIMIT_G_0 = 'g.1'

	def __init__(self, fb_id):
		self.fb_id = fb_id
		self.DB = DB().connect()

	def run_save_script(self, id_last, upper_limit):
		""" Run batch save script
		@return [str|bool] last fb_id of the batch, or False when the batch is empty
		"""
		fb_ids = self.get_batch_ids(id_last, upper_limit)
		if not fb_ids:
			return False

		for fb_id in fb_ids:
			single_data = self.get_single_data(fb_id)
			good_data = self.parse_out_wanted_data(single_data)
			if good_data:
				self.save_new_data(good_data)

		# last id of the batch; feed it back in as id_last on the next call
		return fb_ids[-1]

	def get_single_data(self, fb_id):
		""" Get data from a single fb_id
		@return [tuple]
		"""
		if not fb_id:
			return False

		sql = 'SELECT * FROM freebase_dump WHERE fb_id = %s LIMIT 1;'
		self.DB.execute(sql, (fb_id,))
		data = self.DB.fetchone()

		return data

	def get_batch_ids(self, id_last, upper_limit):
		""" Get a batch of ids
		@return [list]
		"""
		if not id_last:
			return False
		if not upper_limit:
			upper_limit = 'z'

		sql = 'SELECT fb_id FROM freebase_dump WHERE fb_id > %s AND fb_id < %s ORDER BY fb_id ASC LIMIT %s;'

		self.DB.execute(sql, (id_last, upper_limit, self.NUMBER_ROWS_TO_FETCH))

		data = self.DB.fetchall()
		fb_ids = [d[0] for d in data]

		return fb_ids or None

	def parse_out_wanted_data(self, data):
		""" Parse out wanted data: company_name and domain
		company_name = "rdf-schema#label.en", "rdf-schema#label"
		website = "common.topic.official_website"
		@return [dict]
		"""
		if not data:
			return False
		if not isinstance(data, tuple):
			return False

		fb_id = data[0]   # assumed row layout: (fb_id, serialized topic blob, ...)
		wants = ('business', 'company', 'companies', 'corporation', 'corporated')

		if not any(needle in data[1].lower() for needle in wants):
			return False

		try:
			udata = unserialize(data[1])
		except Exception:
			return False

		good_data = {}

		if 'rdf-schema#label.en' in udata:
			good_data['company_name'] = udata['rdf-schema#label.en']
		elif 'rdf-schema#label' in udata:
			good_data['company_name'] = udata['rdf-schema#label']

		if 'common.topic.official_website' in udata:
			good_data['website'] = udata['common.topic.official_website'][0]

		if 'website' in good_data:
			good_data['fb_id'] = fb_id
			return good_data
		return False
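
	# Illustrative shape of udata after unserialize(); the keys come from the
	# checks above, the values are hypothetical:
	#   {'rdf-schema#label.en': 'Acme Corp',
	#    'common.topic.official_website': ['http://www.acme.example/']}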

	def save_new_data(self, data_dict):
		""" Saves new data
		@return [bool]
		"""
		if not data_dict:
			return False
		if not isinstance(data_dict, dict):
			return False

		insert_sql = 'INSERT INTO freebase_companies (fbc_id, fbc_company_name, fbc_domain) VALUES (%s, %s, %s);'

		if all(key in data_dict for key in ('fb_id', 'company_name', 'website')):
			values = (data_dict['fb_id'], data_dict['company_name'], data_dict['website'])
			try:
				self.DB.execute(insert_sql, values)
				print(self.DB.mogrify(insert_sql, values))
				print(self.DB.statusmessage)
				return True
			except Exception:
				# most likely a duplicate key; note that with psycopg2 the
				# transaction needs a rollback before further statements run
				print('Failed insert, already exists: ' + data_dict['fb_id'])
		else:
			print('Failed insert, not enough data.')
			print(data_dict)

		return False
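
# Usage sketch (assumed; mirrors the driver loops shown in the Freebase class
# docstrings below): walk the m.0* id range in batches until it is exhausted.
if __name__ == '__main__':
	dump = FreebaseDump('')
	last_id = 'm.0'
	while last_id:
		last_id = dump.run_save_script(last_id, FreebaseDump.UPPER_LIMIT_M_0)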
class Freebase:

	def __init__(self, fbc_id):
		self.fbc_id = fbc_id
		self.DB = DB().connect()

	def get_single_data(self, fbc_id):
		""" Get data from a single fbc_id
		@return [dict] row keyed by column name (assumes a dict-style cursor, e.g. psycopg2 RealDictCursor)
		"""
		if not fbc_id:
			return False

		sql = 'SELECT * FROM freebase_companies WHERE fbc_id = %s LIMIT 1;'
		self.DB.execute(sql, (fbc_id,))
		data = self.DB.fetchone()

		return data

	def get_next_data(self, last_fbc_id):
		""" Get data from the fbc_id after current
		@return [dict] row keyed by column name (assumes a dict-style cursor)
		"""
		if not last_fbc_id:
			return False

		sql = 'SELECT * FROM freebase_companies WHERE fbc_id > %s ORDER BY fbc_id ASC LIMIT 1;'
		self.DB.execute(sql, (last_fbc_id,))
		data = self.DB.fetchone()

		return data

	def parse_out_domain(self, full_url):
		""" Parse out http:// and return clean domain
		@return [str]
		"""
		if not full_url:
			return False

		try:
			parsed_url = urlparse(full_url)
		except Exception:
			return False
		# URLs stored without a scheme parse with an empty netloc and fall through to False
		if parsed_url.netloc:
			if parsed_url.netloc.startswith('www.'):
				return parsed_url.netloc[4:]
			return parsed_url.netloc
		return False
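
	# Quick sanity check (hypothetical URLs):
	#   Freebase('').parse_out_domain('http://www.example.com/about')  # -> 'example.com'
	#   Freebase('').parse_out_domain('https://sub.example.org/')      # -> 'sub.example.org'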

	def run_parse_all_domains(self, last_fbc_id):
		""" Loop through all fbc_ids and parse out domain with script:
				Fb = Freebase('');
				last_id = 'm'
				for x in range(1,300000):
					last_id = Fb.run_parse_all_domains(last_id)

		@return [str] last_fbc_id
		"""
		if not last_fbc_id:
			return False

		update_sql = 'UPDATE freebase_companies SET fbc_domain = %s WHERE fbc_id = %s;'

		data = self.get_next_data(last_fbc_id)
		if data:
			domain = self.parse_out_domain(data['fbc_full_url'])
			if domain:
				try:
					# dry run: execute() stays commented out; mogrify() only renders the SQL
					# self.DB.execute(update_sql, (domain, data['fbc_id']))
					print(self.DB.mogrify(update_sql, (domain, data['fbc_id'])))
					print(self.DB.statusmessage)
				except Exception:
					print('Failed update for: ' + data['fbc_id'])
			return data['fbc_id']
		return False

	def run_clean_all_domains(self, last_fbc_id):
		""" Loop through all fbc_ids and set domain = '' with script:
				Fb = Freebase('');
				last_id = '0'
				for x in range(1,300000):
					last_id = Fb.run_clean_all_domains(last_id)

		@return [str] last_fbc_id
		"""
		if not last_fbc_id:
			return False

		update_sql = "UPDATE freebase_companies SET fbc_domain = '' WHERE fbc_id = %s;"

		data = self.get_next_data(last_fbc_id)

		if data and data['fbc_full_url']:
			try:
				parsed_url = urlparse(data['fbc_full_url'])
			except Exception:
				return data['fbc_id']
			# a path longer than a bare '/' means the URL points at a page, not a homepage
			if len(parsed_url.path.strip()) > 1:
				try:
					# dry run: execute() stays commented out; mogrify() only renders the SQL
					# self.DB.execute(update_sql, (data['fbc_id'],))
					print(self.DB.mogrify(update_sql, (data['fbc_id'],)))
					print(self.DB.statusmessage)
				except Exception:
					print(self.DB.mogrify(update_sql, (data['fbc_id'],)))
					print('Failed update for: ' + data['fbc_id'])

			return data['fbc_id']
		return False
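
# Assumed end-to-end flow for this class (a sketch; the docstrings above use
# fixed-size range() loops, while the form below stops once the table is done):
#   Fb = Freebase('')
#   last_id = 'm'
#   while last_id:
#       last_id = Fb.run_parse_all_domains(last_id)   # pass 1: fill fbc_domain
#   last_id = '0'
#   while last_id:
#       last_id = Fb.run_clean_all_domains(last_id)   # pass 2: blank rows whose URL has a path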