Exemplo n.º 1
0
def insert(db, table, data=None, quiet=False):
	"""Insert rows into `table` and return the most recently created rows.

	Parameters:
		db: connection object; only `db.cursor()` is used here.
		table: table name, interpolated into the SQL text as-is.
		data: list of dicts mapping column name to value, or None.  Keys are
			lower-cased and values passed through `stringify_if_unicode`;
			only keys also present in `get_columns(table)` are written.
		quiet: when false, both SQL statements are printed before they run.

	Returns:
		`lodify()` applied to a `SELECT ... ORDER BY created_at DESC` on
		`table`, limited to the number of rows this call inserted.

	When `data` is None, or no record holds a truthy value, one
	`INSERT INTO <table> VALUES ();` statement is issued instead.
	"""
	# `data` defaults to None, so only normalise it when it was supplied;
	# iterating it unconditionally raised a TypeError for the default call.
	if data is not None:
		# Make `data` keys lowercase
		data = [
			dict((k.lower(), stringify_if_unicode(row[k])) for k in row)
			for row in data
		]

	# NOTE(review): a record whose values are all falsy (0, '', False) is
	# treated as blank by this test, not only None values -- confirm intended.
	if data is None or not any(any(record.values()) for record in data):
		query = "INSERT INTO {} VALUES ();".format(table)
		# Exactly one row is written on this path, so read back exactly one.
		inserted = 1
	else:
		data_columns = set()
		for row in data:
			data_columns.update(row)
		# Only write columns that exist both in the data and in the table.
		columns = list(data_columns & get_columns(table))

		# NOTE(review): values are rendered with Python's tuple repr rather
		# than driver-side parameters, so quoting follows Python's rules and
		# the textual replacements below also touch matching text in values.
		values = []
		for row in data:
			valrow = [
				row[key] if row.get(key) is not None else 'NULL'
				for key in columns
			]
			values.append(str(tuple(valrow)))

		if len(data) > 1:
			query = 'INSERT INTO {} ( `{}` ) VALUES \n\t{};'
			joiner = ',\n\t'
		else:
			query = 'INSERT INTO {} ( `{}` ) VALUES {};'
			joiner = ', '
		query = query.format(table, '`, `'.join(columns), joiner.join(values))
		# Turn the quoted placeholder into a real SQL NULL.
		query = query.replace("'NULL'", "NULL")
		# A one-element tuple reprs as "(x,)"; drop the trailing comma.
		query = query.replace(",)", ")")
		inserted = len(data)

	lastin = 'SELECT * FROM {} ORDER BY created_at DESC LIMIT {};'.format(table, inserted)
	if not quiet:
		# Single-argument print(...) behaves the same as the print statement.
		print(query)
		print(lastin)

	MainQuery = db.cursor()
	try:
		MainQuery.execute(query)
	finally:
		MainQuery.close()

	LastIn = db.cursor()
	try:
		LastIn.execute(lastin)
		# Read the rows before the cursor is closed.
		return lodify(LastIn)
	finally:
		LastIn.close()
 def __init__(self, db, name):
     """Store the connection and table name, then load the table description.

     Runs `DESCRIBE <name>;` on a cursor from `db` (printing the statement)
     and stores the rows, converted by `lodify`, in `self.columns`.
     """
     self.db = db
     self.name = name
     cursor = self.db.cursor()
     query = "DESCRIBE {table};".format(table=self.name)
     print(query)
     try:
         cursor.execute(query)
         # Read the rows *before* closing: a closed DB-API cursor is not
         # guaranteed to remain readable.
         self.columns = lodify(cursor)
     finally:
         cursor.close()
Exemplo n.º 3
0
def select_or_insert(db, table, **kwery):
	"""Return one row of `table` matching `kwery`, inserting it if absent.

	Parameters:
		db: connection object; only `db.cursor()` is used here.
		table: table name, interpolated into the SQL text as-is.
		**kwery: column=value criteria, rendered with `stringify`.  The
			reserved keyword `quiet` (default False) is removed from the
			criteria and suppresses the diagnostic printing.

	Returns:
		`lodify(cursor)` for the matching row when the SELECT reports a
		non-zero `rowcount`; otherwise the result of
		`insert(db, table, [kwery], quiet=quiet)`.
	"""
	quiet = kwery.pop('quiet', False)

	# Criteria must be combined with AND; a comma-separated list is not a
	# valid WHERE clause once there is more than one criterion.
	conditions = ' AND '.join(
		'`' + column + '`=' + stringify(value)
		for column, value in kwery.items()
	)
	if conditions:
		query = 'SELECT * from {} WHERE {} LIMIT 1;'.format(table, conditions)
	else:
		# No criteria: omit the WHERE keyword instead of emitting "WHERE  LIMIT".
		query = 'SELECT * from {} LIMIT 1;'.format(table)
	if not quiet:
		print(query)

	cursor = db.cursor()
	try:
		cursor.execute(query)
		# Read the rows before the cursor is closed.
		found = lodify(cursor) if cursor.rowcount else None
	finally:
		cursor.close()

	if found is not None:
		if not quiet:
			print('yes')
		return found
	if not quiet:
		print('no')
	return insert(db, table, [kwery], quiet=quiet)
 def insertlod(self, lod):
     """Insert a List Of Dicts into table and return a List Of Dicts.

     Delegates to `self.InsertLOD(lod)`, converts the returned cursor with
     `lodify`, filters the rows through `remove_access`, and closes the
     cursor before returning.
     """
     handle = self.InsertLOD(lod)
     rows = lodify(handle)
     cleaned = remove_access(rows)
     handle.close()
     return cleaned
 def sim(self, field, values):
     """Run `self.SIM(field, values)` and return its rows.

     The cursor returned by `SIM` is converted with `lodify` and passed
     through `remove_access`; the cursor is then closed and
     `self.closeall()` is called before the rows are returned.
     """
     handle = self.SIM(field, values)
     rows = lodify(handle)
     cleaned = remove_access(rows)
     handle.close()
     self.closeall()
     return cleaned
 def select_or_insert(self, **data):
     """Return the first row produced by `self.SelectOrInsert` for `data`.

     `data` is passed as the `mand` argument with an empty `opt`.  The
     cursor is converted with `lodify`, passed through `remove_access`,
     and the first row is returned.

     Raises:
         IndexError: if the cursor produced no rows.
     """
     cursor = self.SelectOrInsert(mand=data, opt={})
     try:
         return remove_access(lodify(cursor))[0]
     finally:
         # Close the cursor even when the result is empty and `[0]` raises;
         # previously that path left the cursor open.
         cursor.close()
 def select(self, **kwery):
     """Run `self.Select(**kwery)` and return its rows.

     The returned cursor is converted with `lodify`, passed through
     `remove_access`, and closed before the rows are returned.
     """
     handle = self.Select(**kwery)
     rows = lodify(handle)
     cleaned = remove_access(rows)
     handle.close()
     return cleaned
def mine(url, cid, regex=r'^.*$', wid=None, quiet=False):
	"""Fetch `url`, record an observation for it and register its links.

	Parameters:
		url: address handed to `get_soup` and `get_links`.
		cid: crawl id written to `Webpage.newCID` and `Observation.CID`.
		regex: only links for which `re.match(regex, link)` succeeds are kept.
		wid: `Webpage.WID` of `url`.  When None it is obtained through
			`Webpage.SelectOrInsert`; otherwise that row's `newCID` is
			updated to `cid`.
		quiet: when true, the SQL statements are not printed.

	Returns:
		Whatever `get_soup(url)` returned.  When that value is falsy nothing
		else is done.
	"""
	soup = get_soup(url)
	if not soup:
		return soup

	if wid is None:
		widCursor = Webpage.SelectOrInsert(mand={'url':url}, opt={'newCID':cid})
		# `lodify` gives a list of row dicts, so take the first row before
		# reading its WID (indexing the list with 'WID' raises TypeError).
		wid = lodify(widCursor)[0]['WID']
		widCursor.close()
	else:
		SetNewCID = db.cursor()
		query = 'UPDATE Webpage SET newCID = {cid} WHERE WID = {wid};'
		query = query.format(cid=cid, wid=wid)
		if not quiet:
			# Single-argument print(...) behaves the same as the print statement.
			print(query)
		SetNewCID.execute(query)
		SetNewCID.close()

	# Record the mined data
	oid = Observation.insert1(WID=wid, CID=cid)['OID']
	parse_text(soup.text, oid)

	# NOTE: `breakpoint` is a helper defined elsewhere in this code base.
	breakpoint(cid)
	consolidate_all_webpages()
	breakpoint(cid)

	# Record that this link has been mined
	Update = db.cursor()
	query = 'UPDATE Webpage SET mined=True WHERE wid IN ({});'.format(str(wid))
	if not quiet:
		print(query)
	Update.execute(query)
	Update.close()

	# Keep each matching link once, then escape double quotes.
	matching = set(
		link for link in get_links(soup, url) if re.match(regex, link)
	)
	links = [ link.replace('"','%22') for link in matching ]

	if links:
		Pages = Webpage.SIM('url', links)
		pages = remove_access(lodify(Pages))
		Pages.close()

		SetNewCID = db.cursor()
		query = 'UPDATE Webpage SET newCID={newCID} WHERE access=TRUE;'
		query = query.format(newCID=cid)
		SetNewCID.execute(query)
		# This cursor was previously left open.
		SetNewCID.close()

		Webpage.closeall()
		Link.insertlod([ {'fromWID':wid,'toWID':row['WID']} for row in pages ])
		breakpoint(cid)

	return soup