def checkRecord(self, dbt_name, md5='', id=0):
    """Report whether a row identified by ``md5`` or ``id`` exists in a table.

    Exactly one key has to be supplied: a non-empty ``md5`` string or a
    positive ``id``.

    Args:
        dbt_name: name of the table to search.  It is spliced into the SQL
            text as-is (identifiers cannot be bound as parameters), so it
            must come from trusted configuration.
        md5: value matched against the ``md5`` column ('' means "not given").
        id: value matched against the ``id`` column (0 means "not given").

    Returns:
        True when at least one matching row is found, otherwise False.

    Raises:
        Exception: when no usable key is given, or when both keys are given.
    """
    has_md5 = md5 != ''
    has_id = id > 0

    if has_md5 and has_id:
        # only one keyword
        raise Exception("Only one key is needed")
    if not has_md5 and not has_id:
        # A keyword should be provided.  A negative id with no md5 also
        # lands here; previously it left the WHERE clause undefined and
        # failed later with an UnboundLocalError.
        raise Exception("You do not provide a key")

    # Bind the key value as a query parameter rather than concatenating it
    # into the SQL text.
    if has_md5:
        where_clause, params = " WHERE md5=%s", [md5]
    else:
        where_clause, params = " WHERE id=%s", [id]

    # The lock is released automatically, even if the query raises.
    with self.update_doc_lock:
        cursor = connection.cursor()
        cursor.execute("SELECT * FROM " + dbt_name + where_clause, params)
        rows = cursorutils.dictfetchall(cursor)

    return bool(rows)
def save_doc(self, r, data, pid):
    """Insert a database record for a document that is not stored yet.

    Only the lookup-and-insert part of this routine is visible in this
    file: it looks the document up by ``r.md5`` and, when no row exists,
    inserts one.  Nothing is done here for a document that already has a
    row, and ``data`` is not touched.

    Args:
        r: crawl result; the attributes read here are ``url``, ``md5``,
            ``host``, ``content_sha1``, ``crawl_date`` and ``batch``.
        data: document payload (unused in the visible part).
        pid: id of the parent URL row, or None when there is no parent.

    Raises:
        SystemExit: when the INSERT fails with a TypeError or a MySQL
            OperationalError (the failing statement is printed first).
    """
    # save db first to get the id
    # NOTE(review): these two flags are initialised but never read in the
    # visible part of the routine -- presumably consumed by code that is
    # not shown here; confirm before removing.
    db_entry_updated = False
    # save a new copy of file only if it is new or an updated version
    file_updated = True

    # Hold the lock for the whole check-then-insert sequence; ``with``
    # guarantees the release that the original ``try`` never reached.
    with self.update_doc_lock:
        cursor = connection.cursor()
        # Bind the md5 as a parameter instead of splicing it into the SQL.
        cursor.execute(
            "SELECT * FROM " + runconfig.dbt_document + " WHERE md5=%s",
            [r.md5])
        rows = cursorutils.dictfetchall(cursor)

        # Pass a real None (not the string 'None') when there is no parent.
        parent_idstr = str(pid) if str(pid) != 'None' else None

        # insert a new record
        if not rows:
            dbquery = (
                "INSERT INTO " + runconfig.dbt_document +
                "(url,md5,host,rev_host,content_sha1,discover_date,"
                "update_date,parent_id,submission_id,state)"
                " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            dbquerypar = (
                r.url,
                r.md5,
                r.host,
                r.host[::-1],  # reversed host string
                r.content_sha1,
                str(r.crawl_date),
                str(r.crawl_date),
                parent_idstr,
                str(r.batch),
                '0',
            )
            try:
                cursor.execute(dbquery, dbquerypar)
            except TypeError as e:
                print('output.py. TypeError. dbquery =  ' +
                      (dbquery % dbquerypar))
                raise SystemExit(e)
            except _mysql_exceptions.OperationalError as e:
                print('output.py. MySQL Operationl Error. dbquery =  ' +
                      (dbquery % dbquerypar))
                raise SystemExit(e)
def checkRecord(self, dbt_name, md5='', id=0):
    """Check whether ``dbt_name`` holds a row for the given key.

    The row is identified either by a non-empty ``md5`` or by a positive
    ``id``; supplying both, or neither, is an error.

    Args:
        dbt_name: table name.  Inserted verbatim into the statement because
            SQL identifiers cannot be passed as bound parameters; callers
            must supply a trusted value.
        md5: md5 key, '' when not used.
        id: numeric key, 0 when not used.

    Returns:
        bool: True if the SELECT yields any row, False otherwise.

    Raises:
        Exception: if both keys are supplied or no usable key is supplied.
    """
    # check which key to use
    use_md5 = (md5 != '')
    use_id = (id > 0)

    if use_md5 and use_id:
        raise Exception("Only one key is needed")
    if not (use_md5 or use_id):
        # Also covers a negative id without md5, which used to slip past
        # every branch and crash with an UnboundLocalError further down.
        raise Exception("You do not provide a key")

    column = 'md5' if use_md5 else 'id'
    value = md5 if use_md5 else id
    # The column name is one of two literals; the value is bound by the
    # driver instead of being concatenated into the statement.
    dbquery = "SELECT * FROM " + dbt_name + " WHERE " + column + "=%s"

    with self.update_doc_lock:  # released on every exit path
        cursor = connection.cursor()
        cursor.execute(dbquery, [value])
        rows = cursorutils.dictfetchall(cursor)

    if rows:
        return True
    return False
def save_doc(self, r, data, pid):
    """Create the document row for ``r`` when its md5 is not in the table.

    The part of this routine that is visible in this file stops after the
    INSERT of a new record: an existing row is left untouched and ``data``
    is not used.

    Args:
        r: crawl result providing ``url``, ``md5``, ``host``,
            ``content_sha1``, ``crawl_date`` and ``batch``.
        data: document payload (not referenced in the visible part).
        pid: parent URL row id, or None for a resource without a parent.

    Raises:
        SystemExit: if the INSERT raises TypeError or
            ``_mysql_exceptions.OperationalError``; the offending statement
            is printed before exiting.
    """
    # save db first to get the id
    # NOTE(review): neither flag below is read in the visible part of the
    # routine -- presumably later code uses them; verify before deleting.
    db_entry_updated = False
    # save a new copy of file only if it is new or an updated version
    file_updated = True

    table = runconfig.dbt_document
    insert_columns = (
        "url,md5,host,rev_host,content_sha1,discover_date,"
        "update_date,parent_id,submission_id,state"
    )

    # The original opened a ``try`` that acquired the lock but had no
    # matching release; the context manager always releases it.
    with self.update_doc_lock:
        cursor = connection.cursor()
        # md5 is bound as a parameter, not concatenated into the SQL text.
        cursor.execute("SELECT * FROM " + table + " WHERE md5=%s", [r.md5])
        rows = cursorutils.dictfetchall(cursor)
        if rows:
            # Existing record: nothing to do in the visible part.
            return

        # Hand the driver a genuine None when there is no parent id,
        # never the string 'None'.
        parent_idstr = str(pid) if str(pid) != 'None' else None
        crawl_date = str(r.crawl_date)

        dbquery = ("INSERT INTO " + table + "(" + insert_columns + ")"
                   " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        dbquerypar = (r.url, r.md5, r.host, r.host[::-1], r.content_sha1,
                      crawl_date, crawl_date, parent_idstr, str(r.batch), '0')
        try:
            cursor.execute(dbquery, dbquerypar)
        except TypeError as e:
            print('output.py. TypeError. dbquery =  ' +
                  (dbquery % dbquerypar))
            raise SystemExit(e)
        except _mysql_exceptions.OperationalError as e:
            print('output.py. MySQL Operationl Error. dbquery =  ' +
                  (dbquery % dbquerypar))
            raise SystemExit(e)
def dbcheck(self, r):
    """Look a document up by ``r.md5`` and remember its id.

    Args:
        r: object whose ``md5`` attribute identifies the document.

    Returns:
        False when no row matches.  True when a row matches; in that case
        the ``id`` of the first matching row is stored in ``self.docid``.
        (The original fell through and returned None on a match, which is
        just as falsy as the "not found" result.)
    """
    with self.update_doc_lock:  # released even if the query raises
        cursor = connection.cursor()
        # Bind the md5 as a parameter instead of splicing it into the SQL.
        cursor.execute(
            "SELECT * FROM " + runconfig.dbt_document + " WHERE md5=%s",
            [r.md5])
        rows = cursorutils.dictfetchall(cursor)
        if not rows:
            return False
        self.docid = rows[0]['id']
    return True
def dbcheck(self, r):
    """Check whether the document table has a row for ``r.md5``.

    On a hit, the ``id`` of the first matching row is saved in
    ``self.docid``.

    Args:
        r: object carrying the ``md5`` of the document to look up.

    Returns:
        bool: True if a row was found, False otherwise.  Previously a hit
        produced an implicit None, indistinguishable by truthiness from the
        False returned on a miss.
    """
    dbquery = "SELECT * FROM " + runconfig.dbt_document + " WHERE md5=%s"

    found = False
    with self.update_doc_lock:  # always released on exit
        cursor = connection.cursor()
        # The md5 value is bound by the driver, not concatenated.
        cursor.execute(dbquery, [r.md5])
        rows = cursorutils.dictfetchall(cursor)
        if rows:
            self.docid = rows[0]['id']
            found = True
    return found
def queryDocID(self, dbquery):
    """Run ``dbquery`` and return the ``id`` field of every fetched row.

    Args:
        dbquery: complete SQL statement, executed as given.

    Returns:
        A list with one ``row['id']`` entry per fetched row; an empty list
        when nothing was fetched.
    """
    # Only the database round-trip happens under the lock.
    with self.update_doc_lock:
        cur = connection.cursor()
        cur.execute(dbquery)
        fetched = cursorutils.dictfetchall(cur)

    if not fetched:
        return []
    return [record['id'] for record in fetched]
def queryDocID(self, dbquery):
    """Execute ``dbquery`` and collect the ``id`` of each resulting row.

    Args:
        dbquery: full SQL text; it is passed to the cursor unchanged.

    Returns:
        list: the ``id`` values in fetch order, or ``[]`` if no rows came
        back.
    """
    result_rows = None
    self.update_doc_lock.acquire()
    try:
        db_cursor = connection.cursor()
        db_cursor.execute(dbquery)
        result_rows = cursorutils.dictfetchall(db_cursor)
    finally:
        self.update_doc_lock.release()

    # The ids are extracted after the lock has been given back.
    return [entry['id'] for entry in result_rows] if result_rows else []
def save(self, r, data):
    """Record the parent URL of ``r`` (if any), then save the document.

    The parent URL is handled first, so it is stored even when saving the
    document fails afterwards.

    Args:
        r: crawl result; reads ``parent_url``, ``parent_md5`` and
            ``crawl_date`` here and is passed on to ``save_doc``.
        data: document payload, passed through to ``save_doc`` untouched.
    """
    # pid is the id of the parent URL row; it stays None when the resource
    # has no parent URL.
    pid = None

    if r.parent_url is not None:
        table = runconfig.dbt_parenturl
        with self.update_parent_lock:  # released on every exit path
            cursor = connection.cursor()
            # Values are bound as parameters rather than spliced into SQL.
            cursor.execute(
                "SELECT * FROM " + table + " WHERE md5=%s",
                [r.parent_md5])
            rows = cursorutils.dictfetchall(cursor)

            if not rows:
                # insert a new parent URL
                dbquery = ("INSERT INTO " + table +
                           " (url,md5,first_crawl_date,last_crawl_date,is_live) "
                           " VALUES (%s,%s,%s,%s,%s)")
                dbquerypar = (r.parent_url.decode('utf8'), r.parent_md5,
                              str(r.crawl_date), str(r.crawl_date), '1')
                cursor.execute(dbquery, dbquerypar)
                pid = cursor.lastrowid
            else:
                # update an existing parent URL
                row = rows[0]
                pid = row['id']
                if r.crawl_date > row['last_crawl_date']:
                    cursor.execute(
                        "UPDATE " + table +
                        " SET last_crawl_date=%s WHERE id=%s",
                        [str(r.crawl_date), row['id']])
            # NOTE(review): no explicit commit is issued here -- the
            # transaction.commit_unless_managed() calls were disabled in
            # this version; confirm the writes are committed elsewhere.

    # save document
    self.save_doc(r, data, pid)
def save(self, r, data):
    """Store the parent URL of ``r`` when it has one, then save the document.

    The parent row is written (and committed) before the document, so it
    persists even if saving the document fails.

    Args:
        r: crawl result; ``parent_url``, ``parent_md5`` and ``crawl_date``
            are read here, and the object is forwarded to ``save_doc``.
        data: document payload, forwarded to ``save_doc`` unchanged.
    """
    if r.parent_url is None:
        # No parent URL: the document is saved with a parent id of None.
        pid = None
    else:
        parent_table = runconfig.dbt_parenturl
        crawl_date = str(r.crawl_date)

        with self.update_parent_lock:  # always released, even on error
            cursor = connection.cursor()
            # The md5 is bound as a parameter, not concatenated.
            cursor.execute(
                "SELECT * FROM " + parent_table + " WHERE md5=%s",
                [r.parent_md5])
            rows = cursorutils.dictfetchall(cursor)

            if rows:
                # update an existing parent URL
                row = rows[0]
                pid = row['id']
                if r.crawl_date > row['last_crawl_date']:
                    cursor.execute(
                        "UPDATE " + parent_table +
                        " SET last_crawl_date=%s WHERE id=%s",
                        [crawl_date, row['id']])
                    transaction.commit_unless_managed()
            else:
                # insert a new parent URL
                dbquery = ("INSERT INTO " + parent_table +
                           " (url,md5,first_crawl_date,last_crawl_date,is_live) "
                           " VALUES (%s,%s,%s,%s,%s)")
                dbquerypar = (r.parent_url.decode('utf8'), r.parent_md5,
                              crawl_date, crawl_date, '1')
                cursor.execute(dbquery, dbquerypar)
                pid = cursor.lastrowid
                transaction.commit_unless_managed()

    # save document
    self.save_doc(r, data, pid)
def startup(verbose=False):
    """Check every selected document file and report the unhealthy ones.

    Selects the ids of all rows with ``submission_id=-2`` from the
    configured table, then for each id whose file exists in the repository:
    writes its size to the document-size file and runs ``file -i`` on it.
    A document is healthy when any of the configured accepted MIME type
    strings occurs in that output.  The id of every other document is
    written to the unhealthy-document file and, depending on the
    configuration toggles, its folder is removed from the repository and
    its row is deleted from the database.

    Args:
        verbose: when true, print the raw ``file -i`` output per document.

    Returns:
        None.  Returns early when no rows are selected, or as soon as
        removing a document folder fails.
    """
    cfg = dochealthcheck_config

    # create on-screen information print object
    infoprinter = printinfo.printInfo()

    # define counters
    counters = counter.Counter()
    counters.newCounter('all')
    counters.setCounter('all', 0)
    counters.newCounter('healthy')
    counters.newCounter('inrepo')

    # create output directory if it does not exist
    if not os.path.exists(cfg.outputdir):
        os.makedirs(cfg.outputdir)

    # create database object and print database name
    cdb = crawldb.CrawlDB()
    infoprinter.printPara('Database name', cdb.dbname)

    # select documents to check
    update_doc_lock = threading.Lock()
    with update_doc_lock:
        cursor = connection.cursor()
        dbquery = "SELECT id FROM " + cfg.dbt_name + " WHERE submission_id=-2"
        print(dbquery)
        cursor.execute(dbquery)
        rows = cursorutils.dictfetchall(cursor)

    if not rows:
        infoprinter.printPara('Number of records', str(0))
        return
    infoprinter.printPara('Number of records', str(len(rows)))
    counters.setCounter('all', len(rows))

    # Both output files are opened with ``with`` so they are closed on
    # every exit path, including the early return below and exceptions.
    with open(cfg.outputdir + cfg.f_docsize, 'w') as f_docsize:
        f_docsize.write('crawlid byte\n')
        with open(cfg.outputdir + cfg.f_unhealthdoc, 'w') as f_unhealthdoc:
            f_unhealthdoc.write('unhealth_crawlid\n')

            # start checking each file
            for row in rows:
                docid = row['id']

                # construct the full document path from the document ID
                infile = cfg.inputdir + idtopath(docid)
                if not os.path.exists(infile):
                    infoprinter.printStatus('file exists', 'no')
                    continue
                counters.addCounter('inrepo')

                # record file size in bytes
                statinfo = os.stat(infile)
                f_docsize.write(
                    str(docid) + ' ' + str(statinfo.st_size) + '\n')

                # check the file type
                cmd_file = 'file -i "' + infile + '"'
                cmdoutput = commands.getoutput(cmd_file)
                if verbose:
                    print(cmdoutput)

                # documents whose MIME type is not among the accepted
                # ones are identified as "unhealthy"
                if any(am in cmdoutput for am in cfg.accepted_mimes):
                    print('document is healthy %s' % (docid,))
                    counters.addCounter('healthy')
                    continue

                print('unhealthy document:  %s' % (docid,))
                # write unhealthy document ID to output file
                f_unhealthdoc.write(str(docid) + '\n')

                # delete file folder from repository
                if cfg.toggle_delete_from_repo:
                    infiledir = os.path.dirname(infile)
                    cmd_repo = 'rm -rf ' + infiledir
                    commands.getoutput(cmd_repo)
                    if os.path.exists(infiledir):
                        infoprinter.printStatus(cmd_repo, 'FAIL')
                        # NOTE(review): as in the original, this aborts
                        # without committing earlier DELETEs or printing
                        # the counters -- confirm that is intended.
                        return
                    infoprinter.printStatus(cmd_repo, 'OK')

                # delete the record from database
                if cfg.toggle_delete_from_db:
                    print('DELETE FROM ' + cfg.dbt_name +
                          ' WHERE id=' + str(docid))
                    # The id is bound as a query parameter.
                    cursor.execute(
                        'DELETE FROM ' + cfg.dbt_name + ' WHERE id=%s',
                        [docid])

    # commit all transactions after looping over all documents
    if cfg.toggle_delete_from_db:
        transaction.commit_unless_managed()

    # print out counters
    counters.printCounter()