def process_item(self, item, spider):
    """Store ``item`` in the database table unless it is a duplicate.

    The item is looked up with ``self.check_duplicate`` (called with the
    ``'playid'`` and ``'station'`` names); when that call reports a match the
    item is dropped, otherwise it is handed to ``self.insert_item``.

    When ``settings.DROPCOUNT_CONTROL`` is non-zero and ``self.drops.in_row``
    has reached it, a notice is printed and the crawler is asked to shut
    down instead of checking or inserting the item.

    Args:
        item: Mapping read with the keys ``'artist'``, ``'songtitle'``,
            ``'playid'`` and ``'playdatetime'``.
        spider: Not used by this method.

    Returns:
        The same ``item`` that was passed in.

    Raises:
        DropItem: If ``self.check_duplicate`` returns a truthy value.
    """
    # Gather the fields once; every log line below formats the same four.
    artist = item['artist']
    title = item['songtitle']
    details = (artist, title, item['playid'], item['playdatetime'])

    log.msg("Processing \"%s - %s\"(playid: %s, playdatetime: %s)." % details, level=log.DEBUG)

    # #DebugTool: Limit number of times an item can be dropped.
    # A DROPCOUNT_CONTROL of 0 switches the limit off.
    if settings.DROPCOUNT_CONTROL != 0 and self.drops.in_row > (settings.DROPCOUNT_CONTROL - 1):
        banner = "*************************************"
        print(banner)
        print("Drop limit met. Goodbye.")
        print(banner)
        crawler._signal_shutdown(9, 0)  # Kills the Spider
        return item

    if self.check_duplicate(self.databaseTable, item, ['playid', 'station']):
        log.msg("Duplicate item found. Dropping \"%s - %s\"(playid: %s, playdatetime: %s)." % details, level=log.ERROR)
        print("Duplicate item found. Dropping \"%s - %s\"(%s)." % (artist, title, self.drops.in_row))
        raise DropItem("Item already exists in db.")

    log.msg("No duplicate found. Inserting \"%s - %s\"(playid: %s, playdatetime: %s)." % details, level=log.DEBUG)
    self.insert_item(self.databaseTable, item)
    return item
def __init__(self):
    """Create the drop/insert counters and connect to the database.

    Runs once per pipeline instance. The connection parameters and the
    target table name are read from ``settings``. On success ``self.conn``
    and ``self.cur`` hold the open connection and a cursor on it.

    If connecting (or creating the cursor) fails, the error is logged and
    the crawler is asked to shut down; the exception is not re-raised, so
    ``self.conn`` / ``self.cur`` may be left unset in that case.
    """
    self.drops = counter('Dropped Songs')
    self.inserts = counter('Inserted Songs')

    # Read every setting up front so a missing setting surfaces as its own
    # error rather than being reported as a connection failure.
    databaseName = settings.databaseName
    databaseUser = settings.databaseUser
    databasePswd = settings.databasePswd
    databaseHost = settings.databaseHost
    self.databaseTable = settings.KEXPdatabaseTable

    try:
        # Pass the parameters as keywords instead of formatting a DSN string
        # by hand: the driver then handles quoting, and there is no format
        # string whose placeholder count can drift from the argument count.
        self.conn = psycopg2.connect(
            dbname=databaseName,
            user=databaseUser,
            password=databasePswd,
            host=databaseHost,
        )
        self.cur = self.conn.cursor()
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        log.msg("Failed to connect to database (%s). Shutting down spider." % exc, level=log.ERROR)
        crawler._signal_shutdown(9, 0)  # Kills the Spider if connection fails.
    else:
        log.msg("Successfully connected to database \"%s\"." % (databaseName), level=log.INFO)