import glob
import os
import time

# NOTE: crawling_config, FileQueue, FileLock and readFeed are assumed to be
# defined/imported elsewhere in this script; only main() is shown here.


def main():
    global FLAG_SLEEPING  # set to True while the script sleeps between runs

    # get the list of feed files from the 'feeds' directory
    # in the same directory as this script
    feedLists = glob.glob(crawling_config.DIR_FEED_LISTS + '*')

    while True:
        # get the file queue for this run (this locks the queue until we close it)
        feedQueue = FileQueue(crawling_config.DIR_QUEUE, crawling_config.DIR_LOCKS)
        saved = 0

        # go through the feed files
        for feedList in feedLists:
            feedLock = FileLock(crawling_config.DIR_LOCKS + os.path.basename(feedList))
            if not feedLock.isLocked():
                # the feed is not locked at the moment, so we may proceed,
                # but first we have to lock it ourselves
                feedLock.lock()
                try:
                    # feed file layout: first line is the mode, second line is
                    # the mode configuration, each remaining line is
                    # '<feed url> <extra argument>'
                    feedFile = open(feedList, 'r')
                    crawling_config.DEBUG('Processing %s' % feedList)
                    feedMode = feedFile.readline().strip()
                    feedModeConfig = feedFile.readline().strip()
                    feedUrls = feedFile.read().strip().split('\n')
                    feedFile.close()

                    for feedUrl in feedUrls:
                        feedUrlParts = feedUrl.split(' ')
                        links = readFeed(feedUrlParts[0], feedMode,
                                         feedModeConfig, feedUrlParts[1])
                        saved += feedQueue.saveList(links)
                except IOError:
                    pass
                finally:
                    # always remove the lock so other sessions can proceed
                    feedLock.unlock()
            else:
                # oops, another session is working on this feed right now
                crawling_config.DEBUG('Bypassed %s because it is locked.' % feedList)

        # free the file queue
        feedQueue.close()
        crawling_config.DEBUG('Saved %d links to queue.' % saved)

        # stop if SIGINT has been sent to this script
        if FLAG_EXIT_NOW:
            print 'Stopping now!'
            break

        FLAG_SLEEPING = True
        crawling_config.DEBUG('Sleeping for %d seconds' % crawling_config.SLEEP_TIME)
        time.sleep(crawling_config.SLEEP_TIME)
        FLAG_SLEEPING = False

    print 'Bye bye'
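
# main() reads FLAG_EXIT_NOW and FLAG_SLEEPING, but the SIGINT handler that
# drives them is not part of this section. Below is a minimal sketch of how
# such a handler could be wired up: it asks main() to stop after the current
# run, and (an assumption, not confirmed by this section) exits immediately
# when the script is only sleeping between runs. The handler name
# handleSigint is hypothetical; the original script may differ.

import signal
import sys

FLAG_EXIT_NOW = False
FLAG_SLEEPING = False


def handleSigint(signum, frame):
    # request a graceful stop after the current crawl run finishes
    global FLAG_EXIT_NOW
    FLAG_EXIT_NOW = True
    # if we are merely sleeping between runs, there is no work to lose
    if FLAG_SLEEPING:
        print 'Interrupted while sleeping, exiting immediately.'
        sys.exit(0)


signal.signal(signal.SIGINT, handleSigint)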
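
# FileLock is used above but defined elsewhere in this project. A minimal
# lock-file based sketch consistent with the isLocked()/lock()/unlock() calls
# in main() is shown here; the real class may behave differently (this is an
# assumption for illustration, not the project's actual implementation).


class FileLock:
    def __init__(self, path):
        # the lock is represented by the existence of a file at 'path'
        self.path = path

    def isLocked(self):
        return os.path.exists(self.path)

    def lock(self):
        # create the lock file; os.O_EXCL makes creation fail if another
        # session won the race and created the file first
        fd = os.open(self.path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        os.close(fd)

    def unlock(self):
        # remove the lock file so other sessions can acquire the lock
        if os.path.exists(self.path):
            os.remove(self.path)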