Example #1
def cmd(config, args):
	
	if (args.command not in avaliable_cmds):
		raise Exception("not a valid command: %s" % args.command)

	nid = args.node_id
	
	logger.info("node_id: %s"%(nid))
	node_queue = NodeQueue(nid, redis_config=config['redis_config'])
	node_coordinator = NodeCoordinator(config['redis_config'])
	# this can be done locally without sending the command to the servers...
	if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'):
		apikeys = config["apikeys"].values()[0]
		if (not os.path.exists(args.json)):
			raise Exception("%s doesn't exist..." % args.json)
		with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
			screen_names = json.load(f)
			user_api = User(apikeys=apikeys)
			user_ids = user_api.get_user_ids_by_screen_names(screen_names)
			json.dump(list(user_ids), o_f)
	elif (args.command == 'GET_USERS_FROM_IDS'):
		apikeys = config["apikeys"].values()[0]
		if (not os.path.exists(args.json)):
			raise Exception("%s doesn't exist..." % args.json)
		with open(os.path.abspath(args.json), 'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
			user_ids = json.load(f)
			user_api = User(apikeys=apikeys)
			users = user_api.get_users(user_ids)
			json.dump(list(users), o_f)
	elif (args.command.startswith('BATCH_')):
		command = args.command.replace('BATCH_', '')
		args_dict = copy.copy(args.__dict__)
		if (not os.path.exists(args.json)):
			raise Exception("%s doesn't exist..." % args.json)
		with open(os.path.abspath(args.json), 'rb') as f:
			user_ids = json.load(f)
			for user_id in user_ids:
				args_dict['user_id'] = user_id
				cmd = new_cmd(command, args_dict)
				node_queue.put(cmd)
	elif (args.command == 'LIST_NODES'):
		pp.pprint(node_coordinator.list_nodes())
	elif (args.command == 'NODE_QSIZES'):
		raise NotImplementedError("Not implemented yet...")
		#pp.pprint(node_coordinator.list_nodes())
	elif (args.command == 'SHUTDOWN_NODE'):
		#node_coordinator.remove_node(nid)
		#pp.pprint(node_coordinator.list_nodes())
		raise NotImplementedError("Not implemented yet...")
	elif (args.command == 'CLEAR_NODE_QUEUES'):
		node_queue.clear_all_queues()
	else:
		args_dict = copy.copy(args.__dict__)
		cmd = new_cmd(args.command, args_dict)
		node_queue.put(cmd)
		logger.info('sent [%s]'%(cmd))
def flush_cmd(bulk, data_type, template, redis_config):

	try:
		node_coordinator = NodeCoordinator(redis_config=redis_config)

		qsizes = node_coordinator.node_qsizes()

		logger.debug(qsizes)
		
		node_queues = {}

		for element in bulk:
			if data_type == "ids" and type(element) == int:
				user_id = element
			elif data_type == "users" and type(element) == dict and "id" in element:
				user_id = element['id']
			else:
				# skip elements that don't match the declared data_type
				continue

			t = copy.copy(template)
			t["user_id"] = int(user_id)
			t["depth"] = int(t["depth"]) - 1

			node_id = get_keys_by_min_value(qsizes)[0]

			if (node_id in node_queues):
				node_queue = node_queues[node_id]
			else:
				node_queue = NodeQueue(node_id, redis_config=redis_config)
				node_queues[node_id] = node_queue


			t['cmd_hash'] = hash_cmd(t)
			node_queue.put(t)
			qsizes[node_id] += 1

			logger.debug("send [%s] to node: %s"%(json.dumps(t),node_id))

		# intend to close all redis connections, but not sure yet...
		node_queues.clear()

		del node_coordinator

			
	except Exception as exc:
		logger.error('error during flush: %s'%exc)

	return True
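Both functions above lean on helpers that this listing does not include: cmd() builds command dicts with new_cmd(), and flush_cmd() stamps them with hash_cmd() and picks the least-loaded node via get_keys_by_min_value(). The following is a minimal sketch of what those helpers might look like, judging only from how they are called here; it is an assumption, not the project's actual implementation.

import copy
import hashlib
import json


def new_cmd(command, args_dict):
	# hypothetical: package the command name together with the parsed arguments
	c = copy.copy(args_dict)
	c['cmd'] = command
	return c


def hash_cmd(cmd):
	# hypothetical: a stable digest of the command payload, usable for de-duplication
	return hashlib.md5(json.dumps(cmd, sort_keys=True).encode('utf-8')).hexdigest()


def get_keys_by_min_value(d):
	# return the keys of d whose value equals the minimum value,
	# e.g. get_keys_by_min_value({'a': 2, 'b': 1, 'c': 1}) -> ['b', 'c']
	min_value = min(d.values())
	return [k for k, v in d.items() if v == min_value]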
Example #4
    def __init__(self, node_id, config={}, proxies=[]):
        self.node_id = node_id
        self.config = config
        if (proxies and len(proxies) > 0):

            self.proxy_list = proxy_checker(proxies)

            logger.info("number of live proxies: %d" % (len(self.proxy_list)))

            # each process only gets one apikey... if there are more proxies than apikeys, each process can get more than one proxy that can be rotated when one fails.
            number_of_processes = min(len(self.config['apikeys']),
                                      len(self.proxy_list))

            # if there are more proxies than apikeys, then each process will get a list of proxies, and the process will restart itself if a proxy fails and try the next available proxy
            self.proxy_generator = self.split(self.proxy_list,
                                              number_of_processes)

        else:
            self.proxy_list = None
            self.proxy_generator = None
            number_of_processes = 1

        logger.info("number of crawlers: %d" % (number_of_processes))

        apikey_list = self.config['apikeys'].keys()

        self.crawlers = {}
        for idx in range(number_of_processes):
            try:
                self.new_crawler(self.node_id,
                                 self.config['apikeys'][apikey_list[idx]],
                                 config)
            except Exception as exc:
                logger.error(exc)
                pass

        self.node_coordinator = NodeCoordinator(config['redis_config'])
        self.node_coordinator.add_node(node_id)

        logger.info("number of crawlers: %d created" % (number_of_processes))
Example #5
	def __init__(self, node_id, config={}, proxies=[]):
		self.node_id = node_id
		self.config = config
		if (len(proxies) > 0):
			
			self.proxy_list = proxy_checker(proxies)

			logger.info("number of live proxies: %d"%(len(self.proxy_list)))

			# each process only gets one apikey... if there are more proxies than apikeys, each process can get more than one proxy that can be rotated when one fails.
			number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))

			# if there are more proxies than apikeys, then each process will get a list of proxies, and the process will restart itself if a proxy fails and try the next available proxy
			self.proxy_generator = self.split(self.proxy_list, number_of_processes)

		else:
			self.proxy_list = None
			self.proxy_generator = None
			number_of_processes = 1

		logger.info("number of crawlers: %d"%(number_of_processes))

		apikey_list = self.config['apikeys'].keys()


		self.crawlers = {}
		for idx in range(number_of_processes):
			try:
				self.new_crawler(self.node_id, self.config['apikeys'][apikey_list[idx]], config)
			except Exception as exc:
				logger.error(exc)
				pass


		self.node_coordinator = NodeCoordinator(config['redis_config'])
		self.node_coordinator.add_node(node_id)

		logger.info("number of crawlers: %d created"%(number_of_processes))
Example #6
class Scheduler(object):
    def __init__(self, node_id, config={}, proxies=[]):
        self.node_id = node_id
        self.config = config
        if (proxies and len(proxies) > 0):

            self.proxy_list = proxy_checker(proxies)

            logger.info("number of live proxies: %d" % (len(self.proxy_list)))

            # each process only gets one apikey... if there are more proxies than apikeys, each process can get more than one proxy that can be rotated when one fails.
            number_of_processes = min(len(self.config['apikeys']),
                                      len(self.proxy_list))

            # if there are more proxies than apikeys, then each process will get a list of proxies, and the process will restart itself if a proxy fails and try the next available proxy
            self.proxy_generator = self.split(self.proxy_list,
                                              number_of_processes)

        else:
            self.proxy_list = None
            self.proxy_generator = None
            number_of_processes = 1

        logger.info("number of crawlers: %d" % (number_of_processes))

        apikey_list = self.config['apikeys'].keys()

        self.crawlers = {}
        for idx in range(number_of_processes):
            try:
                self.new_crawler(self.node_id,
                                 self.config['apikeys'][apikey_list[idx]],
                                 config)
            except Exception as exc:
                logger.error(exc)
                pass

        self.node_coordinator = NodeCoordinator(config['redis_config'])
        self.node_coordinator.add_node(node_id)

        logger.info("number of crawlers: %d created" % (number_of_processes))

    def new_crawler(self, node_id, apikeys, config, crawler_proxies=None):
        file_handler_config = {
            "name": "FileHandler",
            "args": {
                "output_folder": config["output"]
            }
        }

        crawler_id = apikeys['app_key']
        logger.debug('creating a new crawler: %s' % crawler_id)
        if (not crawler_proxies):
            crawler_proxies = next(
                self.proxy_generator) if self.proxy_generator else None

        crawler = TwitterCrawler(
            node_id,
            crawler_id,
            copy.copy(apikeys),
            handlers=[create_handler(file_handler_config)],
            redis_config=copy.copy(config['redis_config']),
            proxies=crawler_proxies)

        if (crawler_id in self.crawlers):
            #self.crawlers[crawler_id].clear()
            del self.crawlers[crawler_id]

        self.crawlers[crawler_id] = {
            'apikeys': apikeys,
            'crawler': crawler,
            'crawler_queue': CrawlerQueue(
                self.node_id, crawler_id,
                redis_config=copy.copy(config['redis_config'])),
            'crawler_proxies': crawler_proxies
        }
        crawler.start()

    def is_alive(self):
        a = [
            1 if self.crawlers[crawler_id]['crawler'].is_alive() else 0
            for crawler_id in self.crawlers
        ]
        return sum(a) > 0

    def crawler_status(self):
        status = []
        for crawler_id in self.crawlers:
            cc = self.crawlers[crawler_id]
            if ((not cc['crawler'].is_alive())):

                if ('retry_timer_start_ts' in cc
                        and (time.time() - cc['retry_timer_start_ts'] > 1800)):
                    # retry 30 mins after the crawler dies... mostly the crawler died because "Twitter API returned a 503 (Service Unavailable), Over capacity"
                    self.new_crawler(self.node_id, cc['apikeys'], self.config,
                                     cc['crawler_proxies'])
                    cc = self.crawlers[crawler_id]
                    logger.info('[%s] has been recreated...' % (crawler_id))
                else:
                    if ('retry_timer_start_ts' not in cc):
                        cc['retry_timer_start_ts'] = int(time.time())
                    else:
                        logger.warn(
                            '[%s] failed; waiting to recreate in %f mins...' %
                            (crawler_id,
                             (cc['retry_timer_start_ts'] + 1800 - time.time())
                             / 60.0))

            status.append({
                'crawler_id': crawler_id,
                'alive?': cc['crawler'].is_alive(),
                'qsize': cc['crawler_queue'].qsize(),
                'crawler_queue_key': cc['crawler_queue'].get_key()
            })

        return status

    def balancing_load(self):
        '''
        Find the crawler that has the most load at this moment and redistribute its items;
        each crawler runs in a separate subprocess, so we have to use redis to coordinate the redistribution...
        '''

        sorted_queues = self.sorted_local_queue(False)
        max_crawler_id, max_qsize = sorted_queues[-1]
        min_crawler_id, min_qsize = sorted_queues[0]
        logger.info("crawler with max_qsize: %s (%d)" %
                    (max_crawler_id, max_qsize))
        logger.info("crawler with min_qsize: %s (%d)" %
                    (min_crawler_id, min_qsize))
        logger.info("max_qsize - min_qsize > 0.5 * min_qsize ?: %r" %
                    ((max_qsize - min_qsize > 0.5 * min_qsize)))
        if (max_qsize - min_qsize > 0.5 * min_qsize):
            logger.info("load balancing process started...")
            cmds = []
            controls = []
            for i in range(int(0.3 * (max_qsize - min_qsize))):
                cmd = self.crawlers[max_crawler_id]['crawler_queue'].get()
                if (cmd['cmd'] in control_cmds):
                    controls.append(cmd)
                else:
                    cmds.append(cmd)

            # push control cmds back..
            for cmd in controls:
                self.crawlers[max_crawler_id]['crawler_queue'].put(cmd)

            logger.info("redistribute %d cmds" % len(cmds))
            for cmd in cmds:
                self.enqueue(cmd)

    def redistribute_crawler_queue(self, crawler_id):
        if (crawler_id in self.crawlers):
            logger.warn('%s just failed... redistributing its workload' %
                        (crawler_id))
            try:
                self.node_coordinator.distribute_to_nodes(
                    self.crawlers[crawler_id]['crawler_queue'])
                wait_timer = 180
                # wait until it dies (flushed all the data...)
                while (self.crawlers[crawler_id]['crawler'].is_alive()
                       and wait_timer > 0):
                    time.sleep(60)
                    wait_timer -= 60

                self.crawlers[crawler_id]['retry_timer_start_ts'] = int(
                    time.time())
            except Exception as exc:
                logger.error(full_stack())
        else:
            logger.warn(
                "what are you trying to do? crawler_id: [%s] is not valid..."
                % (crawler_id))

    def enqueue(self, cmd):

        if (cmd['cmd'] in ('TERMINATE', 'CRAWLER_FLUSH')):
            for crawler_id in self.crawlers:
                self.crawlers[crawler_id]['crawler_queue'].put(cmd)
        elif (cmd['cmd'] == 'BALANCING_LOAD'):
            self.balancing_load()
        elif (cmd['cmd'] == 'CRAWLER_FAILED'):
            crawler_id = cmd['crawler_id']
            self.redistribute_crawler_queue(crawler_id)
        else:
            '''distribute item to the local crawler that has the least tasks in queue'''
            for crawler_id, qsize in self.sorted_local_queue(False):
                if self.crawlers[crawler_id]['crawler'].is_alive():
                    self.crawlers[crawler_id]['crawler_queue'].put(cmd)

                    logger.debug("pushed %s to crawler: %s" %
                                 (cmd, crawler_id))
                    break

    def check_crawler_qsizes(self):
        return {
            crawler_id: self.crawlers[crawler_id]['crawler_queue'].qsize()
            for crawler_id in self.crawlers
        }

    def sorted_local_queue(self, reverse=False):
        local_qsizes = self.check_crawler_qsizes()
        return sorted(local_qsizes.iteritems(),
                      key=itemgetter(1),
                      reverse=reverse)

    def split(self, lst, n):
        """ Yield successive n chunks of even sized sub-lists from lst."""
        lsize = {}
        results = {}
        for i in range(n):
            lsize[i] = 0
            results[i] = []

        for x in lst:
            idx = get_keys_by_min_value(lsize)[0]
            results[idx].append(x)
            lsize[idx] += 1

        for i in range(n):
            yield results[i]
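balancing_load() sets aside any command whose 'cmd' value is in control_cmds so that control messages stay on their original crawler queue. That collection is not defined in this listing; judging from the command names handled in enqueue(), it plausibly contains something like the following (an assumption, not the project's definition):

control_cmds = ('TERMINATE', 'CRAWLER_FLUSH', 'BALANCING_LOAD', 'CRAWLER_FAILED')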
Example #7
def start_server(config, proxies):

    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    output_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else output_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(output_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(output_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if not os.path.exists(folder_to_create):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % output_folder)
    logger.info("archived to %s" % archive_output)

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    # node_coordinator.clear()

    # the main event loop; strictly we don't need one, since we could just join on the crawlers
    # and not stop until a terminate command is issued to each crawler,
    # but we need one to report the status of each crawler and perform the tarball tasks...

    last_archive_ts = time.time() + 3600  # the first archive event starts an hour later...
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:

        if time.time() - pre_time > 120:
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if scheduler.is_alive():
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if time.time() - last_archive_ts > 3600:

            logger.info("start archive procedure...")
            with concurrent.futures.ProcessPoolExecutor(
                    max_workers=len(buckets)) as executor:

                future_proxies = {
                    executor.submit(tarball_results, output_folder, bucket,
                                    archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }

                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))

            last_archive_ts = time.time()

        # block, the main process...for a command
        if not scheduler.is_alive():
            logger.info(
                "no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        if time.time() - last_load_balancing_task_ts > 1800:  # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        cmd = node_queue.get(block=True, timeout=360)

        if cmd:
            scheduler.enqueue(cmd)
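start_server() hands the hourly archiving off to a tarball_results() function, and derives its identity from a node_id() helper, neither of which appears in this listing. The done-callback formats f.result() with two placeholders, so tarball_results presumably returns a (success, archive_path) pair; the sketch below is built on that assumption, and its file-selection rule and archive naming are guesses rather than the project's behavior.

import os
import tarfile


def tarball_results(output_folder, bucket, archive_output, before_ts):
    """Hypothetical sketch: tar up files in output_folder/bucket that were last
    modified before before_ts into archive_output/bucket, returning
    (success, archive_path)."""
    source = os.path.join(output_folder, bucket)
    archive_path = os.path.join(archive_output, bucket,
                                '%s-%d.tar.gz' % (bucket, before_ts))
    try:
        with tarfile.open(archive_path, 'w:gz') as tar:
            for name in os.listdir(source):
                path = os.path.join(source, name)
                if os.path.isfile(path) and os.path.getmtime(path) < before_ts:
                    tar.add(path, arcname=name)
                    os.remove(path)
        return True, archive_path
    except (EnvironmentError, tarfile.TarError):
        return False, archive_path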
Example #8
class Scheduler(object):

	def __init__(self, node_id, config={}, proxies=[]):
		self.node_id = node_id
		self.config = config
		if (len(proxies) > 0):
			
			self.proxy_list = proxy_checker(proxies)

			logger.info("number of live proxies: %d"%(len(self.proxy_list)))

			# each process only gets one apikey... if there are more proxies than apikeys, each process can get more than one proxy that can be rotated when one fails.
			number_of_processes = min(len(self.config['apikeys']), len(self.proxy_list))

			# if there are more proxies than apikeys, then each process will get a list of proxies, and the process will restart itself if a proxy fails and try the next available proxy
			self.proxy_generator = self.split(self.proxy_list, number_of_processes)

		else:
			self.proxy_list = None
			self.proxy_generator = None
			number_of_processes = 1

		logger.info("number of crawlers: %d"%(number_of_processes))

		apikey_list = self.config['apikeys'].keys()


		self.crawlers = {}
		for idx in range(number_of_processes):
			try:
				self.new_crawler(self.node_id, self.config['apikeys'][apikey_list[idx]], config)
			except Exception as exc:
				logger.error(exc)
				pass


		self.node_coordinator = NodeCoordinator(config['redis_config'])
		self.node_coordinator.add_node(node_id)

		logger.info("number of crawlers: %d created"%(number_of_processes))

	def new_crawler(self, node_id, apikeys, config, crawler_proxies = None):
		file_handler_config = {
			"name": "FileHandler",
			"args": {
				"output_folder" : config["output"]
			}
		}

		# try:
			#crawler_id = md5('%s:%s'%(self.node_id, idx))
			#apikeys = self.config['apikeys'][apikey_list[idx]]
		crawler_id = apikeys['app_key']
		logger.debug('creating a new crawler: %s'%crawler_id)
		if (not crawler_proxies):
			crawler_proxies = next(self.proxy_generator) if self.proxy_generator else None

		crawler = UserRelationshipCrawler(node_id, crawler_id, copy.copy(apikeys), handlers=[create_handler(file_handler_config)], redis_config=copy.copy(config['redis_config']), proxies=crawler_proxies)
		
		if (crawler_id in self.crawlers):
			#self.crawlers[crawler_id].clear()
			del self.crawlers[crawler_id]

		self.crawlers[crawler_id] = {
			'apikeys': apikeys,
			'crawler': crawler,
			'crawler_queue': CrawlerQueue(self.node_id, crawler_id, redis_config=copy.copy(config['redis_config'])),
			'crawler_proxies': crawler_proxies
		}
		crawler.start()
		# except twython.exceptions.TwythonAuthError as exc:
		# 	logger.error('%s: %s'%(exc, apikeys))
		# except Exception as exc:
		# 	logger.error(exc)
		# 	raise


	def is_alive(self):
		a = [1 if self.crawlers[crawler_id]['crawler'].is_alive() else 0 for crawler_id in self.crawlers]
		return sum(a) > 0

	def crawler_status(self):
		status = []
		for crawler_id in self.crawlers:
			cc = self.crawlers[crawler_id]
			if ((not cc['crawler'].is_alive())): 
				
				if ('retry_timer_start_ts' in cc and (time.time() - cc['retry_timer_start_ts'] > 1800)):
					# retry 30 mins after the crawler dies... mostly the crawler died because "Twitter API returned a 503 (Service Unavailable), Over capacity"
					self.new_crawler(self.node_id, cc['apikeys'], self.config, cc['crawler_proxies'])
					cc = self.crawlers[crawler_id]
					logger.info('[%s] has been recreated...'%(crawler_id))
				else:
					if('retry_timer_start_ts' not in cc):
						cc['retry_timer_start_ts'] = int(time.time())
					else:
						logger.warn('[%s] failed; waiting to recreate in %f mins...'%(crawler_id, (cc['retry_timer_start_ts'] + 1800 - time.time())/60.0))

			status.append({'crawler_id':crawler_id, 'alive?': cc['crawler'].is_alive(), 'qsize': cc['crawler_queue'].qsize(), 'crawler_queue_key': cc['crawler_queue'].get_key()})

		return status

	def balancing_load(self):
		'''
		Find the crawler that has the most load at this moment and redistribute its items;
		each crawler runs in a separate subprocess, so we have to use redis to coordinate the redistribution...
		'''

		sorted_queues = self.sorted_local_queue(False)
		max_crawler_id, max_qsize = sorted_queues[-1]
		min_crawler_id, min_qsize = sorted_queues[0]
		logger.info("crawler with max_qsize: %s (%d)"%(max_crawler_id, max_qsize))
		logger.info("crawler with min_qsize: %s (%d)"%(min_crawler_id, min_qsize))
		logger.info("max_qsize - min_qsize > 0.5 * min_qsize ?: %r"%((max_qsize - min_qsize > 0.5 * min_qsize)))
		if (max_qsize - min_qsize > 0.5 * min_qsize):
			logger.info("load balancing process started...")
			cmds = []
			controls = []
			for i in range(int(0.3 * (max_qsize - min_qsize))):
				cmd = self.crawlers[max_crawler_id]['crawler_queue'].get()
				if (cmd['cmd'] in control_cmds):
					controls.append(cmd)
				else:
					cmds.append(cmd)

			# push control cmds back..
			for cmd in controls:
				self.crawlers[max_crawler_id]['crawler_queue'].put(cmd)

			logger.info("redistribute %d cmds"%len(cmds))
			for cmd in cmds:
				self.enqueue(cmd)

	def redistribute_crawler_queue(self, crawler_id):
		if (crawler_id in self.crawlers):
			logger.warn('%s just failed... redistributing its workload'%(crawler_id))
			try:
				self.node_coordinator.distribute_to_nodes(self.crawlers[crawler_id]['crawler_queue'])
				wait_timer = 180
				# wait until it dies (flushed all the data...)
				while(self.crawlers[crawler_id]['crawler'].is_alive() and wait_timer > 0):
					time.sleep(60)
					wait_timer -= 60

				self.crawlers[crawler_id]['retry_timer_start_ts'] = int(time.time())
			except Exception as exc:
				logger.error(full_stack())
		else:
			logger.warn("whatever are you trying to do? crawler_id: [%s] is not valid..."%(crawler_id))

	def enqueue(self, cmd):
		
		if (cmd['cmd'] in ('TERMINATE', 'CRAWLER_FLUSH')):
			for crawler_id in self.crawlers:
				self.crawlers[crawler_id]['crawler_queue'].put(cmd)
		elif(cmd['cmd'] == 'BALANCING_LOAD'):
			self.balancing_load()
		elif(cmd['cmd'] == 'CRAWLER_FAILED'):
			crawler_id = cmd['crawler_id']
			self.redistribute_crawler_queue(crawler_id)
		else:
			'''distribute item to the local crawler that has the least tasks in queue'''
			for crawler_id, qsize in self.sorted_local_queue(False):
				if self.crawlers[crawler_id]['crawler'].is_alive():
					self.crawlers[crawler_id]['crawler_queue'].put(cmd)

					logger.debug("pushed %s to crawler: %s"%(cmd, crawler_id))
					break

	def check_crawler_qsizes(self):
		return {crawler_id:self.crawlers[crawler_id]['crawler_queue'].qsize() for crawler_id in self.crawlers}

	def sorted_local_queue(self, reverse=False):
		local_qsizes = self.check_crawler_qsizes()
		return sorted(local_qsizes.iteritems(), key=itemgetter(1), reverse=reverse)

	def split(self, lst, n):
		""" Yield successive n chunks of even sized sub-lists from lst."""
		lsize = {}
		results = {}
		for i in range(n):
			lsize[i] = 0
			results[i] = []

		
		for x in lst:
			idx = get_keys_by_min_value(lsize)[0]
			results[idx].append(x)
			lsize[idx] += 1

		for i in range(n):
			yield results[i]
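redistribute_crawler_queue() above logs full_stack() when redistribution fails; that helper is not part of this listing either. It presumably formats the exception currently being handled together with its traceback, roughly like this sketch (an assumption, not the project's implementation):

import sys
import traceback


def full_stack():
	# meant to be called from inside an except block: format the active
	# exception and its traceback as a single string for logging
	return ''.join(traceback.format_exception(*sys.exc_info()))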
Example #9
def cmd(config, args):

    if (args.command not in avaliable_cmds):
        raise Exception("not a valid command: %s" % args.command)

    nid = args.node_id

    logger.info("node_id: %s" % (nid))
    node_queue = NodeQueue(nid, redis_config=config['redis_config'])
    node_coordinator = NodeCoordinator(config['redis_config'])
    # this can be done locally without sending the command to the servers...
    if (args.command == 'GET_UIDS_FROM_SCREEN_NAMES'):
        apikeys = config["apikeys"].values()[0]
        if (not os.path.exists(args.json)):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            screen_names = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            user_ids = twitter_api.get_user_ids_by_screen_names(screen_names)
            json.dump(list(user_ids), o_f)
    elif (args.command == 'GET_USERS_FROM_IDS'):
        apikeys = config["apikeys"].values()[0]
        if (not os.path.exists(args.json)):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json),
                  'rb') as f, open(os.path.abspath(args.output), 'wb') as o_f:
            user_ids = json.load(f)
            twitter_api = TwitterAPI(apikeys=apikeys)
            users = twitter_api.get_users(user_ids)
            json.dump(list(users), o_f)
    elif (args.command.startswith('BATCH_')):
        new_command = args.command.replace('BATCH_', '')
        args_dict = copy.copy(args.__dict__)
        if (not os.path.exists(args.json)):
            raise Exception("%s doesn't exist..." % args.json)
        with open(os.path.abspath(args.json), 'rb') as f:
            if (args.command == 'BATCH_CRAWL_TWEET'):
                tweet_ids = json.load(f)
                for tweet_id in tweet_ids:
                    print "Loading Tweet ID: ", tweet_id
                    args_dict['tweet_id'] = tweet_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
            else:
                user_ids = json.load(f)
                for user_id in user_ids:
                    args_dict['user_id'] = user_id
                    cmd = new_cmd(new_command, args_dict)
                    node_queue.put(cmd)
    elif (args.command == 'LIST_NODES'):
        pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'NODE_QSIZES'):
        raise NotImplementedError("Not implemented yet...")
        #pp.pprint(node_coordinator.list_nodes())
    elif (args.command == 'SHUTDOWN_NODE'):
        #node_coordinator.remove_node(nid)
        #pp.pprint(node_coordinator.list_nodes())
        raise NotImplementedError("Not implemented yet...")
    elif (args.command == 'CLEAR_NODE_QUEUES'):
        node_queue.clear_all_queues()
    else:
        args_dict = copy.copy(args.__dict__)
        cmd = new_cmd(args.command, args_dict)
        node_queue.put(cmd)
        logger.info('sent [%s]' % (cmd))
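cmd() only reads four attributes from args: command, node_id, json and output. A hypothetical argparse front end that produces them might look like the following; the flag names, defaults and help texts are made up for illustration:

import argparse


def parse_cmd_args():
    # hypothetical CLI wrapper around cmd(); the flags below are illustrative only
    parser = argparse.ArgumentParser(description='send a command to a crawler node')
    parser.add_argument('-c', '--command', required=True,
                        help='e.g. LIST_NODES, GET_USERS_FROM_IDS, BATCH_CRAWL_TWEET')
    parser.add_argument('-n', '--node_id', default='node-1',
                        help='id of the node the command is addressed to')
    parser.add_argument('-j', '--json',
                        help='input json file (user ids, screen names or tweet ids)')
    parser.add_argument('-o', '--output',
                        help='output json file for the GET_* commands')
    return parser.parse_args()

# cmd(config, parse_cmd_args()) would then dispatch the requested command.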