def collector_control(collector_id):
    """
    POST control route for collector forms.

    Validates the submitted ProcessControlForm, queues the Celery task that
    matches the requested command (start / stop / restart) and redirects to the
    collector detail page.

    :param collector_id: ID of the collector the command applies to
    :return: redirect to the 'collector' view; the queued task's ID is passed
             along as ``task_id`` only when a task was actually queued
    """
    collector_form = ProcessControlForm(request.form)
    task = None

    # The network is part of the redirect target, so it is looked up regardless
    # of whether the form validates.
    db = DB()
    collector = db.get_collector_detail(g.project['project_id'], collector_id)
    network = collector['collector']['network']

    # On form submit controls the processor
    if request.method == 'POST' and collector_form.validate():
        command = request.form['control'].lower()

        task_args = {
            'process': 'collect',
            'project': g.project,
            'collector_id': collector_id
        }

        # command -> (celery task, queue it is routed to)
        dispatch = {
            'start': (start_daemon, 'stack-start'),
            'stop': (stop_daemon, 'stack-stop'),
            'restart': (restart_daemon, 'stack-start'),
        }

        if command in dispatch:
            daemon_task, queue = dispatch[command]
            task = daemon_task.apply_async(kwargs=task_args, queue=queue)

    redirect_args = {
        'project_name': g.project['project_name'],
        'network': network,
        'collector_id': collector_id
    }
    # An unknown command or an invalid form queues nothing; previously this
    # crashed on ``task.task_id`` with task being None.
    if task is not None:
        redirect_args['task_id'] = task.task_id

    return redirect(url_for('collector', **redirect_args))
def collector(project_name, network, collector_id, task_id=None):
    """
    Renders the detail / control page for a single collector.

    When a task_id is supplied, the state of that Celery task is turned into a
    short status message shown on the page.
    """
    # Nothing is loaded into the session yet, so an admin is sent back to the homepage
    if g.project is None:
        flash(
            'Please navigate to the New Collector page from your homepage panel.'
        )
        return redirect(url_for('index'))

    form = ProcessControlForm(request.form)

    db = DB()

    # Collector info for the page
    collector_info = db.get_collector_detail(g.project['project_id'], collector_id)['collector']

    # Active status of the collection process
    status_resp = db.check_process_status(g.project['project_id'], 'collect', collector_id=collector_id)
    active_status = status_resp['message']

    # A start/stop/restart may still be in progress; report on it if so
    task_status = None
    if task_id:
        still_pending = celery.AsyncResult(task_id).state == 'PENDING'
        if still_pending:
            task_status = 'Collector start/shutdown still in progress...'
        else:
            task_status = 'Collector start/shutdown completed.'

    return render_template('collector.html',
                           collector=collector_info,
                           active_status=active_status,
                           form=form,
                           task_status=task_status)
class Controller(object):
    """
    Controller - A class for controlling STACK processes.

    Starts, stops and restarts the daemon that runs a collector, processor or
    inserter for a given project. Run state is tracked through flags in the
    project's config DB and through a pidfile on disk.
    """

    def __init__(self, process, cmdline=False, home_dir='.', umask=0o22, verbose=1, **kwargs):
        """
        :param process: 'collect', 'process' or 'insert'
        :param cmdline: True when invoked from the command line; project info is
                        then looked up from kwargs['project_id'] instead of being
                        taken from kwargs['project']
        :param home_dir: directory the daemon changes into after the first fork
        :param umask: umask applied to the daemon
        :param verbose: when >= 1, "Started" is printed once daemonized
        :param kwargs: 'project' or 'project_id' (see cmdline); 'network' for
                       process / insert; 'collector_id' for collect
        :raises ValueError: if process is not one of the three known types
        """
        self.db = DB()
        self.process = process
        self.cmdline = cmdline
        self.usage_message = 'controller collect|process|insert start|stop|restart project_id collector_id'

        self.home_dir = home_dir
        self.umask = umask
        self.verbose = verbose
        # Cleared by the SIGTERM / SIGINT handler installed in daemonize()
        self.daemon_alive = True

        if self.cmdline is False:
            # Grab information from Flask user object
            self.project = kwargs['project']
            self.project_id = self.project['project_id']
            self.project_name = self.project['project_name']
        else:
            # Command is coming from the command line, info is looked up below
            self.project_id = kwargs['project_id']

        # Fetched once and reused for both the name lookup and the config DB
        project_info = self.db.get_project_detail(self.project_id)
        if self.cmdline is not False:
            if project_info['status']:
                self.project_name = project_info['project_name']
            else:
                self._usage_exit('Project w/ ID %s not found!' % self.project_id)

        # Project account DB connection
        configdb = project_info['project_config_db']
        project_config_db = self.db.connection[configdb]
        self.projectdb = project_config_db.config

        # Loads info for process based on type: collector, processor, inserter
        if self.process in ['process', 'insert']:
            # Only module type needed for processor / inserter
            self.module = kwargs['network']
            self.collector_id = None

            # Set name for worker based on gathered info
            self.process_name = self.project_name + '-' + self.process + '-' + self.module + '-' + self.project_id
        elif process == 'collect':
            # For collectors, also grabs: collector_id, api, collector_name
            self.collector_id = kwargs['collector_id']

            resp = self.db.get_collector_detail(self.project_id, self.collector_id)
            if resp['status']:
                collector = resp['collector']
                self.module = collector['network']
                self.api = collector['api']
                self.collector_name = collector['collector_name']
            else:
                self._usage_exit('Collector (ID: %s) not found!' % self.collector_id)

            # Set name for worker based on gathered info
            self.process_name = self.project_name + '-' + self.collector_name + '-' + self.process + '-' + \
                self.module + '-' + self.collector_id
        else:
            # Without this, an unknown type only failed later with an obscure
            # AttributeError on self.module
            raise ValueError('Unknown process type: %s' % process)

        project_dir = self.project_name + '-' + self.project_id

        # Sets out directories
        self.piddir = app.config['LOGDIR'] + '/' + project_dir + '/pid'
        self.logdir = app.config['LOGDIR'] + '/' + project_dir + '/logs'
        self.stddir = app.config['LOGDIR'] + '/' + project_dir + '/std'

        # Sets data dirs
        # TODO - deprecate w/ Facebook
        module_dir = app.config['DATADIR'] + '/' + project_dir + '/' + self.module
        self.rawdir = module_dir + '/raw'
        self.archdir = module_dir + '/archive'
        self.insertdir = module_dir + '/insert_queue'

        # Creates dirs if they don't already exist
        needed_dirs = [self.piddir, self.stddir]
        # These directories only need be created for Twitter
        # TODO - deprecate w/ Facebook
        if self.module == 'twitter':
            needed_dirs += [self.logdir, self.rawdir, self.archdir, self.insertdir]
        for directory in needed_dirs:
            if not os.path.exists(directory):
                os.makedirs(directory)

        # Sets outfiles
        self.pidfile = self.piddir + '/%s.pid' % self.process_name
        self.stdout = self.stddir + '/%s-stdout.txt' % self.process_name
        self.stderr = self.stddir + '/%s-stderr.txt' % self.process_name
        self.stdin = self.stddir + '/%s-stdin.txt' % self.process_name

        # Creates the std files for the daemon
        for std_file in (self.stdout, self.stdin, self.stderr):
            if not os.path.isfile(std_file):
                with open(std_file, 'w'):
                    pass

    def _usage_exit(self, message):
        """ Prints message followed by the usage line, then exits with status 1 """
        print(message)
        print('')
        print('USAGE: python %s %s' % (sys.argv[0], self.usage_message))
        sys.exit(1)

    def _read_active(self):
        """ Returns the current 'active' flag of this process from the project config DB """
        if self.process == 'collect':
            collector_conf = self.projectdb.find_one({'_id': ObjectId(self.collector_id)})
            return collector_conf['active']

        module_conf = self.projectdb.find_one({'module': self.module})
        if self.process == 'process':
            return module_conf['processor_active']
        return module_conf['inserter_active']

    def _set_inactive(self):
        """ Sets the 'active' flag of this process to 0 in the project config DB """
        if self.process == 'process':
            self.projectdb.update({'module': self.module}, {'$set': {'processor_active': 0}})
        elif self.process == 'insert':
            self.projectdb.update({'module': self.module}, {'$set': {'inserter_active': 0}})
        else:
            self.projectdb.update({'_id': ObjectId(self.collector_id)}, {'$set': {'active': 0}})

    def process_command(self, cmd):
        """
        Parses the passed command (start / stop / restart) and initiates daemonization.

        An unknown command prints the usage message; from the command line it
        additionally exits with status 1.
        """
        handlers = {
            'start': self.start,
            'stop': self.stop,
            'restart': self.restart,
        }

        if cmd in handlers:
            handlers[cmd]()
        elif self.cmdline:
            self._usage_exit('Invalid command: %s' % cmd)
        else:
            print('USAGE: python %s %s' % (sys.argv[0], self.usage_message))

    def start(self):
        """
        Method that starts the daemon process.

        Sets the run flags for the process, then daemonizes and runs it. Exits
        with status 1 if the pidfile shows a daemon is already there.
        """
        print('Initializing the STACK daemon: %s' % self.process_name)

        # Sets flags for given process
        resp = ''
        if self.process == 'collect':
            resp = self.db.set_collector_status(self.project_id, self.collector_id, collector_status=1)
        elif self.process == 'process':
            resp = self.db.set_network_status(self.project_id, self.module, run=1, process=True)
        elif self.process == 'insert':
            resp = self.db.set_network_status(self.project_id, self.module, run=1, insert=True)

        if 'status' in resp and resp['status']:
            print('Flags set.')

            # Check to see if running based on pidfile
            pid = self.get_pid()
            if pid:
                message = "pidfile %s already exists. Is it already running?\n"
                sys.stderr.write(message % self.pidfile)
                sys.exit(1)

            # Start the daemon
            self.daemonize()
            self.run()
        else:
            print('Failed to successfully set flags, try again.')

    def stop(self):
        """
        Method that sets flags and stops the daemon process.

        Step 1 sets the stop flags. Step 2 polls the 'active' flag (up to 15
        tries, 5 seconds apart); if the daemon is still around after that it is
        signalled until the OS reports it gone.
        """
        import errno

        print('Stop command received.')
        print('Step 1) Setting flags on the STACK process to stop.')

        if self.process == 'collect':
            # Set flags for the STACK process to stop
            resp = self.db.set_collector_status(self.project_id, self.collector_id, collector_status=0)
            # Grab active flag from collector's Mongo document
            active = self._read_active()
        else:
            active = self._read_active()
            if self.process == 'process':
                resp = self.db.set_network_status(self.project_id, self.module, run=0, process=True)
            else:
                resp = self.db.set_network_status(self.project_id, self.module, run=0, insert=True)

        # TODO - mongo error handling
        if resp['status']:
            print('Step 1 complete.')

        # If the daemon has already stopped, then set flags and break
        pid = self.get_pid()
        if not pid:
            print("STACK daemon already terminated.")

            # Extra clean up, just in case
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)

            self._set_inactive()
            return

        # Step 2) Check for task / STACK process completion; loops through 15 times to check
        print('Step 2) Check for STACK process completion and shutdown the daemon.')

        max_tries = 15
        wait_count = 0
        while active == 1:
            wait_count += 1
            active = self._read_active()

            print('Try %d / 15' % wait_count)
            print('Active Status: %d' % active)
            print('Trying again in 5 seconds.')
            print('')

            # '>=' so the loop really stops after 15 tries (it used to run a 16th)
            if wait_count >= max_tries:
                break

            time.sleep(5)

        # Get the pid from the pidfile
        pid = self.get_pid()
        if not pid:
            print("Daemon successfully stopped via thread termination.")

            # Just to be sure: the PID file may exist but be empty
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)

            return  # Not an error in a restart

        # Try killing the daemon process
        print('Daemon still running w/ loose thread. Stopping now...')
        try:
            attempts = 0
            # Loop ends when os.kill raises because the process is gone
            while True:
                os.kill(pid, signal.SIGTERM)
                time.sleep(0.1)
                attempts += 1
                if attempts % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except OSError as err:
            # ESRCH == "No such process"; checked by errno rather than by
            # matching the (locale dependent) error text
            if err.errno == errno.ESRCH:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
            else:
                print(str(err))
                sys.exit(1)

        # Had to kill the daemon, so set the active status flag accordingly.
        self._set_inactive()

        print('Stopped.')

    def restart(self):
        """
        Simple restart of the daemon
        """
        # TODO - restart w/out shutting down daemon as part of extensible processor modules
        self.stop()
        self.start()

    def run(self):
        """
        Calls the process logic scripts and runs
        """
        # Backwards compatibility for older Twitter scripts
        if self.module == 'twitter':
            if self.process == 'collect':
                ThreadedCollector.go(self.api, self.project_id, self.collector_id, self.rawdir, self.logdir)
            elif self.process == 'process':
                preprocess.go(self.project_id, self.rawdir, self.archdir, self.insertdir, self.logdir)
            elif self.process == 'insert':
                mongoBatchInsert.go(self.project_id, self.rawdir, self.insertdir, self.logdir)
        # New approach via extensible collectors
        else:
            # importlib replaces __import__(..., -1): the implicit-relative
            # level -1 is rejected with a ValueError on Python 3
            import importlib

            # Dynamically import from the app package under BASEDIR
            os.chdir(app.config['BASEDIR'])

            if self.process == 'collect':
                plugin = importlib.import_module('app.%s.collect' % self.module)
                c = plugin.Collector(self.project_id, self.collector_id, self.process_name)
                c.go()
            elif self.process == 'process':
                plugin = importlib.import_module('app.%s.process' % self.module)
                c = plugin.Processor(self.project_id, self.process_name, self.module)
                c.go()
            elif self.process == 'insert':
                plugin = importlib.import_module('app.%s.insert' % self.module)
                c = plugin.Inserter(self.project_id, self.process_name, self.module)
                c.go()

    def daemonize(self):
        """
        Do the UNIX double-fork magic, see Stevens' "Advanced
        Programming in the UNIX Environment" for details (ISBN 0201563177)
        http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
        """
        try:
            pid = os.fork()
            if pid > 0:
                # Exit first parent
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
            sys.exit(1)

        # Decouple from parent environment
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)

        # Do second fork
        try:
            pid = os.fork()
            if pid > 0:
                # Exit from second parent
                sys.exit(0)
        except OSError as e:
            sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
            sys.exit(1)

        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r+')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so

        # Standard streams are only redirected to the std files for command line runs
        if self.cmdline:
            os.dup2(si.fileno(), sys.stdin.fileno())
            os.dup2(so.fileno(), sys.stdout.fileno())
            os.dup2(se.fileno(), sys.stderr.fileno())

        sys.stderr.flush()
        sys.stdout.flush()

        def sigtermhandler(signum, frame):
            self.daemon_alive = False

        signal.signal(signal.SIGTERM, sigtermhandler)
        signal.signal(signal.SIGINT, sigtermhandler)

        if self.verbose >= 1:
            print("Started")

        # Write pidfile
        atexit.register(self.delpid)  # Make sure pid file is removed if we quit
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as pf:
            pf.write("%s\n" % pid)

    def delpid(self):
        """
        Removes the pidfile. A pidfile that is already gone (e.g. cleaned up by
        stop()) is not an error.
        """
        import errno

        try:
            os.remove(self.pidfile)
        except OSError as err:
            if err.errno != errno.ENOENT:
                raise

    def get_pid(self):
        """
        Returns the pid stored in the pidfile, or None when the pidfile is
        missing, unreadable or does not contain an integer (e.g. is empty).
        """
        try:
            with open(self.pidfile, 'r') as pf:
                return int(pf.read().strip())
        except (IOError, ValueError):
            return None
class BaseCollector(object): """ Extensible base class for all STACK collectors """ def __init__(self, project_id, collector_id, process_name): self.project_id = project_id self.collector_id = collector_id self.process_name = process_name self.collecting_data = False # Sets up connection w/ project config DB & loads in collector info self.db = DB() project = self.db.get_project_detail(self.project_id) if project['status']: self.project_name = project['project_name'] configdb = project['project_config_db'] project_db = self.db.connection[configdb] self.project_db = project_db.config resp = self.db.get_collector_detail(self.project_id, self.collector_id) if resp['status']: collector_info = resp['collector'] # Load in collector info self.collector_name = collector_info['collector_name'] self.network = collector_info['network'] self.api = collector_info['api'] self.collection_type = collector_info['collection_type'] self.params = collector_info['params'] self.terms_list = collector_info['terms_list'] self.languages = collector_info['languages'] self.locations = collector_info['location'] self.auth = collector_info['api_auth'] # TODO - file format to Mongo # TODO - less then hour = warning self.file_format = '%Y%m%d-%H' # If this is a streaming collector if self.collection_type == 'realtime': self.project_db.update({'_id': ObjectId(self.collector_id)}, {'$set': { 'stream_limits': [] }}) # Sets up logdir and logging logdir = app.config[ 'LOGDIR'] + '/' + self.project_name + '-' + self.project_id + '/logs' if not os.path.exists(logdir): os.makedirs(logdir) # Sets logger w/ name collector_name and level INFO self.logger = logging.getLogger(self.collector_name) self.logger.setLevel(logging.INFO) # Sets up logging file handler logfile = logdir + '/%s.log' % self.process_name # TODO - logging params # TODO - port logging rotation params to Mongo for user control later / these default values good handler = logging.handlers.TimedRotatingFileHandler(logfile, when='D', 
backupCount=30) handler.setLevel(logging.INFO) # Formats format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' dateformat = '%m-%d %H:%M' formatter = logging.Formatter(format, dateformat) handler.setFormatter(formatter) # Adds handler to logger to finish self.logger.addHandler(handler) self.log('STACK collector %s initiated.' % self.collector_name) # Sets up rawdir self.rawdir = app.config[ 'DATADIR'] + '/' + self.project_name + '-' + self.project_id + '/' + self.network + '/raw' if not os.path.exists(self.rawdir): os.makedirs(self.rawdir) self.log( 'All raw files and directories set. Now starting collector...') def go(self): """ Starts and maintains the loop that monitors the collection thread. Threads are maintained in the extended versions of the class """ # Checks if we're supposed to be running self.run_flag = self.check_flags()['run'] self.collect_flag = 0 self.update_flag = 0 if self.run_flag: self.log('Starting Facebook collector %s with signal %d' % (self.process_name, self.run_flag)) self.set_active(1) # If run_flag is set - begin the loop while self.run_flag: try: flags = self.check_flags() self.run_flag = flags['run'] self.collect_flag = flags['collect'] self.update_flag = flags['update'] except Exception as e: self.log('Mongo connection refused with exception: %s' % e, level='warn') # If we've been flagged to stop or update and we're collecting - shut it down if self.collecting_data and (self.update_flag or not self.collect_flag or not self.run_flag): self.stop_thread() # If we've been flagged to start and we're not collecting - start it up if self.collect_flag and threading.activeCount() == 1: self.start_thread() time.sleep(2) self.log('Exiting Facebook collection.') self.set_active(0) def write(self, data): """ Called to write raw data to raw file - handles rotation """ timestr = time.strftime(self.file_format) filename = self.rawdir + '/' + timestr + '-' + self.collector_name + '-' + self.collector_id + '-out.json' if not 
os.path.isfile(filename): self.log('Creating new raw file: %s' % filename) with open(filename, 'a') as rawfile: rawfile.write(json.dumps(data).encode('utf-8')) rawfile.write('\n') def log(self, message, level='info', thread='MAIN:'): """ Logs messages to process logfile """ message = str(message) if level == 'warn': self.logger.warning(thread + ' ' + message) elif level == 'error': self.logger.error(thread + ' ' + message) else: self.logger.info(thread + ' ' + message) def check_flags(self): """ Quick method to grab and return all Mongo flags for given Collector instance """ resp = self.db.get_collector_detail(self.project_id, self.collector_id) collector = resp['collector'] return { 'run': collector['collector']['run'], 'collect': collector['collector']['collect'], 'update': collector['collector']['update'], 'active': collector['active'] } def set_active(self, active): """ Quick method to set the active flag to 1 or 0 """ self.project_db.update({'_id': ObjectId(self.collector_id)}, {'$set': { 'active': active }}) def start_thread(self): """ Modify this method when extending the class to manage the actual collection thread """ def stop_thread(self): """
resp = db.get_collector_ids(project_id) print json.dumps(resp, indent=1) elif method == 'get_project_detail': """ python __main__.py db get_project_detail project_id """ project_id = sys.argv[3] resp = db.get_project_detail(project_id) print json.dumps(resp, indent=1) elif method == 'get_collector_detail': """ python __main__.py db get_collector_detail project_id collector_id """ project_id = sys.argv[3] collector_id = sys.argv[4] resp = db.get_collector_detail(project_id, collector_id) print json.dumps(resp, indent=1) elif method == 'get_network_detail': """ python __main__.py db get_network_detail project_id network """ project_id = sys.argv[3] network = sys.argv[4] resp = db.get_network_detail(project_id, network) print json.dumps(resp, indent=1) elif method == 'set_collector_detail': """ python __main__.py db set_collector_detail INPUT FORMATTING
def update_collector(collector_id):
    """
    Used to update a collector's details from the front-end.

    On GET, renders the update page with the main form and one terms form per
    existing term, pre-populated from the stored collector. On POST, reads the
    JSON payload, collects the values that differ from the stored collector and
    passes them to DB.update_collector_detail().

    :param collector_id: ID of the collector being edited
    :return: the rendered update page, or a redirect (to the collector page on
             success, back to this page on a validation / update failure)

    TODO - Terms & start / end dates
    """
    db = DB()
    resp = db.get_collector_detail(g.project['project_id'], collector_id)
    collector = resp['collector']

    # First, populate the main form w/ info
    form_params = {'collector_name': collector['collector_name']}

    if collector['network'] == 'twitter':
        stored_auth = collector['api_auth']
        form_params['api'] = collector['api']
        for auth_key in ('consumer_key', 'consumer_secret', 'access_token', 'access_token_secret'):
            form_params[auth_key] = stored_auth[auth_key]

        if collector['languages']:
            form_params['languages'] = '\r\n'.join(collector['languages'])

        if collector['location']:
            form_params['locations'] = _format_collector_locations(collector['location'])

    elif collector['network'] == 'facebook':
        form_params['collection_type'] = collector['collection_type']
        form_params['client_id'] = collector['api_auth']['client_id']
        form_params['client_secret'] = collector['api_auth']['client_secret']

        # TODO - start & end dates

    form = UpdateCollectorForm(**form_params)

    # Next, create a form for each term -- if no terms, one form is needed & it's empty
    terms_forms = []
    for term in collector['terms_list'] or []:
        # Load form & set defaults
        term_params = {
            'term': term['term'],
            'collect': int(term['collect']),
        }
        # NOTE(review): 'prefx' looks like a typo for 'prefix'; left as-is because
        # correcting it would change the rendered field names -- confirm against
        # the template / front-end script before changing.
        tform = UpdateCollectorTermsForm(prefx=term['term'], **term_params)
        terms_forms.append(tform)

    # Finally, update on submission
    if request.method == 'POST':
        # Loads in the data from the form & sets initial param dict
        form_data = request.get_json()
        params = {}

        if form_data['collector_name'] != collector['collector_name']:
            params['collector_name'] = form_data['collector_name']

        # Twitter data
        if collector['network'] == 'twitter':
            if form_data['api'] != collector['api']:
                params['api'] = form_data['api']

            # If one auth param is updated, assume all are
            if form_data['consumer_key'] != collector['api_auth']['consumer_key']:
                params['api_auth'] = {
                    'consumer_key': form_data['consumer_key'],
                    'consumer_secret': form_data['consumer_secret'],
                    'access_token': form_data['access_token'],
                    'access_token_secret': form_data['access_token_secret']
                }

            languages = form_data['languages']
            languages = languages.split('\r\n') if languages else None
            if languages != collector['languages']:
                params['languages'] = languages

            # The emptiness check used to test 'languages' here by mistake
            locations = form_data['locations']
            if not locations:
                locations = None
            else:
                locations = locations.replace('\r\n', ',').split(',')
                # '!=' instead of 'is not': identity comparison against an int
                # literal is implementation dependent
                if len(locations) % 4 != 0:
                    flash(
                        'Location coordinates should be entered in pairs of 4. Please try again'
                    )
                    return redirect(
                        url_for('update_collector', collector_id=collector_id))

            if locations != collector['location']:
                params['location'] = locations

        # Facebook Data
        elif collector['network'] == 'facebook':
            if form_data['collection_type'] != collector['collection_type']:
                # Take the submitted value; 'form' was populated from the stored
                # collector, so its data would just write the old value back
                params['collection_type'] = form_data['collection_type']
            if form_data['client_id'] != collector['api_auth']['client_id']:
                params['api_auth'] = {
                    'client_id': form_data['client_id'],
                    'client_secret': form_data['client_secret']
                }

            # TODO - start and end dates

        # Final terms dict
        params['terms_list'] = []

        # Term type value
        if collector['network'] == 'twitter':
            term_type = 'handle' if collector['api'] == 'follow' else 'term'
        else:
            term_type = 'page'

        # New terms (if any); blank lines are skipped rather than stored as empty terms
        new_terms = form_data['new_terms']
        if new_terms:
            for new_term in new_terms.split('\r\n'):
                if not new_term.strip():
                    continue
                params['terms_list'].append({
                    'term': new_term,
                    'collect': 1,
                    'type': term_type,
                    'id': None
                })

        # Updated terms (if any): a flat list of alternating term / collect values
        current_terms = form_data['terms'] or []
        for idx in range(0, len(current_terms), 2):
            params['terms_list'].append({
                'term': current_terms[idx],
                'collect': int(current_terms[idx + 1]),
                'type': term_type,
                'id': None
            })

        # Now, try updating
        resp = db.update_collector_detail(g.project['project_id'], collector_id, **params)
        if resp['status']:
            flash('Collector updated successfully!')
            return redirect(
                url_for('collector',
                        project_name=g.project['project_name'],
                        network=collector['network'],
                        collector_id=collector_id))
        else:
            flash(resp['message'])
            return redirect(
                url_for('update_collector', collector_id=collector_id))

    return render_template('update_collector.html',
                           collector=collector,
                           form=form,
                           terms_forms=terms_forms)


def _format_collector_locations(locations):
    """ Joins a flat list of coordinate strings into one comma separated group of 4 per line """
    rows = [','.join(locations[start:start + 4]) for start in range(0, len(locations), 4)]
    return '\r\n'.join(rows)
resp = db.get_collector_ids(project_id) print json.dumps(resp, indent=1) elif method == 'get_project_detail': """ python __main__.py db get_project_detail project_id """ project_id = sys.argv[3] resp = db.get_project_detail(project_id) print json.dumps(resp, indent=1) elif method == 'get_collector_detail': """ python __main__.py db get_collector_detail project_id collector_id """ project_id = sys.argv[3] collector_id = sys.argv[4] resp = db.get_collector_detail(project_id, collector_id) print json.dumps(resp, indent=1) elif method == 'get_network_detail': """ python __main__.py db get_network_detail project_id network """ project_id = sys.argv[3] network = sys.argv[4] resp = db.get_network_detail(project_id, network) print json.dumps(resp, indent=1) elif method == 'set_collector_detail': """ python __main__.py db set_collector_detail INPUT FORMATTING