def go(project_id, rawdir, archdir, insertdir, logdir):
    # Connects to project account DB
    project = db.get_project_detail(project_id)
    project_name = project['project_name']

    configdb = project['project_config_db']
    conn = db.connection[configdb]
    project_config_db = conn.config

    # Reference for controller if script is active or not.
    project_config_db.update({'module': 'twitter'},
                             {'$set': {'processor_active': 1}})

    Config = ConfigParser.ConfigParser()
    Config.read(PLATFORM_CONFIG_FILE)

    # Creates logger w/ level INFO
    logger = logging.getLogger('preprocess')
    logger.setLevel(logging.INFO)

    # Creates rotating file handler w/ level INFO
    fh = logging.handlers.TimedRotatingFileHandler(
        logdir + '/' + project_name + '-processor-log-' + project_id + '.out',
        'D', 1, 30, None, False, False)
    fh.setLevel(logging.INFO)

    # Creates formatter and applies to rotating handler
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    datefmt = '%m-%d %H:%M'
    formatter = logging.Formatter(format, datefmt)
    fh.setFormatter(formatter)

    # Finishes by adding the rotating, formatted handler
    logger.addHandler(fh)

    logger.info('Starting preprocess system')

    if not os.path.exists(rawdir + '/error_tweets/'):
        os.makedirs(rawdir + '/error_tweets/')

    error_tweet = open(rawdir + '/error_tweets/error_tweet-' + project_name +
                       '-' + project_id + '.txt', 'a')

    module_config = project_config_db.find_one({'module': 'twitter'})
    runPreProcessor = module_config['processor']['run']

    if runPreProcessor:
        print 'Starting runPreProcessor'
        logger.info('Preprocess start signal')

    runLoopSleep = 0

    while runPreProcessor:
        # Get all terms for all collectors
        track_list = []
        for collector in project['collectors']:
            if collector['terms_list']:
                tmp_terms = [term['term'] for term in collector['terms_list']]
                track_list += tmp_terms
        if track_list:
            track_list = list(set(track_list))

        tweetsFileList = get_tweet_file_queue(Config, rawdir)
        files_in_queue = len(tweetsFileList)

        if files_in_queue < 1:
            time.sleep(180)
        else:
            logger.info('Queue length is %d' % files_in_queue)
            rawTweetsFile = tweetsFileList[0]
            logger.info('Preprocess raw file: %s' % rawTweetsFile)

            processed_tweets_file = get_processed_tweets_file_name(
                Config, rawTweetsFile, rawdir, archdir)

            # TODO - Dynamic copy time
            # Lame workaround: for now we assume it will take less than a
            # minute to copy a file, so this sleep waits for a copy to finish
            # on the off chance that we happen to see the file just as it is
            # being copied into the directory.
            time.sleep(60)

            f_out = open(processed_tweets_file, 'w')

            tweets_list = []
            tweet_total = 0
            lost_tweets = 0
            line_number = 0

            with open(rawTweetsFile) as f:
                if '-delete-' not in rawTweetsFile and '-streamlimits-' not in rawTweetsFile:
                    for line in f:
                        try:
                            line_number += 1
                            line = line.strip()

                            tweet_out_string = tweetprocessing.process_tweet(
                                line, track_list, expand_url=EXPAND_URLS)
                            f_out.write(tweet_out_string)
                            tweet_total += 1
                            # print tweet_out_string
                        except ValueError, e:
                            lost_tweets += 1
                            print "ValueError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line + "\n")
                            print traceback.format_exc()
                        except TypeError, e:
                            lost_tweets += 1
                            print "TypeError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line + "\n")
                            print traceback.format_exc()
                        except KeyError, e:
                            lost_tweets += 1
                            print "KeyError. tweet not processed: %d (%s)" % (line_number, rawTweetsFile)
                            logger.warning("tweet not processed: %d (%s)" % (line_number, rawTweetsFile))
                            logging.exception(e)
                            error_tweet.write(line + "\n")
                            print traceback.format_exc()
                elif '-streamlimits-' in rawTweetsFile:
                    # Stream limit notices get their own processing path.
                    server_name = os.uname()[1]
                    # Default collector_id so a failed filename parse below
                    # cannot raise a NameError when process_limit() is called.
                    collector_id = 'UNDEFINED'
                    try:
                        collector_id = rawTweetsFile.split('-')[6]
                        collector = db.get_collector_detail(
                            project_id=project_id, collector_id=collector_id)
                        col_type = collector['collector']['api']
                    except:
                        col_type = 'UNDEFINED'

                    for line in f:
                        line = line.strip()
                        limit_out_string = tweetprocessing.process_limit(
                            line, col_type, server_name, project_name,
                            project_id, collector_id)
                        f_out.write(limit_out_string)