def customURLClean(qs_parts):
    #return qs_parts  #do nothing
    latlonstore = None
    lstore = None
    #handles mobile location input
    for Key in qs_parts:
        cskey = Key.lower()
        #Keep only the last '_'-separated token of LineRef/MonitoringRef values
        if cskey in ("lineref", "monitoringref"):
            parts = qs_parts[Key].split('_')
            qs_parts[Key] = parts[-1]
        #Merge separate lat/lon parameters into a single "lat_lon" value
        if cskey == "lat" or cskey == "lon":
            if latlonstore is None:
                latlonstore = Key
            else:
                if cskey == "lat":
                    lat = qs_parts[Key]
                    lon = qs_parts[latlonstore]
                else:
                    lon = qs_parts[Key]
                    lat = qs_parts[latlonstore]
                latlonstore = "%s_%s" % (lat, lon)
        #An "l" parameter already holds "lat,lon"; normalize the separator
        if cskey == "l":
            lstore = qs_parts[Key].replace(",", "_")
    if latlonstore is not None:
        qs_parts["CustAWStatsLocation"] = latlonstore
    elif lstore is not None:
        qs_parts["CustAWStatsLocation"] = lstore
    return qs_parts
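# Minimal usage sketch for customURLClean. The query-string keys and values
# below are hypothetical stand-ins for the SIRI-style (MonitoringRef) and
# mobile-location (Lat/Lon) parameters this cleaner targets; real log entries
# may use different keys.
if __name__ == '__main__':
    sample_qs = {
        "MonitoringRef": "MTA_405111",  # prefixed stop id
        "Lat": "40.7128",
        "Lon": "-74.0060",
    }
    cleaned = customURLClean(sample_qs)
    assert cleaned["MonitoringRef"] == "405111"  # only the last '_' token is kept
    assert cleaned["CustAWStatsLocation"] == "40.7128_-74.0060"  # merged as lat_lon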
def main():
    logger.info('SA_Worker: initializing sql...')
    sql_q.init()
    logger.info('SA_Worker: initializing SA_Mapper...')
    SA_Mapper.setup_SA()
    while True:
        logger.info('SA_Worker: Getting number of tweets that need SA...')
        num_todo = _get_num_tweets_need_sa()
        logger.info("SA_Worker: num_todo: %s", num_todo)
        last_check = time.time()
        if num_todo >= EMR_THRESHOLD and ALLOW_EMR:
            logger.info("SA_Worker: Using Hadoop to do SA")
            #There are a lot of Tweets to analyze, so spin up
            #an EMR job to tackle them one BATCH_SIZE at a time
            #Get a connection to S3
            s3_conn = boto.connect_s3()
            #Grab the reTOracle bucket & key
            bucket = s3_conn.get_bucket('retoracle')
            key = Key(bucket)
            #Clean up any input or output lingering from previous jobs
            cleanup(bucket)
            #Create and upload the data and scripts that the EMR job needs,
            #everything up to MAX_BATCH_SIZE
            create_inputs(key, min(num_todo, MAX_BATCH_SIZE))
            #Create and start the EMR job
            emr_conn = boto.emr.connect_to_region('us-west-2')
            jobid = _create_sa_job(emr_conn)
            print "Started EMR Job", jobid
            #Wait for the EMR job to complete
            _wait_for_job_to_complete(emr_conn, jobid)
            #EMR job is done, so get the SA results and push them to SQL
            push_sa_results_to_sql_from_s3(bucket)
        else:
            logger.info("SA_Worker: Doing SA locally")
            #Process Tweets using this worker (not EMR)
            #Get the remaining Tweets that need SA
            logger.info("SA_Worker: Getting Tweet batch...")
            tweet_batch = json.loads(
                sql_q.get_query_results(
                    'tweet_batch', [min(num_todo, MAX_BATCH_SIZE)]))
            #Run SA on each Tweet and then upload its results to SQL
            count = 0
            current_batch = []
            total = len(tweet_batch)
            logger.info("SA_Worker: Running SA on each Tweet...")
            for tweet in tweet_batch:
                #Do SA locally on this Tweet
                result_dict = SA_Mapper.run_SA(tweet)
                neg_probs = []
                pos_probs = []
                for key in result_dict:
                    if "_neg_" in key.lower():
                        neg_probs.append(result_dict[key])
                    elif "_pos_" in key.lower():
                        pos_probs.append(result_dict[key])
                agg_sent_result = agg_sent.get_agg_sent(neg_probs, pos_probs)
                result_dict['agg_sent'] = agg_sent_result[0]
                result_dict['agg_prob'] = agg_sent_result[1]
                current_batch.append(result_dict)
                count += 1
                #Flush to SQL every SQL_INSERT_BATCH_SIZE Tweets, and at the end
                if len(current_batch) >= SQL_INSERT_BATCH_SIZE or count == total:
                    delicious_payload = json.dumps(current_batch)
                    logger.debug("Inserting into SQL: %s",
                                 delicious_payload.lower())
                    sql_q.get_query_results(
                        'set_tweet_sent', [delicious_payload.lower()], False)
                    logger.info("Inserted %s of %s into SQL", count, total)
                    current_batch = []
        #Wait a short while (if needed) before checking for more Tweets
        time_spent = time.time() - last_check
        if time_spent < MIN_EXECUTION_PERIOD:
            time.sleep(MIN_EXECUTION_PERIOD - time_spent)
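# Entry-point sketch, assuming this module is launched directly as the SA
# worker process; main() loops forever, so it is meant to run as its own
# long-lived process rather than be imported and called once.
if __name__ == '__main__':
    main()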