Example #1
def customURLClean(qs_parts):
	"""Clean query-string parts and derive a CustAWStatsLocation value."""
	#return qs_parts  #uncomment to skip cleaning entirely
	latlonstore = None
	lstore = None  #handles mobile location input
	for key in qs_parts:
		cskey = key.lower()
		#Keep only the last '_'-separated token of LineRef/MonitoringRef values
		if cskey in ("lineref", "monitoringref"):
			qs_parts[key] = qs_parts[key].split('_')[-1]
		#Combine separate lat/lon parameters into a single "lat_lon" value
		if cskey in ("lat", "lon"):
			if latlonstore is None:
				latlonstore = key  #remember the first of the pair
			else:
				if cskey == "lat":
					lat = qs_parts[key]
					lon = qs_parts[latlonstore]
				else:
					lon = qs_parts[key]
					lat = qs_parts[latlonstore]
				latlonstore = "%s_%s" % (lat, lon)
		#Mobile clients send the location as a single comma-separated value
		if cskey == "l":
			lstore = qs_parts[key].replace(",", "_")

	if latlonstore is not None:
		qs_parts["CustAWStatsLocation"] = latlonstore
	elif lstore is not None:
		qs_parts["CustAWStatsLocation"] = lstore

	return qs_parts
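
A minimal usage sketch; the parameter names and coordinate values below are hypothetical, not taken from the original project:

# Hypothetical SIRI-style query-string parts; keys and values are made up
sample = {
	"LineRef": "OPERATOR_ROUTE_42",
	"lat": "60.1699",
	"lon": "24.9384",
}
cleaned = customURLClean(sample)
# "LineRef" is trimmed to its last '_'-separated token and a combined
# location parameter is added:
#   cleaned["LineRef"] == "42"
#   cleaned["CustAWStatsLocation"] == "60.1699_24.9384"
print(cleaned)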
Example #2
def main():
    logger.info('SA_Worker: initializing sql...')
    sql_q.init()
    logger.info('SA_Worker: initializing SA_Mapper...')
    SA_Mapper.setup_SA()
    while True:
        logger.info('SA_Worker: Getting number of tweets that need SA...')
        num_todo = _get_num_tweets_need_sa()
        logger.info("SA_Worker: num_todo: %s", num_todo)
        last_check = time.time()
        if num_todo >= EMR_THRESHOLD and ALLOW_EMR:
            logger.info("SA_Worker: Using Hadoop to do SA")
            # There are a lot of Tweets to analyze, so spin up an EMR job
            # to tackle them, one batch of up to MAX_BATCH_SIZE at a time

            #Get a connection to S3
            s3_conn = boto.connect_s3()

            #Grab the reTOracle bucket & key
            bucket = s3_conn.get_bucket('retoracle')
            key = Key(bucket)

            # Clean up any input or output lingering from previous jobs
            cleanup(bucket)

            # Create and upload the data and scripts that the EMR job needs,
            # covering everything up to MAX_BATCH_SIZE Tweets
            create_inputs(key, min(num_todo, MAX_BATCH_SIZE))

            #Create and start the EMR job
            emr_conn = boto.emr.connect_to_region('us-west-2')
            jobid = _create_sa_job(emr_conn)
            print "Started EMR Job", jobid

            #Wait for the EMR job to complete
            _wait_for_job_to_complete(emr_conn, jobid)

            #EMR Job is done, so get SA results and push to SQL
            push_sa_results_to_sql_from_s3(bucket)

        else:
            logger.info("SA_Worker: Doing SA locally")
            #Process Tweets using this worker (not EMR)

            #Get the remaining Tweets that need SA
            logger.info("SA_Worker: Getting Tweet batch...")
            tweet_batch = json.loads(
                sql_q.get_query_results(
                    'tweet_batch',
                    [min(num_todo, MAX_BATCH_SIZE)]))

            #Run SA on each Tweet and then upload its results to SQL
            count = 0
            current_batch = []
            total = len(tweet_batch)
            logger.info("SA_Worker: Running SA on each Tweet...")
            for tweet in tweet_batch:
                #do SA magics locally
                result_dict = SA_Mapper.run_SA(tweet)
                neg_probs = []
                pos_probs = []
                for key in result_dict:
                    if "_neg_" in key.lower():
                        neg_probs.append(result_dict[key])
                    elif "_pos_" in key.lower():
                        pos_probs.append(result_dict[key])
                agg_sent_result = agg_sent.get_agg_sent(neg_probs, pos_probs)
                result_dict['agg_sent'] = agg_sent_result[0]
                result_dict['agg_prob'] = agg_sent_result[1]
                current_batch.append(result_dict)
                count += 1
                if (len(current_batch) >= SQL_INSERT_BATCH_SIZE
                        or count == total):
                    delicious_payload = json.dumps(current_batch)
                    logger.debug("Inserting into SQL: %s", delicious_payload.lower())
                    sql_q.get_query_results(
                        'set_tweet_sent',
                        [delicious_payload.lower()],
                        False)
                    logger.info("Inserted %s of %s into SQL", count, total)
                    current_batch = []

        #Wait a short while (if needed) before checking for more Tweets
        time_spent = time.time() - last_check
        if time_spent < MIN_EXECUTION_PERIOD:
            time.sleep(MIN_EXECUTION_PERIOD - time_spent)
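
main() relies on several helpers that are not shown here (_get_num_tweets_need_sa, cleanup, create_inputs, _create_sa_job, _wait_for_job_to_complete, push_sa_results_to_sql_from_s3). As one illustration, here is a minimal sketch of what a _wait_for_job_to_complete polling loop could look like with boto's legacy EMR API; the poll interval, log wording, and returning the final state are assumptions, not the project's actual implementation:

import logging
import time

import boto.emr

logger = logging.getLogger(__name__)

def _wait_for_job_to_complete(emr_conn, jobid, poll_seconds=30):
    """Poll the EMR job flow until it reaches a terminal state (sketch only)."""
    terminal_states = ('COMPLETED', 'FAILED', 'TERMINATED')
    while True:
        state = emr_conn.describe_jobflow(jobid).state
        logger.info("SA_Worker: EMR job %s is in state %s", jobid, state)
        if state in terminal_states:
            return state
        time.sleep(poll_seconds)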
Example #3
def main():
    logger.info('SA_Worker: initializing sql...')
    sql_q.init()
    logger.info('SA_Worker: initializing SA_Mapper...')
    SA_Mapper.setup_SA()
    while True:
        logger.info('SA_Worker: Getting number of tweets that need SA...')
        num_todo = _get_num_tweets_need_sa()
        logger.info("SA_Worker: num_todo: %s", num_todo)
        last_check = time.time()
        if num_todo >= EMR_THRESHOLD and ALLOW_EMR:
            logger.info("SA_Worker: Using Hadoop to do SA")
            # There are a lot of Tweets to analyze, so spin up an EMR job
            # to tackle them, one batch of up to MAX_BATCH_SIZE at a time

            #Get a connection to S3
            s3_conn = boto.connect_s3()

            #Grab the reTOracle bucket & key
            bucket = s3_conn.get_bucket('retoracle')
            key = Key(bucket)

            # Clean up any input or output lingering from previous jobs
            cleanup(bucket)

            # Create and upload the data and scripts that the EMR job needs,
            # covering everything up to MAX_BATCH_SIZE Tweets
            create_inputs(key, min(num_todo, MAX_BATCH_SIZE))

            #Create and start the EMR job
            emr_conn = boto.emr.connect_to_region('us-west-2')
            jobid = _create_sa_job(emr_conn)
            print "Started EMR Job", jobid

            #Wait for the EMR job to complete
            _wait_for_job_to_complete(emr_conn, jobid)

            #EMR Job is done, so get SA results and push to SQL
            push_sa_results_to_sql_from_s3(bucket)

        else:
            logger.info("SA_Worker: Doing SA locally")
            #Process Tweets using this worker (not EMR)

            #Get the remaining Tweets that need SA
            logger.info("SA_Worker: Getting Tweet batch...")
            tweet_batch = json.loads(
                sql_q.get_query_results('tweet_batch',
                                        [min(num_todo, MAX_BATCH_SIZE)]))

            #Run SA on each Tweet and then upload its results to SQL
            count = 0
            current_batch = []
            total = len(tweet_batch)
            logger.info("SA_Worker: Running SA on each Tweet...")
            for tweet in tweet_batch:
                #do SA magics locally
                result_dict = SA_Mapper.run_SA(tweet)
                neg_probs = []
                pos_probs = []
                for key in result_dict:
                    if "_neg_" in key.lower():
                        neg_probs.append(result_dict[key])
                    elif "_pos_" in key.lower():
                        pos_probs.append(result_dict[key])
                agg_sent_result = agg_sent.get_agg_sent(neg_probs, pos_probs)
                result_dict['agg_sent'] = agg_sent_result[0]
                result_dict['agg_prob'] = agg_sent_result[1]
                current_batch.append(result_dict)
                count += 1
                if (len(current_batch) >= SQL_INSERT_BATCH_SIZE
                        or count == total):
                    delicious_payload = json.dumps(current_batch)
                    logger.debug("Inserting into SQL: %s",
                                 delicious_payload.lower())
                    sql_q.get_query_results('set_tweet_sent',
                                            [delicious_payload.lower()], False)
                    logger.info("Inserted %s of %s into SQL", count, total)
                    current_batch = []

        #Wait a short while (if needed) before checking for more Tweets
        time_spent = time.time() - last_check
        if time_spent < MIN_EXECUTION_PERIOD:
            time.sleep(MIN_EXECUTION_PERIOD - time_spent)
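
The local path flushes current_batch to SQL whenever it reaches SQL_INSERT_BATCH_SIZE or the last Tweet has been processed. The same flush pattern, sketched as a standalone helper; the name flush_in_batches and the callback argument are hypothetical and not part of the project:

def flush_in_batches(items, batch_size, flush):
    """Call flush(batch) for every batch_size items, plus any remainder."""
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) >= batch_size:
            flush(batch)
            batch = []
    if batch:
        flush(batch)

# Usage sketch mirroring the loop above: hand each full batch of SA results
# to the SQL layer as a JSON payload.
# flush_in_batches(sa_results, SQL_INSERT_BATCH_SIZE,
#                  lambda batch: sql_q.get_query_results(
#                      'set_tweet_sent', [json.dumps(batch).lower()], False))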