def process_message(key, job):
    # get features:
    print 'FINDING SIMILARITY'
    # do the work to find similarity
    error = validate_job(job)
    if error is not None:
        print 'Error in Job : {}'.format(error)
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    image_similarity = ImageSimilarity(float(job['similarity_threshold']),
                                       job['start_time_ms'], job['end_time_ms'],
                                       job['similarity_method'])

    query_params = [{
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']]
    }]

    if 'lang' in job:
        query_params.append({
            'query_type': 'where',
            'property_name': 'lang',
            'query_value': job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # feed each document's feature vector to the similarity clusterer
        for doc in page:
            if job['data_type'] == 'text' and 'text_features' in doc and 'id' in doc and \
                    len(doc['text_features']) > 0:
                image_similarity.process_vector(doc['id'], doc['text_features'])
                continue
            if job['data_type'] == 'image' and 'image_features' in doc and 'id' in doc and \
                    len(doc['image_features']) > 0:
                image_similarity.process_vector(doc['id'], doc['image_features'])

    clusters = image_similarity.get_clusters()
    print 'FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters))
    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        loopy.post_result(job['result_url'], cluster)

    job['data'] = image_similarity.to_json()
    job['state'] = 'processed'
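# The similarity workers in this section call validate_job(job) and expect
# either an error string or None; the real helper is not shown here. What
# follows is a minimal hypothetical sketch, with the required-field list
# inferred from the job keys the workers above and below actually read.
def validate_job(job):
    required = ['similarity_threshold', 'start_time_ms', 'end_time_ms',
                'data_type', 'query_url', 'result_url', 'job_id']
    for field in required:
        if field not in job:
            return 'Missing field: {}'.format(field)
    return None  # no error; the caller proceeds with clustering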
def process_message(key, job):
    if job.get('type') == 'featurizer':
        job['state'] = 'processed'
        return

    print 'Checking Parameters'
    err_check(job)
    if job['state'] == 'error':
        return

    print 'FINDING SIMILARITY'
    hash_clust = HashtagClusters(float(job['min_post']))

    query_params = [{
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']]
    }, {
        'query_type': 'neq',
        'property_name': 'hashtags',
        'query_value': 'null'
    }]

    if 'lang' in job:
        query_params.append({
            'query_type': 'where',
            'property_name': 'lang',
            'query_value': job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # group each document's id under its hashtags
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['hashtags'])

    print 'FINISHED SIMILARITY PROCESSING'
    for k, v in hash_clust.hash_groups.iteritems():
        cluster = {
            'term': k,
            'post_ids': v,
            'job_monitor_id': job['job_id']
        }
        loopy.post_result(job['result_url'], cluster)

    job['data'] = hash_clust.to_json()
    job['state'] = 'processed'
import json
import os

# ClusterLinker, Loopy and err_check are assumed to come from this worker's
# own modules (their imports are not shown in the original snippet).


def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    query_url = os.getenv('QUERY_URL', job['query_url'])
    result_url = os.getenv('RESULT_URL', job['result_url'])

    query_params = [{
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': job['end_time_ms']
    }]

    print 'BEGIN LINKING CLUSTERS'
    linker = ClusterLinker(job.get('min_overlap', 0.6))
    loopy = Loopy(query_url, query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # register every cluster on the page with the linker
        for doc in page:
            linker.add_cluster(doc)

    print 'FINISHED LINKING CLUSTERS'
    for link in linker.get_links():
        loopy.post_result(result_url, link)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
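# Every worker in this section drives the same Loopy object. Its interface,
# as inferred from the call sites here, wraps a paginated query endpoint and
# a result-posting endpoint. The skeleton below is a hypothetical sketch of
# that inferred contract, not the real implementation.
class Loopy(object):
    def __init__(self, query_url, query_params):
        self.result_count = 0    # total hits reported by the query endpoint
        self.current_page = 0    # page cursor advanced while scrolling
        self.total_returned = 0  # documents returned so far

    def get_next_page(self):
        """Return the next page of documents, or None when exhausted."""

    def post_result(self, result_url, payload):
        """POST one result object (e.g. a cluster) to result_url."""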
def process_message(key, job):
    print 'FINDING SIMILARITY'
    error = validate_job(job)
    if error:
        print 'Error in Job : {}'.format(error)
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    feature_similarity = FeatureSimilarity(float(job['similarity_threshold']),
                                           job['start_time_ms'], job['end_time_ms'])

    query_params = [{
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']],
        'featurizer': job['data_type']
    }]

    if 'lang' in job:
        query_params.append({
            'query_type': 'where',
            'property_name': 'lang',
            'query_value': job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # route each document's features to the clusterer by data type
        for doc in page:
            if job['data_type'] == 'text' and 'text_features' in doc and 'id' in doc and \
                    len(doc['text_features']) > 0:
                feature_similarity.process_vector(doc['id'], doc['post_id'],
                                                  doc['text_features'])
                continue
            if job['data_type'] == 'image' and 'image_features' in doc and 'id' in doc and \
                    len(doc['image_features']) > 0 and 'primary_image_url' in doc and \
                    doc['primary_image_url']:
                feature_similarity.process_vector(doc['id'], doc['post_id'],
                                                  doc['image_features'],
                                                  doc['primary_image_url'])

    clusters = feature_similarity.get_clusters()
    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        cluster['data_type'] = job['data_type']
        loopy.post_result(job['result_url'], cluster)

    job['data'] = feature_similarity.to_json()
    job['state'] = 'processed'
    print 'FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters))
import os
import uuid

# DomainClusters, Loopy, err_check, get_domains and truncate_posts are
# assumed to come from this worker's own modules (their imports are not
# shown in the original snippet).


def process_message(key, job):
    print 'absentfriends/main.py:process_message ' + repr((key, job))

    # if type == 'featurizer', immediately process and return b/c domains
    # are not featurized. allows system to continue with clustering process.
    if job.get('type') == 'featurizer':
        job['state'] = 'processed'
        job['data'] = []
        return

    err_check(job)
    if job['state'] == 'error':
        return

    query_url = os.getenv('QUERY_URL', job['query_url'])
    result_url = os.getenv('RESULT_URL', job['result_url'])

    print 'FINDING SIMILARITY'
    print 'min_post set to %s' % job['min_post']
    domain_clust = DomainClusters(float(job['min_post']), query_url,
                                  job['start_time_ms'])

    query_params = [{
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']]
    }, {
        'query_type': 'where',
        'property_name': 'featurizer',
        'query_value': 'domain'
    }]

    loopy = Loopy(query_url, query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # extract domains from each document's image urls and cluster them
        for doc in page:
            domains = get_domains(doc['image_urls'])
            if len(domains) > 0:
                domain_clust.process_vector(doc['id'], doc['post_id'], domains)

    if int(os.getenv('TRUNCATE_POSTS') or 0):
        print 'Truncating posts...'
        # NOTE: the original referenced an undefined name (feature_similarity)
        # here; domain_clust is the clusterer in scope and is assumed to
        # expose the same get_clusters_to_delete() helper.
        print truncate_posts(domain_clust.get_clusters_to_delete(), loopy)
    else:
        print 'Skipping truncate posts because TRUNCATE_POSTS env var is not set...'

    print 'FINISHED SIMILARITY PROCESSING'
    for k, v in domain_clust.get_clusters().iteritems():
        cluster = {
            'id': str(uuid.uuid4()),
            'term': k,
            'similar_ids': v['similar_ids'],
            'similar_post_ids': v['similar_post_ids'],
            'job_monitor_id': job['job_id'],
            'start_time_ms': job['start_time_ms'],
            'end_time_ms': job['end_time_ms'],
            'stats': v['stats'],
            'data_type': 'domain'
        }
        try:
            loopy.post_result(result_url, cluster)
        except Exception as e:
            # TODO: we should set data = None when error.
            job['data'] = []
            job['state'] = 'error'
            job['error'] = str(e)
            break
    else:  # no errors: the loop completed without break
        job['data'] = domain_clust.to_json()
        job['state'] = 'processed'
import uuid

# HashtagClusters, Loopy and err_check are assumed to come from this
# worker's own modules (their imports are not shown in the original snippet).


def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    print 'FINDING SIMILARITY'
    print 'min_post set to %s' % job['min_post']
    hash_clust = HashtagClusters(float(job['min_post']), job['result_url'],
                                 job['start_time_ms'])

    query_params = [{
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']]
    }, {
        'query_type': 'where',
        'property_name': 'featurizer',
        'query_value': 'hashtag'
    }, {
        'query_type': 'neq',
        'property_name': 'hashtags',
        'query_value': 'null'
    }]

    if 'lang' in job:
        query_params.append({
            'query_type': 'where',
            'property_name': 'lang',
            'query_value': job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # group each document's post ids under its hashtags
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['post_id'], doc['hashtags'])

    print 'FINISHED SIMILARITY PROCESSING'
    for k, v in hash_clust.get_clusters().iteritems():
        cluster = {
            'id': str(uuid.uuid4()),
            'term': k,
            'similar_ids': v['similar_ids'],
            'similar_post_ids': v['similar_post_ids'],
            'job_monitor_id': job['job_id'],
            'start_time_ms': job['start_time_ms'],
            'end_time_ms': job['end_time_ms'],
            'stats': v['stats'],
            'data_type': 'hashtag'
        }
        try:
            loopy.post_result(job['result_url'], cluster)
        except Exception as e:
            # TODO: we should set data = None when error.
            job['data'] = []
            job['state'] = 'error'
            job['error'] = str(e)
            break
    else:  # no errors: the loop completed without break
        job['data'] = hash_clust.to_json()
        job['state'] = 'processed'
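# All six workers share one contract: each mutates the incoming job dict,
# setting job['state'] to 'processed' or 'error', job['data'] to the
# serialized result, and job['error'] on failure. The dispatch layer is not
# shown in this section; below is a hypothetical driver sketch in which
# queue_client and its get_next_job/save_job methods are assumptions, not
# an API shown in this codebase.
def run_worker(queue_client):
    while True:
        key, job = queue_client.get_next_job()  # hypothetical API
        try:
            process_message(key, job)
        except Exception as e:
            job['state'] = 'error'
            job['error'] = str(e)
        # persist the mutated job so monitors can observe the state change
        queue_client.save_job(key, job)  # hypothetical API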