def get_priors(self, term):
    q_start_time = self.start_ms - self.prior_ms
    term = unicode(term)
    query_params = [{
        "query_type": "where",
        "property_name": "term",
        "query_value": urllib.quote(term.encode('utf-8'), ':/')
    }, {
        "query_type": "where",
        "property_name": "data_type",
        "query_value": "hashtag"
    }, {
        "query_type": "between",
        "property_name": "end_time_ms",
        "query_value": [q_start_time, self.start_ms]
    }]
    lp = Loopy(self.url, query_params)

    # Default priors are kept small so that real data dominates the estimate
    alpha = 0.00001
    beta = 1

    while True:
        page = lp.get_next_page()
        if page is None:
            break
        for doc in page:
            alpha += len(doc['similar_post_ids'])
            beta += doc['stats']['total_posts']

    return (alpha, beta)
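# The (alpha, beta) pair returned above reads like pseudo-counts for a Beta
# prior over a term's engagement rate (similar posts vs. total posts in the
# prior window). A minimal sketch of how such a prior might be combined with
# counts from the current window -- the function and argument names below are
# illustrative assumptions, not part of the original code:
def posterior_engagement_rate(alpha, beta, observed_similar, observed_total):
    """Posterior mean of a Beta-Binomial update using the given pseudo-counts."""
    post_alpha = alpha + observed_similar
    post_beta = beta + max(observed_total - observed_similar, 0)
    # Mean of Beta(post_alpha, post_beta); small priors let the data dominate.
    return float(post_alpha) / (post_alpha + post_beta)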
def get_img_sum(self, cluster):
    n_posts = len(cluster['similar_post_ids'])
    l_sample = cluster['similar_post_ids']
    if n_posts > 100:
        l_sample = sample(cluster['similar_post_ids'], 100)

    imgs = set()

    # TODO: fix query type once S.L. is fixed
    for post_id in l_sample:
        query_params = [{
            "query_type": "between",
            "property_name": "post_id",
            "query_value": [post_id, post_id]
        }]
        lp = Loopy(self.url + 'socialMediaPosts', query_params)
        page = lp.get_next_page()
        if page is None:
            continue
        for doc in page:
            if 'primary_image_url' not in doc:
                continue
            imgs.add(doc['primary_image_url'])
            break

    return imgs
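# get_img_sum issues one query per sampled post id via a [id, id] "between"
# work-around (see the TODO above). If the socialMediaPosts endpoint accepts
# "inq" on post_id -- as get_text_sum below already assumes -- the loop could
# be collapsed into a single paged query. A sketch only; get_img_sum_batched
# is not part of the original code:
def get_img_sum_batched(self, cluster, max_sample=100):
    l_sample = cluster['similar_post_ids']
    if len(l_sample) > max_sample:
        l_sample = sample(l_sample, max_sample)

    query_params = [{
        "query_type": "inq",
        "property_name": "post_id",
        "query_value": l_sample
    }]
    lp = Loopy(self.url + 'socialMediaPosts', query_params)

    imgs = set()
    while True:
        page = lp.get_next_page()
        if page is None:
            break
        for doc in page:
            if doc.get('primary_image_url'):
                imgs.add(doc['primary_image_url'])
    return imgs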
def process_message(key, job):
    # get features:
    print 'FINDING SIMILARITY'

    # do the work to find similarity
    error = validate_job(job)
    if error is not None:
        print "Error in Job : {}".format(error)
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    image_similarity = ImageSimilarity(float(job['similarity_threshold']),
                                       job['start_time_ms'],
                                       job['end_time_ms'],
                                       job['similarity_method'])

    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }]

    if 'lang' in job:
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            if job['data_type'] == "text" and 'text_features' in doc and 'id' in doc and \
                    len(doc['text_features']) > 0:
                image_similarity.process_vector(doc['id'], doc['text_features'])
                continue
            if job['data_type'] == "image" and 'image_features' in doc and 'id' in doc and \
                    len(doc['image_features']) > 0:
                image_similarity.process_vector(doc['id'], doc['image_features'])

    clusters = image_similarity.get_clusters()
    print 'FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters))

    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        loopy.post_result(job['result_url'], cluster)

    job['data'] = image_similarity.to_json()
    job['state'] = 'processed'
def process_message(key, job):
    # 'featurizer' jobs have nothing to do here; mark them processed so the
    # pipeline can continue with the clustering stage.
    if 'type' in job and job['type'] == 'featurizer':
        job['state'] = 'processed'
        return

    print 'Checking Parameters'
    err_check(job)
    if job['state'] == 'error':
        return

    print 'FINDING SIMILARITY'
    hash_clust = HashtagClusters(float(job['min_post']))

    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }, {
        "query_type": "neq",
        "property_name": "hashtags",
        "query_value": "null"
    }]

    if 'lang' in job:
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['hashtags'])

    print 'FINISHED SIMILARITY PROCESSING'

    for k, v in hash_clust.hash_groups.iteritems():
        cluster = {
            'term': k,
            'post_ids': v,
            'job_monitor_id': job['job_id']
        }
        loopy.post_result(job['result_url'], cluster)

    job['data'] = hash_clust.to_json()
    job['state'] = 'processed'
def match_and_create_event(com, job):
    print 'Matching community: ', com['keywords'], com['hashtags']
    events_url = '{}events'.format(job['api_root'])

    # get prior events: end_time == job.start_time - 1ms
    query_params = [{
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': int(job['start_time']) - 1
    }]
    loopy = Loopy(events_url, query_params)

    # if no events in prior window, create new event
    if loopy.result_count == 0:
        print 'no prior event found'
        create_event(loopy, com)
        return

    matched_event, match_score = None, 0
    while True:
        page = loopy.get_next_page()
        if page is None:
            break
        for event in page:
            # score this com against each event, eventually take highest
            score = dot_comparison(com, event)
            print 'score: {}'.format(score)
            if score > match_score:
                match_score = score
                matched_event = event

    # if the best score is above threshold, link the new event to its match
    if match_score >= MIN_MATCH_SCORE:
        com['sibling_id'] = matched_event['id']
    create_event(loopy, com)
def match_and_create_event(com, job):
    print 'Matching community: ', com['keywords'], com['hashtags']
    events_url = '{}events'.format(job['api_root'])

    # get prior events: end_time == job.start_time - 1ms
    query_params = [{
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': int(job['start_time']) - 1
    }]
    loopy = Loopy(events_url, query_params)

    # if no events in prior window, create new event
    if loopy.result_count == 0:
        print 'no prior event found'
        create_event(loopy, com)
        return

    matched_event, match_score = None, 0
    while True:
        page = loopy.get_next_page()
        if page is None:
            break
        for event in page:
            # score this com against each event, eventually take highest
            score = math_utils.dot_comparison(com, event)
            print 'score: {}'.format(score)
            if score > match_score:
                match_score = score
                matched_event = event

    # if the best score is above threshold, link the new event to its match
    if match_score >= MIN_MATCH_SCORE:
        com['sibling_id'] = matched_event['id']
    create_event(loopy, com)
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    query_url = os.getenv('QUERY_URL', job['query_url'])
    result_url = os.getenv('RESULT_URL', job['result_url'])

    query_params = [{
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': job['end_time_ms']
    }]

    print 'BEGIN LINKING CLUSTERS'
    linker = ClusterLinker(job.get('min_overlap', 0.6))
    loopy = Loopy(query_url, query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            linker.add_cluster(doc)

    print 'FINISHED LINKING CLUSTERS'

    for link in linker.get_links():
        loopy.post_result(result_url, link)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
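# The "while True: page = loopy.get_next_page()" scroll loop recurs in every
# worker above and below. A minimal sketch of how it could be factored into a
# generator, assuming only that get_next_page() returns None once the result
# set is exhausted; the helper name scroll_pages is illustrative, not part of
# the original code:
def scroll_pages(loopy):
    """Yield each document from a Loopy query, page by page."""
    while True:
        page = loopy.get_next_page()
        if page is None:
            break
        for doc in page:
            yield doc

# Usage sketch:
#     for doc in scroll_pages(loopy):
#         linker.add_cluster(doc)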
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']

    if api_root[-1] != '/':
        api_root += '/'
    job['api_root'] = api_root

    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]

    com = Louvaine(api_root, '{}geocoder/forward-geo'.format(api_root))

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root), query_params, page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)

    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(
        len(com.graph.nodes()), len(com.graph.edges()))

    l_com = save_communities(com, job)

    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url"
        print kafka_url
        print "kafka_topic"
        print kafka_topic
        from event_to_kafka import stream_events
        stream_events(l_com.values(), kafka_url, kafka_topic)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
def get_text_sum(self, cluster, r_o):
    n_posts = len(cluster['similar_post_ids'])
    l_sample = cluster['similar_post_ids']
    if n_posts > 30:
        l_sample = sample(cluster['similar_post_ids'], 30)
        n_posts = 30

    words = {}
    places = []
    websites = set()

    r_o["campaigns"]["total"] += n_posts

    # TODO: fix query type once S.L. is fixed
    query_params = [{
        "query_type": "inq",
        "property_name": "post_id",
        "query_value": l_sample
    }]

    lp = Loopy(self.url + 'socialMediaPosts', query_params)
    page = lp.get_next_page()
    if page is None:
        return

    for doc in page:
        if doc['featurizer'] != cluster['data_type']:
            continue

        # tally campaign ids seen on the sampled posts
        if 'campaigns' in doc:
            for cam in doc['campaigns']:
                if cam in r_o["campaigns"]["ids"]:
                    r_o["campaigns"]["ids"][cam] += 1
                else:
                    r_o["campaigns"]["ids"][cam] = 1

        # extract location entities and geocode the first hit for each
        locs = self.ent_ext.extract(doc['text'], tag='I-LOC')
        for loc in locs:
            print 'Location:', loc.encode('utf-8')
            try:
                geos = Loopy.post(self.geo_url, json={'address': loc})
                for place in geos:
                    places.append(place)
                    break
            except Exception as e:
                print "error getting locations from geocoder...continuing.", e
                traceback.print_exc()

        tokens = [w for w in self.sf.pres_tokenize(doc['text'], doc['lang'])
                  if w not in stop_list]
        for word in tokens:
            if word[0] == '#':
                continue
            if word[0] == '@':
                continue
            if word[:4] == 'http':
                websites.add(word)
                continue
            if word[:3] == 'www':
                websites.add('http://' + word)
                continue
            if word in words:
                words[word] += 1
            else:
                words[word] = 1

    # keep only terms that occur at least five times
    for k, v in words.iteritems():
        k = remove_punctuation(k)
        if v < 5:
            continue
        if k in r_o['keywords']:
            r_o['keywords'][k] += v
        else:
            r_o['keywords'][k] = v

    for place in places:
        if type(place) is not dict:
            continue
        place_name = ''
        weight = 0.0
        if 'city' in place:
            place_name = place['city'] + ' '
            weight += 1
        if 'state' in place:
            place_name += place['state'] + ' '
            weight += .1
        if 'country' in place:
            place_name += ' ' + place['country'] + ' '
            weight += .05
        if place_name in r_o['location']:
            r_o['location'][place_name]['weight'] += weight
        else:
            r_o['location'][place_name] = {
                "type": "inferred point",
                "geo_type": "point",
                "coords": [{
                    "lat": place['latitude'],
                    "lng": place['longitude']
                }],
                "weight": weight
            }

    # drop locations whose accumulated weight falls below the threshold
    r_o['location'] = dict((k, v) for k, v in r_o['location'].items()
                           if v['weight'] >= self.geo_threshold)

    for url in list(websites):
        r_o['urls'].add(url)
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']
    geo_threshold = 5.0 if 'geo_threshold' not in job else float(job['geo_threshold'])

    if api_root[-1] != '/':
        api_root += '/'
    job['api_root'] = api_root

    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]

    com = Louvaine(api_root, '{}geocoder/forward-geo'.format(api_root), geo_threshold)

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root), query_params, page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)

    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(
        len(com.graph.nodes()), len(com.graph.edges()))

    l_com = save_communities(com, job)

    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url"
        print kafka_url
        print "kafka_topic"
        print kafka_topic
        from event_to_kafka import stream_events
        stream_events(l_com.values(), job)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
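# The pruning step above is quadratic: for every invalid node it rescans the
# whole edge list, then removes matches one by one with list.remove(). A
# minimal sketch of an equivalent single-pass filter (same inputs, same
# result; purely illustrative, not part of the original code):
def prune_invalid_edges(edges_to_add, invalid_nodes):
    """Keep only edges whose source and target are both valid nodes."""
    return [edge for edge in edges_to_add
            if edge['source'] not in invalid_nodes
            and edge['target'] not in invalid_nodes]

# Usage sketch:
#     edges_to_add = prune_invalid_edges(edges_to_add, invalid_nodes)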
def process_message(key, job):
    print 'FINDING SIMILARITY'

    error = validate_job(job)
    if error:
        print 'Error in Job : {}'.format(error)
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    feature_similarity = FeatureSimilarity(float(job['similarity_threshold']),
                                           job['start_time_ms'],
                                           job['end_time_ms'])

    query_params = [{
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']],
        'featurizer': job['data_type']
    }]

    if 'lang' in job:
        query_params.append({
            'query_type': 'where',
            'property_name': 'lang',
            'query_value': job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            if job['data_type'] == 'text' and 'text_features' in doc and 'id' in doc and \
                    len(doc['text_features']) > 0:
                feature_similarity.process_vector(doc['id'], doc['post_id'],
                                                  doc['text_features'])
                continue
            if job['data_type'] == 'image' and 'image_features' in doc and 'id' in doc and \
                    len(doc['image_features']) > 0 and 'primary_image_url' in doc and \
                    doc['primary_image_url']:
                feature_similarity.process_vector(doc['id'], doc['post_id'],
                                                  doc['image_features'],
                                                  doc['primary_image_url'])

    clusters = feature_similarity.get_clusters()

    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        cluster['data_type'] = job['data_type']
        loopy.post_result(job['result_url'], cluster)

    job['data'] = feature_similarity.to_json()
    job['state'] = 'processed'

    print 'FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters))
def process_message(key, job): print "absentfriends/main.py:process_message" + repr((key, job)) # if type == 'featurizer', immediately process and return b/c domains # are not featurized. allows system to continue with clustering process. if job.get('type') == 'featurizer': job['state'] = 'processed' job['data'] = [] return err_check(job) if job['state'] == 'error': return query_url = os.getenv('QUERY_URL', job['query_url']) result_url = os.getenv('RESULT_URL', job['result_url']) print 'FINDING SIMILARITY' print 'min_post set to %s' % job['min_post'] domain_clust = DomainClusters(float(job['min_post']), query_url, job['start_time_ms']) query_params = [{ "query_type": "between", "property_name": "timestamp_ms", "query_value": [job['start_time_ms'], job['end_time_ms']] }, { "query_type": "where", "property_name": "featurizer", "query_value": "domain" }] loopy = Loopy(query_url, query_params) if loopy.result_count == 0: print "No data to process" job['data'] = [] job['error'] = "No data found to process." job['state'] = 'error' return while True: print "Scrolling...{}".format(loopy.total_returned) page = loopy.get_next_page() if page is None: break # Do something with the obtained page for doc in page: domains = get_domains(doc['image_urls']) if len(domains) > 0: domain_clust.process_vector(doc['id'], doc['post_id'], domains) if int(os.getenv('TRUNCATE_POSTS') or 0): print 'Truncating posts...' print truncate_posts(feature_similarity.get_clusters_to_delete(), loopy) else: print 'Skipping truncate posts because TRUNCATE_POSTS env var is not set...' print 'FINISHED SIMILARITY PROCESSING' for k, v in domain_clust.get_clusters().iteritems(): cluster = {} cluster['id'] = str(uuid.uuid4()) cluster['term'] = k cluster['similar_ids'] = v['similar_ids'] cluster['similar_post_ids'] = v['similar_post_ids'] cluster['job_monitor_id'] = job['job_id'] cluster['start_time_ms'] = job['start_time_ms'] cluster['end_time_ms'] = job['end_time_ms'] cluster['stats'] = v['stats'] cluster['data_type'] = 'domain' try: loopy.post_result(result_url, cluster) except Exception as e: # TODO: we should set data = None when error. job['data'] = [] job['state'] = 'error' job['error'] = e break else: # no errors job['data'] = domain_clust.to_json() job['state'] = 'processed'
import sys

sys.path.append('/Users/jgartner/projects/xdata/QCR/watchman/services/dr-manhattan/')
from louvaine import Louvaine

query_params = [{
    "query_type": "between",
    "property_name": "end_time_ms",
    "query_value": [1477580423000, 1477780423000]
}]

lp_n = Loopy('http://localhost:3003/api/postsClusters', query_params)

com = Louvaine('http://localhost:3003/api/',
               'http://54.89.54.199:3003/api/extract/entities',
               'http://54.89.54.199:3003/api/geocoder/forward-geo')

while True:
    page = lp_n.get_next_page()
    if page is None:
        break
    for doc in page:
        com.add_node(doc)

lp_e = Loopy('http://localhost:3003/api/clusterLinks', query_params)
while True:
    page = lp_e.get_next_page()
    if page is None:
        break
    for doc in page:
        com.add_edge(doc)

com.save_communities()
#com.get_communities()
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    print 'FINDING SIMILARITY'
    print 'min_post set to %s' % job['min_post']

    hash_clust = HashtagClusters(float(job['min_post']), job['result_url'],
                                 job['start_time_ms'])

    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }, {
        "query_type": "where",
        "property_name": "featurizer",
        "query_value": "hashtag"
    }, {
        "query_type": "neq",
        "property_name": "hashtags",
        "query_value": "null"
    }]

    if 'lang' in job:
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['post_id'], doc['hashtags'])

    print 'FINISHED SIMILARITY PROCESSING'

    for k, v in hash_clust.get_clusters().iteritems():
        cluster = {
            'id': str(uuid.uuid4()),
            'term': k,
            'similar_ids': v['similar_ids'],
            'similar_post_ids': v['similar_post_ids'],
            'job_monitor_id': job['job_id'],
            'start_time_ms': job['start_time_ms'],
            'end_time_ms': job['end_time_ms'],
            'stats': v['stats'],
            'data_type': 'hashtag'
        }
        try:
            loopy.post_result(job['result_url'], cluster)
        except Exception as e:
            # TODO: we should set data = None when error.
            job['data'] = []
            job['state'] = 'error'
            job['error'] = e
            break
    else:  # no errors
        job['data'] = hash_clust.to_json()
        job['state'] = 'processed'