import json


def process_message(key, job):
    """Build a graph of cluster links pulled from the API, drop clusters whose
    stats flag them as invalid, find communities in the remaining graph with
    Louvaine, save those communities, and optionally stream the results to
    Kafka.

    Assumes err_check, Louvaine, Loopy, and save_communities are provided by
    the surrounding module.
    """
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']
    if api_root[-1] != '/':
        api_root += '/'
    job['api_root'] = api_root

    # Restrict the clusterLinks query to the job's end time.
    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]

    com = Louvaine(api_root, '{}geocoder/forward-geo'.format(api_root))

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root), query_params, page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        # Clusters whose stats report is_unlikely == 0 are treated as invalid
        # and kept out of the graph.
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)
    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    # Release the intermediate collections before the heavier community step.
    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(
        len(com.graph.nodes()), len(com.graph.edges()))

    l_com = save_communities(com, job)

    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url:", kafka_url
        print "kafka_topic:", kafka_topic
        # Imported here so the Kafka dependency is only needed when streaming
        # is requested.
        from event_to_kafka import stream_events
        stream_events(l_com.values(), kafka_url, kafka_topic)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
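# Usage sketch (illustrative, not part of the source): the shape of a job dict
# this worker expects, inferred from the fields read above. Every value below
# is a placeholder, and the queue machinery that normally delivers (key, job)
# pairs is assumed.
if __name__ == "__main__":
    example_job = {
        'state': 'new',
        'api_root': 'http://localhost:3003',   # placeholder API root
        'end_time': 1461100800000,             # placeholder end_time_ms cutoff
        'kafka_url': 'localhost:9092',         # optional: enables streaming
        'kafka_topic': 'communities'           # optional: enables streaming
    }
    process_message('example-key', example_job)
    print "job finished with state:", example_job['state']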
from event_to_kafka import stream_events
from loopy import Loopy
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("api_root", type=str, help="API root URL")
    parser.add_argument("-kafka_url", type=str,
                        help="If writing events to kafka, specify url (default=None)",
                        default=None)
    parser.add_argument("-kafka_topic", type=str,
                        help="If writing events to kafka, specify topic (default=None)",
                        default=None)
    parser.add_argument("--debug", help="Switch on for debugging",
                        action='store_true')
    args = parser.parse_args()

    print "Debug:", args.debug
    print "Requesting Events"
    print "api_root = {}".format(args.api_root)

    # Fetch the events from the API and hand them to the Kafka streaming helper.
    r = Loopy.get("{}/api/events".format(args.api_root))
    stream_events(r, args.kafka_url, args.kafka_topic, debug=args.debug)
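# Example invocation (illustrative; the script name, host, broker, and topic
# below are placeholders rather than values taken from this repo):
#   python <this_script>.py http://localhost:3003 -kafka_url localhost:9092 -kafka_topic events --debug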
import json


def process_message(key, job):
    """Build a graph of cluster links pulled from the API, drop clusters whose
    stats flag them as invalid, find communities in the remaining graph with
    Louvaine, save those communities, and optionally stream the results to
    Kafka. Louvaine is configured with a geographic threshold taken from the
    job (default 5.0).
    """
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']
    # Geographic threshold for Louvaine; defaults to 5.0 when the job does not
    # specify one.
    geo_threshold = float(job.get('geo_threshold', 5.0))
    if api_root[-1] != '/':
        api_root += '/'
    job['api_root'] = api_root

    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]

    com = Louvaine(api_root,
                   '{}geocoder/forward-geo'.format(api_root),
                   geo_threshold)

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root), query_params, page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)
    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(
        len(com.graph.nodes()), len(com.graph.edges()))

    l_com = save_communities(com, job)

    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url:", kafka_url
        print "kafka_topic:", kafka_topic
        from event_to_kafka import stream_events
        # The Kafka settings live in the job dict, which stream_events
        # receives directly here.
        stream_events(l_com.values(), job)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
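# Helper sketch (not in the source): the paging pattern used above -- calling
# get_next_page() until it returns None -- factored into a small generator.
# It relies only on the Loopy behaviour visible in this file.
def iter_pages(loopy_client):
    """Yield successive pages from a Loopy instance until it is exhausted."""
    while True:
        page = loopy_client.get_next_page()
        if page is None:
            return
        yield page

# With the helper, the cluster-link loop could be written as:
#   for page in iter_pages(lp_e):
#       for doc in page:
#           ...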