Example #1
# Note: this snippet relies on names defined at module level and not shown
# here: json, Loopy, Louvaine, err_check and save_communities.
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']

    if api_root[-1] != '/':
        api_root += '/'
    job['api_root'] = api_root

    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]
    com = Louvaine(api_root, '{}geocoder/forward-geo'.format(api_root))

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root),
                 query_params,
                 page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)
    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(
        len(com.graph.nodes()), len(com.graph.edges()))
    l_com = save_communities(com, job)
    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url"
        print kafka_url
        print "kafka_topic"
        print kafka_topic
        from event_to_kafka import stream_events
        stream_events(l_com.values(), kafka_url, kafka_topic)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
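
The page-draining loop in Example #1 is a recurring Loopy pattern: construct the client with query params and a page_size, check result_count, then call get_next_page until it returns None. Below is a minimal sketch of that pattern pulled out into a helper generator; it uses only the Loopy calls visible above, and iter_cluster_links itself is a hypothetical helper name, not part of the project.

from loopy import Loopy

def iter_cluster_links(api_root, ts_end, page_size=500):
    # Yield every clusterLinks document whose end_time_ms matches ts_end.
    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]
    lp = Loopy('{}clusterLinks'.format(api_root), query_params, page_size=page_size)
    if lp.result_count == 0:
        return  # nothing matched; the caller decides how to flag an empty result
    while True:
        page = lp.get_next_page()
        if page is None:
            break
        for doc in page:
            yield doc

With a helper like this, the caller could build nodes_to_lookup and edges_to_add from the yielded documents without tracking the paging state inline.
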
Example #2
from event_to_kafka import stream_events
from loopy import Loopy
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("api_root", type=str, help="API root URL")
    parser.add_argument("-kafka_url", type=str, help="If writing events to kafka, specify url (default=None)", default=None)
    parser.add_argument("-kafka_topic", type=str, help="If writing event to kafka, specify topic (default=None)", default=None)
    parser.add_argument("--debug", help="Switch on for debugging", action='store_true')
    args = parser.parse_args()

    print "Debug:", args.debug

    print "Requesting Events"
    print "api_root = {}".format(args.api_root)
    r = Loopy.get("{}/api/events".format(args.api_root))

    stream_events(r, args.kafka_url, args.kafka_topic, debug=args.debug)

Example #3
File: main.py  Project: Sotera/watchman
# Note: this snippet relies on names defined at module level in main.py and
# not shown here: json, Loopy, Louvaine, err_check and save_communities.
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']

    geo_threshold = 5.0 if 'geo_threshold' not in job else float(job['geo_threshold'])

    if api_root[-1] != '/':
        api_root += '/'
    job['api_root'] = api_root

    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]
    com = Louvaine(api_root,
                   '{}geocoder/forward-geo'.format(api_root),
                   geo_threshold)

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root), query_params, page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)
    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(len(com.graph.nodes()), len(com.graph.edges()))
    l_com = save_communities(com, job)
    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url"
        print kafka_url
        print "kafka_topic"
        print kafka_topic
        from event_to_kafka import stream_events
        stream_events(l_com.values(), job)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
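
The pruning step above scans edges_to_add once per invalid node and then removes matches with list.remove, which gets slow when both collections are large. Below is a hedged alternative sketch that keeps the same surviving edges in a single pass; it only assumes the edge documents carry 'source' and 'target' keys, as they do in the examples, and prune_invalid_edges is a hypothetical helper name.

def prune_invalid_edges(edges_to_add, invalid_nodes):
    # Keep only the edges whose source and target both survived validation.
    return [edge for edge in edges_to_add
            if edge['source'] not in invalid_nodes
            and edge['target'] not in invalid_nodes]
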
Example #4
from event_to_kafka import stream_events
from loopy import Loopy
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("api_root", type=str, help="API root URL")
    parser.add_argument(
        "-kafka_url",
        type=str,
        help="If writing events to kafka, specify url (default=None)",
        default=None)
    parser.add_argument(
        "-kafka_topic",
        type=str,
        help="If writing event to kafka, specify topic (default=None)",
        default=None)
    parser.add_argument("--debug",
                        help="Switch on for debugging",
                        action='store_true')
    args = parser.parse_args()

    print "Debug:", args.debug

    print "Requesting Events"
    print "api_root = {}".format(args.api_root)
    r = Loopy.get("{}/api/events".format(args.api_root))

    stream_events(r, args.kafka_url, args.kafka_topic, debug=args.debug)