示例#1
0
    def get_priors(self, term):
        """Compute prior counts ``(alpha, beta)`` for a hashtag term.

        Queries ``self.url`` with three filters: ``term`` matches the
        UTF-8 encoded, URL-quoted ``term``; ``data_type`` is ``"hashtag"``;
        and ``end_time_ms`` is passed as a ``between`` filter over
        ``[self.start_ms - self.prior_ms, self.start_ms]``.

        Returns:
            tuple ``(alpha, beta)``.  ``alpha`` starts at 0.00001 and grows by
            the number of ``similar_post_ids`` of every returned record;
            ``beta`` starts at 1 and grows by each record's
            ``stats.total_posts``.
        """
        window_start = self.start_ms - self.prior_ms
        term = unicode(term)
        quoted_term = urllib.quote(term.encode('utf-8'), ':/')

        term_filter = {
            "query_type": "where",
            "property_name": "term",
            "query_value": quoted_term
        }
        type_filter = {
            "query_type": "where",
            "property_name": "data_type",
            "query_value": "hashtag"
        }
        window_filter = {
            "query_type": "between",
            "property_name": "end_time_ms",
            "query_value": [window_start, self.start_ms]
        }
        lp = Loopy(self.url, [term_filter, type_filter, window_filter])

        # Start from weak defaults so that real data dominates the priors.
        alpha, beta = 0.00001, 1
        page = lp.get_next_page()
        while page is not None:
            for record in page:
                alpha += len(record['similar_post_ids'])
                beta += record['stats']['total_posts']
            page = lp.get_next_page()
        return (alpha, beta)
示例#2
0
    def get_img_sum(self, cluster):
        """Collect primary image URLs for the posts of a cluster.

        At most 100 of ``cluster['similar_post_ids']`` are looked up (drawn
        with ``sample`` when the cluster has more).  Each post id is queried
        on its own under ``self.url + 'socialMediaPosts'`` and only the first
        returned document carrying ``primary_image_url`` contributes.

        Returns:
            set of the collected ``primary_image_url`` values.
        """
        post_ids = cluster['similar_post_ids']
        if len(post_ids) > 100:
            post_ids = sample(cluster['similar_post_ids'], 100)

        image_urls = set()

        # TODO: fix query type once S.L. is fixed
        for post_id in post_ids:
            # A degenerate "between" range stands in for an equality filter.
            single_post_query = [{
                "query_type": "between",
                "property_name": "post_id",
                "query_value": [post_id, post_id]
            }]
            lp = Loopy(self.url + 'socialMediaPosts', single_post_query)
            page = lp.get_next_page()
            if page is None:
                continue
            for doc in page:
                if 'primary_image_url' in doc:
                    image_urls.add(doc['primary_image_url'])
                    # One image per post id is enough.
                    break

        return image_urls
示例#3
0
def process_message(key, job):
    """Cluster the feature vectors of a job's time window and post the clusters.

    The job is validated with ``validate_job``; on a validation error, or
    when the query returns no results, ``job`` gets ``state='error'``, an
    ``error`` message and an empty ``data`` list.  Otherwise every document
    with a usable feature vector for ``job['data_type']`` (``"text"`` or
    ``"image"``) is fed to ``ImageSimilarity``, each resulting cluster is
    posted to ``job['result_url']`` and the job is marked ``'processed'``.

    Args:
        key: not used by this function.
        job: job dict, updated in place.
    """
    print('FINDING SIMILARITY')
    error = validate_job(job)
    if error is not None:
        print("Error in Job : {}".format(error))
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    image_similarity = ImageSimilarity(
        float(job['similarity_threshold']),
        job['start_time_ms'],
        job['end_time_ms'],
        job['similarity_method'])

    time_window = {
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }
    query_params = [time_window]
    if 'lang' in job:
        # The language filter is optional.
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })
    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print("No data to process")
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    def scrolled_pages():
        # Yield result pages until Loopy signals the end with None.
        while True:
            print("Scrolling...{}".format(loopy.current_page))
            next_page = loopy.get_next_page()
            if next_page is None:
                return
            yield next_page

    for page in scrolled_pages():
        for doc in page:
            data_type = job['data_type']
            if data_type == "text":
                feature_key = 'text_features'
            elif data_type == "image":
                feature_key = 'image_features'
            else:
                continue
            # Only documents with an id and a non-empty vector are clustered.
            if feature_key in doc and 'id' in doc and len(doc[feature_key]) > 0:
                image_similarity.process_vector(doc['id'], doc[feature_key])

    clusters = image_similarity.get_clusters()

    print('FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters)))
    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        loopy.post_result(job['result_url'], cluster)
    job['data'] = image_similarity.to_json()
    job['state'] = 'processed'
示例#4
0
def process_message(key, job):
    """Group the posts of a job's time window by hashtag and post each group.

    Jobs whose ``type`` is ``'featurizer'`` are marked ``'processed'``
    immediately.  Otherwise the job is checked with ``err_check``; documents
    with non-null ``hashtags`` inside the time window (optionally filtered by
    ``lang``) are fed to ``HashtagClusters`` and every entry of its
    ``hash_groups`` is posted to ``job['result_url']``.

    Args:
        key: not used by this function.
        job: job dict, updated in place (``state``, ``data`` and, when the
            query returns nothing, ``error``).
    """
    if job.get('type') == 'featurizer':
        job['state'] = 'processed'
        return

    print('Checking Parameters')
    err_check(job)
    if job['state'] == 'error':
        return

    print('FINDING SIMILARITY')
    hash_clust = HashtagClusters(float(job['min_post']))

    time_window = {
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }
    has_hashtags = {
        "query_type": "neq",
        "property_name": "hashtags",
        "query_value": "null"
    }
    query_params = [time_window, has_hashtags]
    if 'lang' in job:
        # The language filter is optional.
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })
    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print("No data to process")
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    def scrolled_pages():
        # Yield result pages until Loopy signals the end with None.
        while True:
            print("Scrolling...{}".format(loopy.current_page))
            next_page = loopy.get_next_page()
            if next_page is None:
                return
            yield next_page

    for page in scrolled_pages():
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['hashtags'])

    print('FINISHED SIMILARITY PROCESSING')
    for term, post_ids in hash_clust.hash_groups.iteritems():
        cluster = {
            'term': term,
            'post_ids': post_ids,
            'job_monitor_id': job['job_id']
        }
        loopy.post_result(job['result_url'], cluster)

    job['data'] = hash_clust.to_json()
    job['state'] = 'processed'
示例#5
0
def match_and_create_event(com, job):
    print 'Matching community: ', com['keywords'], com['hashtags']

    events_url = '{}events'.format(job['api_root'])

    # get prior events: end_time == job.start_time - 1ms
    query_params = [{
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': int(job['start_time']) - 1
    }]

    loopy = Loopy(events_url, query_params)

    # if no events in prior window, create new event
    if loopy.result_count == 0:
        print 'no prior event found'
        create_event(loopy, com)
        return

    matched_event, match_score = None, 0

    while True:
        page = loopy.get_next_page()

        if page is None:
            break

        for event in page:
            # score this com against each event, eventually take highest
            score = dot_comparison(com, event)
            print 'score: {}'.format(score)
            if score > match_score:
                match_score = score
                matched_event = event

    # is score above threshold?
    # then add link to new event
    if match_score >= MIN_MATCH_SCORE:
        com['sibling_id'] = matched_event.id

    create_event(loopy, com)
示例#6
0
def match_and_create_event(com, job):
    print 'Matching community: ', com['keywords'], com['hashtags']

    events_url = '{}events'.format(job['api_root'])

    # get prior events: end_time == job.start_time - 1ms
    query_params = [{
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': int(job['start_time']) - 1
    }]

    loopy = Loopy(events_url, query_params)

    # if no events in prior window, create new event
    if loopy.result_count == 0:
        print 'no prior event found'
        create_event(loopy, com)
        return

    matched_event, match_score = None, 0

    while True:
        page = loopy.get_next_page()

        if page is None:
            break

        for event in page:
            # score this com against each event, eventually take highest
            score = math_utils.dot_comparison(com, event)
            print 'score: {}'.format(score)
            if score > match_score:
                match_score = score
                matched_event = event

    # is score above threshold?
    # then add link to new event
    if match_score >= MIN_MATCH_SCORE:
        com['sibling_id'] = matched_event.id

    create_event(loopy, com)
示例#7
0
def process_message(key, job):
    """Link the clusters that share an end time and post every link.

    After ``err_check``, clusters whose ``end_time_ms`` matches
    ``job['end_time_ms']`` are read from the query URL and added to a
    ``ClusterLinker`` (minimum overlap ``job['min_overlap']``, default 0.6).
    Each link it reports is posted to the result URL.  The environment
    variables ``QUERY_URL`` / ``RESULT_URL`` take precedence over the URLs
    carried by the job.

    Args:
        key: not used by this function.
        job: job dict, updated in place (``state``, ``data`` and, when the
            query returns nothing, ``error``).
    """
    err_check(job)
    if job['state'] == 'error':
        return

    query_url = os.getenv('QUERY_URL', job['query_url'])
    result_url = os.getenv('RESULT_URL', job['result_url'])

    end_time_filter = {
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': job['end_time_ms']
    }

    print('BEGIN LINKING CLUSTERS')
    linker = ClusterLinker(job.get('min_overlap', 0.6))
    loopy = Loopy(query_url, [end_time_filter])

    if loopy.result_count == 0:
        print('No data to process')
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    # Pull pages until Loopy signals the end with None.
    print('Scrolling...{}'.format(loopy.current_page))
    page = loopy.get_next_page()
    while page is not None:
        for cluster_doc in page:
            linker.add_cluster(cluster_doc)
        print('Scrolling...{}'.format(loopy.current_page))
        page = loopy.get_next_page()

    print('FINISHED LINKING CLUSTERS')
    for link in linker.get_links():
        loopy.post_result(result_url, link)

    # Nothing needs to be stored on the job itself.
    job['data'] = json.dumps({})
    job['state'] = 'processed'
示例#8
0
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']

    if api_root[-1] != '/': api_root += '/'
    job['api_root'] = api_root

    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]
    com = Louvaine(api_root, '{}geocoder/forward-geo'.format(api_root))

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root),
                 query_params,
                 page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)
    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(
        len(com.graph.nodes()), len(com.graph.edges()))
    l_com = save_communities(com, job)
    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url"
        print kafka_url
        print "kafka_topic"
        print kafka_topic
        from event_to_kafka import stream_events
        stream_events(l_com.values(), kafka_url, kafka_topic)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
示例#9
0
    def get_text_sum(self, cluster, r_o):
        n_posts = len(cluster['similar_post_ids'])
        l_sample = cluster['similar_post_ids']
        if n_posts > 30:
            l_sample = sample(cluster['similar_post_ids'], 30)
            n_posts = 30

        words = {}
        places = []
        websites = set([])
        r_o["campaigns"]["total"] += n_posts

        #TODO: fix query type once S.L. is fixed
        query_params = [{
            "query_type": "inq",
            "property_name": "post_id",
            "query_value": l_sample
        }]
        lp = Loopy(self.url + 'socialMediaPosts', query_params)
        page = lp.get_next_page()
        if page is None:
            return

        for doc in page:
            if doc['featurizer'] != cluster['data_type']:
                continue

            if 'campaigns' in doc:
                for cam in doc['campaigns']:
                    if cam in r_o["campaigns"]["ids"]:
                        r_o["campaigns"]["ids"][cam] += 1
                    else:
                        r_o["campaigns"]["ids"][cam] = 1

            locs = self.ent_ext.extract(doc['text'], tag='I-LOC')
            for loc in locs:
                print 'Location:', loc.encode('utf-8')
                try:
                    geos = Loopy.post(self.geo_url, json={'address': loc})
                    for place in geos:
                        places.append(place)
                        break
                except Exception as e:
                    print "error getting locations from geocoder...continuing.", e
                    traceback.print_exc()

            tokens = [
                w for w in self.sf.pres_tokenize(doc['text'], doc['lang'])
                if w not in stop_list
            ]
            for word in tokens:
                if word[0] == '#':
                    continue
                if word[0] == '@':
                    continue
                if word[:4] == 'http':
                    websites.add(word)
                    continue
                if word[:3] == 'www':
                    websites.add('http://' + word)
                    continue
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1

        for k, v in words.iteritems():
            k = remove_punctuation(k)
            if v < 5:
                continue
            if v in r_o['keywords']:
                r_o['keywords'][k] += v
            else:
                r_o['keywords'][k] = v

        for place in places:
            if type(place) is not dict:
                continue
            place_name = ''
            weight = 0.0
            if 'city' in place.keys():
                place_name = place['city'] + ' '
                weight += 1
            if 'state' in place.keys():
                place_name += place['state'] + ' '
                weight += .1
            if 'country' in place.keys():
                place_name += ' ' + place['country'] + ' '
                weight += .05
            if place_name in r_o['location']:
                r_o['location'][place_name]['weight'] += weight
            else:
                r_o['location'][place_name] = {
                    "type":
                    "inferred point",
                    "geo_type":
                    "point",
                    "coords": [{
                        "lat": place['latitude'],
                        "lng": place['longitude']
                    }],
                    "weight":
                    weight
                }

        r_o['location'] = dict((k, v) for k, v in r_o['location'].items()
                               if v['weight'] >= self.geo_threshold)

        for url in list(websites):
            r_o['urls'].add(url)
示例#10
0
文件: main.py 项目: Sotera/watchman
def process_message(key,job):
    err_check(job)
    if job['state'] == 'error':
        return

    api_root = job['api_root']
    ts_end = job['end_time']

    geo_threshold = 5.0 if 'geo_threshold' not in job else float(job['geo_threshold'])

    if api_root[-1] != '/': api_root += '/'
    job['api_root'] = api_root

    query_params = [{
        "query_type": "where",
        "property_name": "end_time_ms",
        "query_value": ts_end
    }]
    com = Louvaine(api_root,
       '{}geocoder/forward-geo'.format(api_root), geo_threshold)

    nodes_to_lookup = set()
    nodes_to_add = list()
    edges_to_add = list()
    invalid_nodes = set()
    edges_to_remove = list()

    lp_e = Loopy('{}clusterLinks'.format(api_root), query_params, page_size=500)

    if lp_e.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    print "getting cluster links"
    while True:
        page = lp_e.get_next_page()
        if page is None:
            break
        for doc in page:
            nodes_to_lookup.add(doc["target"])
            nodes_to_lookup.add(doc["source"])
            edges_to_add.append(doc)

    print "getting node data"
    for node_id in nodes_to_lookup:
        clust_url = "{}{}{}".format(api_root, "postsClusters/", node_id)
        node = Loopy.get(clust_url)
        if 'stats' in node:
            if node['stats']['is_unlikely'] == 0:
                invalid_nodes.add(node_id)
                continue
        nodes_to_add.append(node)

    print "pruning invalid node edges"
    for node_id in invalid_nodes:
        for edge in edges_to_add:
            if edge['target'] == node_id or edge['source'] == node_id:
                edges_to_remove.append(edge)
    for invalid_edge in edges_to_remove:
        if invalid_edge in edges_to_add:
            edges_to_add.remove(invalid_edge)

    print "adding edges to louvaine"
    for edge in edges_to_add:
        com.add_edge(edge)

    print "adding nodes to louvaine"
    for node in nodes_to_add:
        com.add_node(node)

    invalid_nodes.clear()
    nodes_to_lookup.clear()
    del nodes_to_add
    del edges_to_add
    del edges_to_remove

    print "Finding communities from {} nodes and {} edges.".format(len(com.graph.nodes()), len(com.graph.edges()))
    l_com = save_communities(com, job)
    if 'kafka_url' in job and 'kafka_topic' in job:
        kafka_url = job['kafka_url']
        kafka_topic = job['kafka_topic']
        print "Sending events to kafka"
        print "kafka_url"
        print kafka_url
        print "kafka_topic"
        print kafka_topic
        from event_to_kafka import stream_events
        stream_events(l_com.values(), job)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
示例#11
0
def process_message(key, job):
    """Cluster the feature vectors of a job's time window and post the clusters.

    The job is validated with ``validate_job``; on a validation error, or
    when the query returns no results, ``job`` gets ``state='error'``, an
    ``error`` message and an empty ``data`` list.  Otherwise usable documents
    are fed to ``FeatureSimilarity``: for ``data_type`` ``'text'`` a document
    needs an ``id`` and non-empty ``text_features``; for ``'image'`` it needs
    an ``id``, non-empty ``image_features`` and a truthy
    ``primary_image_url``.  Every resulting cluster is tagged with the job id
    and data type and posted to ``job['result_url']``.

    Args:
        key: not used by this function.
        job: job dict, updated in place.
    """
    print('FINDING SIMILARITY')
    error = validate_job(job)
    if error:
        print('Error in Job : {}'.format(error))
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    feature_similarity = FeatureSimilarity(
        float(job['similarity_threshold']),
        job['start_time_ms'],
        job['end_time_ms'])

    time_window = {
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']],
        'featurizer': job['data_type']
    }
    query_params = [time_window]
    if 'lang' in job:
        # The language filter is optional.
        query_params.append({
            'query_type': 'where',
            'property_name': 'lang',
            'query_value': job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print('No data to process')
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    # Pull pages until Loopy signals the end with None.
    print('Scrolling...{}'.format(loopy.total_returned))
    page = loopy.get_next_page()
    while page is not None:
        for doc in page:
            data_type = job['data_type']
            if data_type == 'text':
                usable = ('text_features' in doc and 'id' in doc
                          and len(doc['text_features']) > 0)
                if usable:
                    feature_similarity.process_vector(
                        doc['id'], doc['post_id'], doc['text_features'])
            elif data_type == 'image':
                usable = ('image_features' in doc and 'id' in doc
                          and len(doc['image_features']) > 0
                          and 'primary_image_url' in doc
                          and doc['primary_image_url'])
                if usable:
                    feature_similarity.process_vector(
                        doc['id'], doc['post_id'], doc['image_features'],
                        doc['primary_image_url'])
        print('Scrolling...{}'.format(loopy.total_returned))
        page = loopy.get_next_page()

    clusters = feature_similarity.get_clusters()

    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        cluster['data_type'] = job['data_type']
        loopy.post_result(job['result_url'], cluster)

    job['data'] = feature_similarity.to_json()
    job['state'] = 'processed'

    print('FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters)))
示例#12
0
def process_message(key, job):
    print "absentfriends/main.py:process_message" + repr((key, job))
    # if type == 'featurizer', immediately process and return b/c domains
    # are not featurized. allows system to continue with clustering process.
    if job.get('type') == 'featurizer':
        job['state'] = 'processed'
        job['data'] = []
        return

    err_check(job)
    if job['state'] == 'error':
        return

    query_url = os.getenv('QUERY_URL', job['query_url'])
    result_url = os.getenv('RESULT_URL', job['result_url'])

    print 'FINDING SIMILARITY'
    print 'min_post set to %s' % job['min_post']
    domain_clust = DomainClusters(float(job['min_post']), query_url,
                                  job['start_time_ms'])

    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }, {
        "query_type": "where",
        "property_name": "featurizer",
        "query_value": "domain"
    }]

    loopy = Loopy(query_url, query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            domains = get_domains(doc['image_urls'])
            if len(domains) > 0:
                domain_clust.process_vector(doc['id'], doc['post_id'], domains)

    if int(os.getenv('TRUNCATE_POSTS') or 0):
        print 'Truncating posts...'
        print truncate_posts(feature_similarity.get_clusters_to_delete(),
                             loopy)
    else:
        print 'Skipping truncate posts because TRUNCATE_POSTS env var is not set...'

    print 'FINISHED SIMILARITY PROCESSING'
    for k, v in domain_clust.get_clusters().iteritems():
        cluster = {}
        cluster['id'] = str(uuid.uuid4())
        cluster['term'] = k
        cluster['similar_ids'] = v['similar_ids']
        cluster['similar_post_ids'] = v['similar_post_ids']
        cluster['job_monitor_id'] = job['job_id']
        cluster['start_time_ms'] = job['start_time_ms']
        cluster['end_time_ms'] = job['end_time_ms']
        cluster['stats'] = v['stats']
        cluster['data_type'] = 'domain'

        try:
            loopy.post_result(result_url, cluster)
        except Exception as e:
            # TODO: we should set data = None when error.
            job['data'] = []
            job['state'] = 'error'
            job['error'] = e
            break
    else:  # no errors
        job['data'] = domain_clust.to_json()
        job['state'] = 'processed'
示例#13
0
sys.path.append('/Users/jgartner/projects/xdata/QCR/watchman/services/dr-manhattan/')
from louvaine import Louvaine

# Both the cluster query and the link query use the same ``between`` filter
# on ``end_time_ms``.
query_params = [
    {
        "query_type": "between",
        "property_name": "end_time_ms",
        "query_value": [1477580423000, 1477780423000],
    },
]

lp_n = Loopy('http://localhost:3003/api/postsClusters', query_params)
com = Louvaine(
    'http://localhost:3003/api/',
    'http://54.89.54.199:3003/api/extract/entities',
    'http://54.89.54.199:3003/api/geocoder/forward-geo',
)

# Every posts cluster becomes a node; Loopy returns None after the last page.
page = lp_n.get_next_page()
while page is not None:
    for doc in page:
        com.add_node(doc)
    page = lp_n.get_next_page()

# Every cluster link becomes an edge.
lp_e = Loopy('http://localhost:3003/api/clusterLinks', query_params)
page = lp_e.get_next_page()
while page is not None:
    for doc in page:
        com.add_edge(doc)
    page = lp_e.get_next_page()

com.save_communities()
# Alternative entry point, currently disabled: com.get_communities()
示例#14
0
def process_message(key, job):
    """Cluster a time window's posts by hashtag and post the clusters.

    After ``err_check``, documents with featurizer ``"hashtag"`` and non-null
    ``hashtags`` inside the time window (optionally filtered by ``lang``) are
    fed to ``HashtagClusters``.  Every resulting cluster is given a fresh
    UUID and posted to ``job['result_url']``; posting stops at the first
    failure.

    Args:
        key: not used by this function.
        job: job dict, updated in place.  ``state`` becomes ``'processed'``
            on success, or ``'error'`` (with ``error`` and an empty ``data``)
            when no data is found or posting a cluster fails.
    """
    err_check(job)
    if job['state'] == 'error':
        return

    print('FINDING SIMILARITY')
    print('min_post set to %s' % job['min_post'])
    hash_clust = HashtagClusters(
        float(job['min_post']), job['result_url'], job['start_time_ms'])

    time_window = {
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }
    hashtag_featurizer = {
        "query_type": "where",
        "property_name": "featurizer",
        "query_value": "hashtag"
    }
    has_hashtags = {
        "query_type": "neq",
        "property_name": "hashtags",
        "query_value": "null"
    }
    query_params = [time_window, hashtag_featurizer, has_hashtags]

    if 'lang' in job:
        # The language filter is optional.
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })
    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print("No data to process")
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    # Pull pages until Loopy signals the end with None.
    print("Scrolling...{}".format(loopy.total_returned))
    page = loopy.get_next_page()
    while page is not None:
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['post_id'], doc['hashtags'])
        print("Scrolling...{}".format(loopy.total_returned))
        page = loopy.get_next_page()

    print('FINISHED SIMILARITY PROCESSING')
    post_failed = False
    for term, members in hash_clust.get_clusters().iteritems():
        cluster = {}
        cluster['id'] = str(uuid.uuid4())
        cluster['term'] = term
        cluster['similar_ids'] = members['similar_ids']
        cluster['similar_post_ids'] = members['similar_post_ids']
        cluster['job_monitor_id'] = job['job_id']
        cluster['start_time_ms'] = job['start_time_ms']
        cluster['end_time_ms'] = job['end_time_ms']
        cluster['stats'] = members['stats']
        cluster['data_type'] = 'hashtag'

        try:
            loopy.post_result(job['result_url'], cluster)
        except Exception as e:
            # TODO: we should set data = None when error.
            job['data'] = []
            job['state'] = 'error'
            job['error'] = e
            post_failed = True
            break

    if not post_failed:
        # Every cluster was posted without error.
        job['data'] = hash_clust.to_json()
        job['state'] = 'processed'