示例#1
0
def process_message(key, job):
    # get features:
    print 'FINDING SIMILARITY'
    # do the work to find similarity
    error = validate_job(job)
    if error is not None:
        print "Error in Job : {}".format(error)
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    image_similarity = ImageSimilarity(float(job['similarity_threshold']), job['start_time_ms'], job['end_time_ms'],
                                       job['similarity_method'])
    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }]
    if 'lang' in job.keys():
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })
    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            if job['data_type'] == "text" and 'text_features' in doc and 'id' in doc and \
                    len(doc['text_features']) > 0:
                image_similarity.process_vector(doc['id'], doc['text_features'])
                continue
            if job['data_type'] == "image" and 'image_features' in doc and 'id' in doc and \
                    len(doc['image_features']) > 0:
                image_similarity.process_vector(doc['id'], doc['image_features'])

    clusters = image_similarity.get_clusters()

    print 'FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters))
    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        loopy.post_result(job['result_url'], cluster)
    job['data'] = image_similarity.to_json()
    job['state'] = 'processed'
示例#2
0
def process_message(key, job):
    if 'type' in job and job['type'] == 'featurizer':
        job['state'] = 'processed'
        return

    print 'Checking Parameters'
    err_check(job)
    if job['state'] == 'error':
        return

    print 'FINDING SIMILARITY'
    hash_clust = HashtagClusters(float(job['min_post']))

    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }, {
        "query_type": "neq",
        "property_name": "hashtags",
        "query_value": "null"
    }]

    if 'lang' in job.keys():
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })
    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['hashtags'])

    print 'FINISHED SIMILARITY PROCESSING'
    for k, v in hash_clust.hash_groups.iteritems():
        cluster = {}
        cluster['term'] = k
        cluster['post_ids'] = v
        cluster['job_monitor_id'] = job['job_id']
        loopy.post_result(job['result_url'], cluster)

    job['data'] = hash_clust.to_json()
    job['state'] = 'processed'
示例#3
0
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    query_url = os.getenv('QUERY_URL', job['query_url'])
    result_url = os.getenv('RESULT_URL', job['result_url'])

    query_params = [{
        'query_type': 'where',
        'property_name': 'end_time_ms',
        'query_value': job['end_time_ms']
    }]

    print 'BEGIN LINKING CLUSTERS'
    linker = ClusterLinker(job.get('min_overlap', 0.6))
    loopy = Loopy(query_url, query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.current_page)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            linker.add_cluster(doc)

    print 'FINISHED LINKING CLUSTERS'
    for link in linker.get_links():
        loopy.post_result(result_url, link)

    job['data'] = json.dumps({})  # no need to save anything to job
    job['state'] = 'processed'
示例#4
0
def process_message(key, job):
    print 'FINDING SIMILARITY'
    error = validate_job(job)
    if error:
        print 'Error in Job : {}'.format(error)
        job['data'] = []
        job['error'] = error
        job['state'] = 'error'
        return

    feature_similarity = FeatureSimilarity(float(job['similarity_threshold']), job['start_time_ms'], job['end_time_ms'])
    query_params = [{
        'query_type': 'between',
        'property_name': 'timestamp_ms',
        'query_value': [job['start_time_ms'], job['end_time_ms']],
        'featurizer': job['data_type']
    }]
    if 'lang' in job:
        query_params.append({
            'query_type': 'where',
            'property_name': 'lang',
            'query_value': job['lang']
        })

    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print 'No data to process'
        job['data'] = []
        job['error'] = 'No data found to process.'
        job['state'] = 'error'
        return

    while True:
        print 'Scrolling...{}'.format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            if job['data_type'] == 'text' and 'text_features' in doc and 'id' in doc and \
                    len(doc['text_features']) > 0:
                feature_similarity.process_vector(doc['id'], doc['post_id'], doc['text_features'])
                continue
            if job['data_type'] == 'image' and 'image_features' in doc and 'id' in doc and \
                    len(doc['image_features']) > 0 and 'primary_image_url' in doc and \
                    doc['primary_image_url']:
                feature_similarity.process_vector(doc['id'], doc['post_id'], doc['image_features'],
                                                  doc['primary_image_url'])

    clusters = feature_similarity.get_clusters()

    for cluster in clusters:
        cluster['job_monitor_id'] = job['job_id']
        cluster['data_type'] = job['data_type']
        loopy.post_result(job['result_url'], cluster)

    job['data'] = feature_similarity.to_json()
    job['state'] = 'processed'

    print 'FINISHED SIMILARITY PROCESSING: found {} clusters'.format(len(clusters))
示例#5
0
def process_message(key, job):
    print "absentfriends/main.py:process_message" + repr((key, job))
    # if type == 'featurizer', immediately process and return b/c domains
    # are not featurized. allows system to continue with clustering process.
    if job.get('type') == 'featurizer':
        job['state'] = 'processed'
        job['data'] = []
        return

    err_check(job)
    if job['state'] == 'error':
        return

    query_url = os.getenv('QUERY_URL', job['query_url'])
    result_url = os.getenv('RESULT_URL', job['result_url'])

    print 'FINDING SIMILARITY'
    print 'min_post set to %s' % job['min_post']
    domain_clust = DomainClusters(float(job['min_post']), query_url,
                                  job['start_time_ms'])

    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }, {
        "query_type": "where",
        "property_name": "featurizer",
        "query_value": "domain"
    }]

    loopy = Loopy(query_url, query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            domains = get_domains(doc['image_urls'])
            if len(domains) > 0:
                domain_clust.process_vector(doc['id'], doc['post_id'], domains)

    if int(os.getenv('TRUNCATE_POSTS') or 0):
        print 'Truncating posts...'
        print truncate_posts(feature_similarity.get_clusters_to_delete(),
                             loopy)
    else:
        print 'Skipping truncate posts because TRUNCATE_POSTS env var is not set...'

    print 'FINISHED SIMILARITY PROCESSING'
    for k, v in domain_clust.get_clusters().iteritems():
        cluster = {}
        cluster['id'] = str(uuid.uuid4())
        cluster['term'] = k
        cluster['similar_ids'] = v['similar_ids']
        cluster['similar_post_ids'] = v['similar_post_ids']
        cluster['job_monitor_id'] = job['job_id']
        cluster['start_time_ms'] = job['start_time_ms']
        cluster['end_time_ms'] = job['end_time_ms']
        cluster['stats'] = v['stats']
        cluster['data_type'] = 'domain'

        try:
            loopy.post_result(result_url, cluster)
        except Exception as e:
            # TODO: we should set data = None when error.
            job['data'] = []
            job['state'] = 'error'
            job['error'] = e
            break
    else:  # no errors
        job['data'] = domain_clust.to_json()
        job['state'] = 'processed'
示例#6
0
def process_message(key, job):
    err_check(job)
    if job['state'] == 'error':
        return

    print 'FINDING SIMILARITY'
    print 'min_post set to %s' % job['min_post']
    hash_clust = HashtagClusters(float(job['min_post']), job['result_url'], job['start_time_ms'])

    query_params = [{
        "query_type": "between",
        "property_name": "timestamp_ms",
        "query_value": [job['start_time_ms'], job['end_time_ms']]
    }, {
        "query_type": "where",
        "property_name": "featurizer",
        "query_value": "hashtag"
    }, {
        "query_type": "neq",
        "property_name": "hashtags",
        "query_value": "null"
    }]

    if 'lang' in job:
        query_params.append({
            "query_type": "where",
            "property_name": "lang",
            "query_value": job['lang']
        })
    loopy = Loopy(job['query_url'], query_params)

    if loopy.result_count == 0:
        print "No data to process"
        job['data'] = []
        job['error'] = "No data found to process."
        job['state'] = 'error'
        return

    while True:
        print "Scrolling...{}".format(loopy.total_returned)
        page = loopy.get_next_page()
        if page is None:
            break
        # Do something with the obtained page
        for doc in page:
            hash_clust.process_vector(doc['id'], doc['post_id'], doc['hashtags'])

    print 'FINISHED SIMILARITY PROCESSING'
    for k, v in hash_clust.get_clusters().iteritems():
        cluster = {
            'id': str(uuid.uuid4()),
            'term': k,
            'similar_ids': v['similar_ids'],
            'similar_post_ids': v['similar_post_ids'],
            'job_monitor_id': job['job_id'],
            'start_time_ms': job['start_time_ms'],
            'end_time_ms': job['end_time_ms'],
            'stats': v['stats'],
            'data_type': 'hashtag'
        }

        try:
            loopy.post_result(job['result_url'], cluster)
        except Exception as e:
            # TODO: we should set data = None when error.
            job['data'] = []
            job['state'] = 'error'
            job['error'] = e
            break
    else: # no errors
        job['data'] = hash_clust.to_json()
        job['state'] = 'processed'