Example #1
    def __init__(self, base_url, geo_url):
        self.graph = nx.Graph()
        self.nodes_detailed = {}
        self.geo_url = geo_url
        self.sf = SentimentFilter()

        if base_url[-1] == '/':
            self.url = base_url
        else:
            self.url = base_url + '/'
Example #2
    def __init__(self, base_url, geo_url, geo_threshold):
        self.graph = nx.Graph()
        self.nodes_detailed = {}
        self.geo_url = geo_url
        self.geo_threshold = geo_threshold
        self.sf = SentimentFilter()
        self.ent_ext = EntityExtractor()

        if base_url[-1] == '/':
            self.url = base_url
        else:
            self.url = base_url + '/'
Example #3
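# Imports this snippet relies on. SentimentFilter, vec_from_tweet and get_cos are
# project-local helpers defined elsewhere in the same module; everything else is
# from the standard library, gensim (pre-4.0 API) and matplotlib.
import codecs
import json
import logging
import os
import sys
import time
from datetime import date
from math import log

import matplotlib.pyplot as plt
from gensim.models import Word2Vec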
def main(s_lng='en', test_words=None):
    #s_path = '/Volumes/ed_00/data/raw_tweet_data/tweets_w_img_url/'
    s_path = '/Volumes/ed_00/data/arabic_json/'
    s_save = date.today().strftime('%b%d')+'_'+s_lng

    l_files = os.listdir(s_path)
    _X = []
    d_df = {}
    #raw_text = []
    t0 = time.time()
    tt = time.time()
    sent_filt = SentimentFilter()
    keys = set([])
    l_num = 0
    p_docs = []
    p_soc = []
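    # Walk every line-delimited JSON file, keeping only posts in the requested language that the sentiment filter can score.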
    for s_file in l_files:
        if s_file[-4:] != 'json':
            continue
        f = open(s_path+s_file)
        for line in f:
            try:
                d0 = json.loads(line)
            except ValueError:
                continue
            if 'lang' not in d0.keys():
                continue
            if d0['lang'] != s_lng:
                continue
            txt = d0['text']
            if sent_filt.is_scoreable(txt, s_lng) is False:
                continue
            l_txt = sent_filt.tokenize(txt, s_lng)
            if test_words is not None and len(test_words)==2:
                if test_words[0] in l_txt:
                    p_docs.append(len(_X))
                if test_words[1] in l_txt:
                    p_soc.append(len(_X))
            _X.append(l_txt)
            l_num += 1
            if l_num %100==0:
                diff = time.time() - tt
                print "time for 100:", diff, "(total", l_num, ")"
                tt = time.time()
                sys.stdout.flush()
            for t in l_txt:
                if t in keys:
                    d_df[t] += 1
                else:
                    d_df[t] = 1
                    keys.add(t)
        f.close()

    diff = time.time()-t0
    print "\nTime to read in", l_num, "files", diff
    if test_words is not None and len(test_words) == 2:
        print "Number of " + test_words[0] + " tweets:", len(p_docs), ", number of " + test_words[1] + " tweets:", len(p_soc)

    print "Training Model"
    t0 = time.time()
    dimensions = 100
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(_X, min_count=100, size=dimensions)
    model.save('./models/' + s_save + 'word2vec')
    diff = time.time()-t0
    print "Time to train model:", diff
    print "Number of tweets:", len(_X)

    d_idf = {}
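    # Weight each term's word2vec vector by log(1/frequency), keeping only terms seen more than 100 times and in under 95% of tweets.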
    model_vocab = set(model.vocab.keys())
    print "Terms in dict:", len(d_df.keys())
    for k, v in d_df.iteritems():
        if k not in model_vocab:
            continue
        freq = float(v)/float(l_num)
        if v > 100 and freq < 0.95:
            d_idf[k] = map(lambda x: x*log(1/freq), list(model[k]))
    with codecs.open('models/' + s_save, mode='w', encoding='utf-8') as outfile:
        outfile.write(json.dumps(d_idf))


    print "TEST cosine diff of test phrases on 100! combination of sentances"
    t0 = time.time()
    l_blm_cos, l_blmi_cos, l_blmif_cos = [], [], []
    l_soc_cos, l_soci_cos, l_socif_cos = [], [], []
    s_words = set(model.index2word)
    max_t1 = 100 if len(p_docs) > 100 else len(p_docs)
    max_t2 = 100 if len(p_soc) > 100 else len(p_soc)
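    # Compare up to 100 tweets per test word: cosine similarity within the first group and across the two groups,
    # using plain bag-of-words, tf-idf weighted, and filtered tweet vectors (matching the plot titles below).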
    for i in range(max_t1):
        (v1, vi1, vif1) = vec_from_tweet(model, _X[p_docs[i]], dimensions, s_words, d_idf)
        for j in range(i+1, max_t1):
            (v2, vi2, vif2) = vec_from_tweet(model, _X[p_docs[j]], dimensions, s_words, d_idf)
            cos = get_cos(v1, v2)
            cosi = get_cos(vi1, vi2)
            cosif = get_cos(vif1, vif2)
            l_blm_cos.append(cos)
            l_blmi_cos.append(cosi)
            l_blmif_cos.append(cosif)
        for k in range(max_t2):
            (v3, vi3, vif3) = vec_from_tweet(model, _X[p_soc[k]], dimensions, s_words, d_idf)
            cos = get_cos(v1, v3)
            cosi = get_cos(vi1, vi3)
            cosif = get_cos(vif1, vif3)
            l_soc_cos.append(cos)
            l_soci_cos.append(cosi)
            l_socif_cos.append(cosif)

    if test_words is not None and len(test_words)==2:
        diff = time.time()-t0
        print "Time to test model:", diff
        bins = map(lambda x: x*0.01, range(101))
        plt.figure(1)
        plt.subplot(231)
        plt.hist(l_blm_cos, bins=bins)
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1,x2,0.1,10000))
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[0] + ", bag of words")
        plt.subplot(232)
        plt.hist(l_blmi_cos, bins=bins)
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1,x2,0.1,10000))
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[0] + ", tf-idf")
        plt.subplot(233)
        print l_blmif_cos
        plt.hist(l_blmif_cos, bins=bins)
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1,x2,0.1,10000))
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[0] + ", tf-idf, filtered")
        plt.subplot(234)
        plt.hist(l_soc_cos, bins=bins)
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[1] + ", bag of words")
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1,x2,0.1,10000))
        plt.subplot(235)
        plt.hist(l_soci_cos, bins=bins)
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[1] + ", tf-idf")
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1,x2,0.1,10000))
        plt.subplot(236)
        plt.hist(l_socif_cos, bins=bins)
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[1] + ", tf-idf, filtered")
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1,x2,0.1,10000))
        plt.show()
Example #4
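# Imports assumed by this class. SentimentFilter, EntityExtractor, Loopy, stop_list
# and remove_punctuation are project-local and not shown here; community is the
# python-louvain package.
import traceback
import uuid
from operator import itemgetter as iget
from random import sample

import community
import networkx as nx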
class Louvaine:
    def __init__(self, base_url, geo_url, geo_threshold):
        self.graph = nx.Graph()
        self.nodes_detailed = {}
        self.geo_url = geo_url
        self.geo_threshold = geo_threshold
        self.sf = SentimentFilter()
        self.ent_ext = EntityExtractor()

        if base_url[-1] == '/':
            self.url = base_url
        else:
            self.url = base_url + '/'

    def add_node(self, cluster):
        n_id = cluster['id']
        self.graph.add_node(n_id)
        self.nodes_detailed[n_id] = cluster

    def add_edge(self, c_link):
        self.graph.add_edge(c_link['source'], c_link['target'],
                            weight=c_link['weight'])

    def get_text_sum(self, cluster, r_o):
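        # Sample up to 30 posts from the cluster and fold their keywords, geolocated places,
        # campaign ids and URLs into the summary dict r_o.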
        n_posts = len(cluster['similar_post_ids'])
        l_sample = cluster['similar_post_ids']
        if n_posts > 30:
            l_sample = sample(cluster['similar_post_ids'], 30)
            n_posts = 30

        words = {}
        places = []
        websites = set([])
        r_o["campaigns"]["total"] += n_posts

        #TODO: fix query type once S.L. is fixed
        query_params = [{
            "query_type": "inq",
            "property_name": "post_id",
            "query_value": l_sample
        }]
        lp = Loopy(self.url + 'socialMediaPosts', query_params)
        page = lp.get_next_page()
        if page is None:
            return

        for doc in page:
            if doc['featurizer'] != cluster['data_type']:
                continue

            if 'campaigns' in doc:
                for cam in doc['campaigns']:
                    if cam in r_o["campaigns"]["ids"]:
                        r_o["campaigns"]["ids"][cam] += 1
                    else:
                        r_o["campaigns"]["ids"][cam] = 1

            locs = self.ent_ext.extract(doc['text'], tag='I-LOC')
            for loc in locs:
                print 'Location:', loc.encode('utf-8')
                try:
                    geos = Loopy.post(self.geo_url, json={'address': loc})
                    for place in geos:
                        places.append(place)
                        break
                except Exception as e:
                    print "error getting locations from geocoder...continuing.", e
                    traceback.print_exc()

            tokens = [
                w for w in self.sf.pres_tokenize(doc['text'], doc['lang'])
                if w not in stop_list
            ]
            for word in tokens:
                if word[0] == '#':
                    continue
                if word[0] == '@':
                    continue
                if word[:4] == 'http':
                    websites.add(word)
                    continue
                if word[:3] == 'www':
                    websites.add('http://' + word)
                    continue
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1

        for k, v in words.iteritems():
            k = remove_punctuation(k)
            if v < 5:
                continue
            if k in r_o['keywords']:
                r_o['keywords'][k] += v
            else:
                r_o['keywords'][k] = v

        for place in places:
            if type(place) is not dict:
                continue
            place_name = ''
            weight = 0.0
            if 'city' in place.keys():
                place_name = place['city'] + ' '
                weight += 1
            if 'state' in place.keys():
                place_name += place['state'] + ' '
                weight += .1
            if 'country' in place.keys():
                place_name += ' ' + place['country'] + ' '
                weight += .05
            if place_name in r_o['location']:
                r_o['location'][place_name]['weight'] += weight
            else:
                r_o['location'][place_name] = {
                    "type":
                    "inferred point",
                    "geo_type":
                    "point",
                    "coords": [{
                        "lat": place['latitude'],
                        "lng": place['longitude']
                    }],
                    "weight":
                    weight
                }

        r_o['location'] = dict((k, v) for k, v in r_o['location'].items()
                               if v['weight'] >= self.geo_threshold)

        for url in list(websites):
            r_o['urls'].add(url)

    def get_img_sum(self, cluster):
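        # Collect the primary image URL of up to 100 posts in the cluster.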
        n_posts = len(cluster['similar_post_ids'])
        l_sample = cluster['similar_post_ids']
        if n_posts > 100:
            l_sample = sample(cluster['similar_post_ids'], 100)

        imgs = set()

        #TODO: fix query type once S.L. is fixed
        for post_id in l_sample:
            query_params = [{
                "query_type": "between",
                "property_name": "post_id",
                "query_value": [post_id, post_id]
            }]
            lp = Loopy(self.url + 'socialMediaPosts', query_params)
            page = lp.get_next_page()
            if page is None:
                continue
            for doc in page:
                if 'primary_image_url' not in doc:
                    continue
                imgs.add(doc['primary_image_url'])
                break

        return imgs

    def get_communities(self):
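        # Run Louvain community detection over the cluster graph, then build a per-community
        # summary (times, hashtags, keywords, images, campaigns, locations).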
        partition = community.best_partition(self.graph)
        d1 = {}

        print "Communities found, getting event summary information"
        n_nodes = len(self.graph.nodes())
        checkpoints = [.1, .25, .5, .75, .9, .95, .99, 1.1]
        ind_checked = 0
        n_checked = 0
        for n in self.graph.nodes():
            n_checked += 1
            while n_checked > checkpoints[ind_checked] * n_nodes:
                ind_checked += 1
                print "Finished {}% of nodes".format(
                    checkpoints[ind_checked - 1] * 100)

            images = set()
            com = str(partition[n])
            if n not in self.nodes_detailed:
                print "{} not found in detailed node list...why????".format(n)
                continue
            clust = self.nodes_detailed[n]
            if com in d1:
                d1[com]['cluster_ids'].append(n)
                d1[com]['topic_message_count'] += len(
                    clust['similar_post_ids'])
            else:
                d1[com] = {
                    'id': str(uuid.uuid4()),
                    'name': 'default',
                    'start_time_ms': clust['start_time_ms'],
                    'end_time_ms': clust['end_time_ms'],
                    'cluster_ids': [n],
                    'hashtags': {},
                    'keywords': {},
                    'campaigns': {
                        "total": 0,
                        'ids': {}
                    },
                    'urls': set([]),
                    'image_urls': [],
                    'location': {},
                    'importance_score': 1.0,
                    'topic_message_count': len(clust['similar_post_ids'])
                }

            #Expand Summary data (hashtags, keywords, images, urls, geo)
            if clust['data_type'] == 'hashtag':
                d1[com]['hashtags'][clust['term']] = len(
                    clust['similar_post_ids'])
                #Add full text analysis, many communities have no image/text nodes
                self.get_text_sum(clust, d1[com])
            elif clust['data_type'] == 'image':
                pass
            elif clust['data_type'] == 'text':
                self.get_text_sum(clust, d1[com])

            images |= self.get_img_sum(clust)

            d1[com]['image_urls'] = list(set(d1[com]['image_urls']) | images)

            #Make Sure Time is Correct
            if clust['start_time_ms'] < d1[com]['start_time_ms']:
                d1[com]['start_time_ms'] = clust['start_time_ms']
            if clust['end_time_ms'] > d1[com]['end_time_ms']:
                d1[com]['end_time_ms'] = clust['end_time_ms']

        print "Information collected, formatting output"

        #Cleanup -> transform dicts to ordered lists, sets to lists for easy JavaScript consumption
        for com in d1.keys():
            l_camps = []
            if d1[com]['campaigns']['total'] != 0:
                l_camps = [{
                    k: 1. * v / float(d1[com]['campaigns']['total'])
                } for k, v in d1[com]['campaigns']['ids'].iteritems()]

            d1[com]['campaigns'] = l_camps

            # l_tags = map(lambda x: x[0], sorted([(k, v) for k, v in d1[com]['hashtags'].iteritems()], key=iget(1)))
            l_tags = sorted(list(d1[com]['hashtags'].iteritems()),
                            key=iget(1),
                            reverse=1)
            d1[com]['hashtags'] = l_tags[:100]  # slice

            # l_terms = map(lambda x: x[0], sorted([(k, v) for k, v in d1[com]['keywords'].iteritems()], key=lambda x: x[1]))
            l_terms = sorted(list(d1[com]['keywords'].iteritems()),
                             key=iget(1),
                             reverse=1)
            d1[com]['keywords'] = l_terms[:100]  # slice

            d1[com]['urls'] = list(d1[com]['urls'])

            temp = []
            for k, v in d1[com]['location'].iteritems():
                dt = v
                dt['label'] = k
                temp.append(dt)
            d1[com]['location'] = temp

        return d1
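A minimal usage sketch for the class above (not from the source): the endpoint URLs, threshold, and cluster/edge payloads are invented for illustration, and real clusters come from the upstream services with at least the fields that get_communities reads.

lv = Louvaine('http://localhost:3003/api',          # hypothetical REST base URL
              'http://localhost:3004/forward-geo',  # hypothetical geocoder endpoint
              0.5)                                  # minimum location weight to keep
lv.add_node({'id': 'c1', 'data_type': 'text', 'similar_post_ids': ['p1', 'p2'],
             'start_time_ms': 0, 'end_time_ms': 60000})
lv.add_node({'id': 'c2', 'data_type': 'hashtag', 'term': 'news',
             'similar_post_ids': ['p3'], 'start_time_ms': 0, 'end_time_ms': 60000})
lv.add_edge({'source': 'c1', 'target': 'c2', 'weight': 2.0})
communities = lv.get_communities()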
Example #5
        job['data'] = []
        job['state'] = 'processed'
        return


if __name__ == '__main__':
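    # Entry point: parse model paths, build one SyntaxVectorizer per language,
    # and start a Redis-backed dispatcher that feeds process_message.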
    ar = argparse.ArgumentParser()
    ar.add_argument("-modelPath", help="Path to model (e.g. ./models)")
    ar.add_argument("-englishModel", help="Name of Engilsh model")
    ar.add_argument("-arabicModel", help="Name of Arabic model")
    args = ar.parse_args()
    print "Parsed args"
    global model_langs
    model_langs = ['en', 'ar']
    print "Making filter"
    global sent_filt
    sent_filt = SentimentFilter()
    global syntax_vectorizer
    syntax_vectorizer = {}

    if args.englishModel != '':
        syntax_vectorizer['en'] = SyntaxVectorizer(args.modelPath,
                                                   args.englishModel)
    if args.arabicModel != '':
        syntax_vectorizer['ar'] = SyntaxVectorizer(args.modelPath,
                                                   args.arabicModel)
    dispatcher = Dispatcher(redis_host='redis',
                            process_func=process_message,
                            queues=['genie:feature_txt'])
    dispatcher.start()