def main(s_lng='en', test_words=None):
    #s_path = '/Volumes/ed_00/data/raw_tweet_data/tweets_w_img_url/'
    s_path = '/Volumes/ed_00/data/arabic_json/'
    s_save = date.today().strftime('%b%d') + '_' + s_lng
    l_files = os.listdir(s_path)
    _X = []
    d_df = {}
    #raw_text = []
    t0 = time.time()
    tt = time.time()
    sent_filt = SentimentFilter()
    keys = set([])
    l_num = 0
    p_docs = []
    p_soc = []
    for s_file in l_files:
        if s_file[-4:] != 'json':
            continue
        f = open(s_path + s_file)
        for line in f:
            try:
                d0 = json.loads(line)
            except:
                continue
            if 'lang' not in d0.keys():
                continue
            if d0['lang'] != s_lng:
                continue
            txt = d0['text']
            if sent_filt.is_scoreable(txt, s_lng) is False:
                continue
            l_txt = sent_filt.tokenize(txt, s_lng)
            if test_words is not None and len(test_words) == 2:
                if test_words[0] in l_txt:
                    p_docs.append(len(_X))
                if test_words[1] in l_txt:
                    p_soc.append(len(_X))
            _X.append(l_txt)
            l_num += 1
            if l_num % 100 == 0:
                diff = time.time() - tt
                print "time for 100:", diff, "(total", l_num, ")"
                tt = time.time()
                sys.stdout.flush()
            for t in l_txt:
                if t in keys:
                    d_df[t] += 1
                else:
                    d_df[t] = 1
                    keys.add(t)

    diff = time.time() - t0
    print "\nTime to read in", l_num, "files", diff
    if test_words is not None and len(test_words) == 2:
        print "Number of " + test_words[0] + " tweets:", len(p_docs), \
            ", number of " + test_words[1] + " tweets:", len(p_soc)

    print "Training Model"
    t0 = time.time()
    dimensions = 100
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(_X, min_count=100, size=dimensions)
    model.save('./models/' + s_save + 'word2vec')
    diff = time.time() - t0
    print "Time to train model:", diff
    print "Number of tweets:", len(_X)

    d_idf = {}
    model_vocab = set(model.vocab.keys())
    print "Terms in dict:", len(d_df.keys())
    for k, v in d_df.iteritems():
        if k not in model_vocab:
            continue
        freq = float(v) / float(l_num)
        if v > 100 and freq < 0.95:
            d_idf[k] = map(lambda x: x * log(1 / freq), list(model[k]))
    with codecs.open('models/' + s_save, mode='w', encoding='utf-8') as outfile:
        outfile.write(json.dumps(d_idf))

    print "TEST cosine diff of test phrases on 100! combinations of sentences"
    t0 = time.time()
    l_blm_cos, l_blmi_cos, l_blmif_cos = [], [], []
    l_soc_cos, l_soci_cos, l_socif_cos = [], [], []
    s_words = set(model.index2word)
    max_t1 = 100 if len(p_docs) > 100 else len(p_docs)
    max_t2 = 100 if len(p_soc) > 100 else len(p_soc)
    for i in range(max_t1):
        (v1, vi1, vif1) = vec_from_tweet(model, _X[p_docs[i]], dimensions, s_words, d_idf)
        for j in range(i + 1, max_t1):
            (v2, vi2, vif2) = vec_from_tweet(model, _X[p_docs[j]], dimensions, s_words, d_idf)
            cos = get_cos(v1, v2)
            cosi = get_cos(vi1, vi2)
            cosif = get_cos(vif1, vif2)
            l_blm_cos.append(cos)
            l_blmi_cos.append(cosi)
            l_blmif_cos.append(cosif)
        for k in range(max_t2):
            (v3, vi3, vif3) = vec_from_tweet(model, _X[p_soc[k]], dimensions, s_words, d_idf)
            cos = get_cos(v1, v3)
            cosi = get_cos(vi1, vi3)
            cosif = get_cos(vif1, vif3)
            l_soc_cos.append(cos)
            l_soci_cos.append(cosi)
            l_socif_cos.append(cosif)

    if test_words is not None and len(test_words) == 2:
        diff = time.time() - t0
        print "Time to test model:", diff
        bins = map(lambda x: x * 0.01, range(101))
        plt.figure(1)
        plt.subplot(231)
        plt.hist(l_blm_cos, bins=bins)
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, 0.1, 10000))
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[0] + ", bag of words")
        plt.subplot(232)
        plt.hist(l_blmi_cos, bins=bins)
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, 0.1, 10000))
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[0] + ", tf-idf")
        plt.subplot(233)
        print l_blmif_cos
        plt.hist(l_blmif_cos, bins=bins)
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, 0.1, 10000))
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[0] + ", tf-idf, filtered")
        plt.subplot(234)
        plt.hist(l_soc_cos, bins=bins)
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[1] + ", bag of words")
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, 0.1, 10000))
        plt.subplot(235)
        plt.hist(l_soci_cos, bins=bins)
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[1] + ", tf-idf")
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, 0.1, 10000))
        plt.subplot(236)
        plt.hist(l_socif_cos, bins=bins)
        plt.title("Cos Similarity " + test_words[0] + " to " + test_words[1] + ", tf-idf, filtered")
        plt.yscale('log', nonposy='clip')
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, 0.1, 10000))
        plt.show()
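# --- Illustrative sketch (assumption, not this module's actual code) ---------
# main() calls two helpers, vec_from_tweet and get_cos, that are defined
# elsewhere. The versions below are inferred only from the call sites: each
# tweet yields three vectors (an unweighted sum of word vectors, a tf-idf
# weighted sum, and a "filtered" tf-idf sum; the filtering rule used here,
# skipping hashtags, mentions and URLs, is a guess), plus a cosine helper.
import numpy as np

def vec_from_tweet(model, tokens, dimensions, s_words, d_idf):
    v_bow = np.zeros(dimensions)       # plain bag-of-words sum
    v_idf = np.zeros(dimensions)       # tf-idf weighted sum
    v_idf_filt = np.zeros(dimensions)  # tf-idf sum over "clean" tokens only
    for tok in tokens:
        if tok not in s_words:         # token missing from the word2vec vocab
            continue
        v_bow += model[tok]
        if tok in d_idf:
            v_idf += np.array(d_idf[tok])
            # assumed filter: drop hashtags, mentions and URL fragments
            if not (tok.startswith('#') or tok.startswith('@') or tok.startswith('http')):
                v_idf_filt += np.array(d_idf[tok])
    return v_bow, v_idf, v_idf_filt

def get_cos(v1, v2):
    # cosine similarity with a guard against zero-length vectors
    n1 = np.linalg.norm(v1)
    n2 = np.linalg.norm(v2)
    if n1 == 0.0 or n2 == 0.0:
        return 0.0
    return float(np.dot(v1, v2) / (n1 * n2))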
class Louvaine:
    def __init__(self, base_url, geo_url, geo_threshold):
        self.graph = nx.Graph()
        self.nodes_detailed = {}
        self.geo_url = geo_url
        self.geo_threshold = geo_threshold
        self.sf = SentimentFilter()
        self.ent_ext = EntityExtractor()
        if base_url[-1] == '/':
            self.url = base_url
        else:
            self.url = base_url + '/'

    def add_node(self, cluster):
        n_id = cluster['id']
        self.graph.add_node(n_id)
        self.nodes_detailed[n_id] = cluster

    def add_edge(self, c_link):
        self.graph.add_edge(c_link['source'], c_link['target'],
                            {'weight': c_link['weight']})

    def get_text_sum(self, cluster, r_o):
        n_posts = len(cluster['similar_post_ids'])
        l_sample = cluster['similar_post_ids']
        if n_posts > 30:
            l_sample = sample(cluster['similar_post_ids'], 30)
            n_posts = 30

        words = {}
        places = []
        websites = set([])
        r_o["campaigns"]["total"] += n_posts

        #TODO: fix query type once S.L. is fixed
        query_params = [{
            "query_type": "inq",
            "property_name": "post_id",
            "query_value": l_sample
        }]
        lp = Loopy(self.url + 'socialMediaPosts', query_params)
        page = lp.get_next_page()
        if page is None:
            return

        for doc in page:
            if doc['featurizer'] != cluster['data_type']:
                continue

            if 'campaigns' in doc:
                for cam in doc['campaigns']:
                    if cam in r_o["campaigns"]["ids"]:
                        r_o["campaigns"]["ids"][cam] += 1
                    else:
                        r_o["campaigns"]["ids"][cam] = 1

            locs = self.ent_ext.extract(doc['text'], tag='I-LOC')
            for loc in locs:
                print 'Location:', loc.encode('utf-8')
                try:
                    geos = Loopy.post(self.geo_url, json={'address': loc})
                    for place in geos:
                        places.append(place)
                        break
                except Exception as e:
                    print "error getting locations from geocoder...continuing.", e
                    traceback.print_exc()

            tokens = [w for w in self.sf.pres_tokenize(doc['text'], doc['lang'])
                      if w not in stop_list]
            for word in tokens:
                if word[0] == '#':
                    continue
                if word[0] == '@':
                    continue
                if word[:4] == 'http':
                    websites.add(word)
                    continue
                if word[:3] == 'www':
                    websites.add('http://' + word)
                    continue
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1

        for k, v in words.iteritems():
            k = remove_punctuation(k)
            if v < 5:
                continue
            if k in r_o['keywords']:
                r_o['keywords'][k] += v
            else:
                r_o['keywords'][k] = v

        for place in places:
            if type(place) is not dict:
                continue
            place_name = ''
            weight = 0.0
            if 'city' in place.keys():
                place_name = place['city'] + ' '
                weight += 1
            if 'state' in place.keys():
                place_name += place['state'] + ' '
                weight += .1
            if 'country' in place.keys():
                place_name += ' ' + place['country'] + ' '
                weight += .05
            if place_name in r_o['location']:
                r_o['location'][place_name]['weight'] += weight
            else:
                r_o['location'][place_name] = {
                    "type": "inferred point",
                    "geo_type": "point",
                    "coords": [{
                        "lat": place['latitude'],
                        "lng": place['longitude']
                    }],
                    "weight": weight
                }

        r_o['location'] = dict((k, v) for k, v in r_o['location'].items()
                               if v['weight'] >= self.geo_threshold)

        for url in list(websites):
            r_o['urls'].add(url)

    def get_img_sum(self, cluster):
        n_posts = len(cluster['similar_post_ids'])
        l_sample = cluster['similar_post_ids']
        if n_posts > 100:
            l_sample = sample(cluster['similar_post_ids'], 100)

        imgs = set()
        #TODO: fix query type once S.L. is fixed
        for id in l_sample:
            query_params = [{
                "query_type": "between",
                "property_name": "post_id",
                "query_value": [id, id]
            }]
            lp = Loopy(self.url + 'socialMediaPosts', query_params)
            page = lp.get_next_page()
            if page is None:
                continue
            for doc in page:
                if 'primary_image_url' not in doc:
                    continue
                imgs.add(doc['primary_image_url'])
                break

        return imgs

    def get_communities(self):
        partition = community.best_partition(self.graph)
        d1 = {}
        print "Communities found, getting event summary information"
        n_nodes = len(self.graph.nodes())
        checkpoints = [.1, .25, .5, .75, .9, .95, .99, 1.1]
        ind_checked = 0
        n_checked = 0
        for n in self.graph.nodes():
            n_checked += 1
            while n_checked > checkpoints[ind_checked] * n_nodes:
                ind_checked += 1
                print "Finished {}% of nodes".format(
                    checkpoints[ind_checked - 1] * 100)

            images = set()
            com = str(partition[n])
            if n not in self.nodes_detailed:
                print "{} not found in detailed node list...why????".format(n)
                continue

            clust = self.nodes_detailed[n]
            if com in d1:
                d1[com]['cluster_ids'].append(n)
                d1[com]['topic_message_count'] += len(clust['similar_post_ids'])
            else:
                d1[com] = {
                    'id': str(uuid.uuid4()),
                    'name': 'default',
                    'start_time_ms': clust['start_time_ms'],
                    'end_time_ms': clust['end_time_ms'],
                    'cluster_ids': [n],
                    'hashtags': {},
                    'keywords': {},
                    'campaigns': {"total": 0, 'ids': {}},
                    'urls': set([]),
                    'image_urls': [],
                    'location': {},
                    'importance_score': 1.0,
                    'topic_message_count': len(clust['similar_post_ids'])
                }

            #Expand Summary data (hashtags, keywords, images, urls, geo)
            if clust['data_type'] == 'hashtag':
                d1[com]['hashtags'][clust['term']] = len(clust['similar_post_ids'])
                #Add full text analysis, many communities have no image/text nodes
                self.get_text_sum(clust, d1[com])
            elif clust['data_type'] == 'image':
                pass
            elif clust['data_type'] == 'text':
                self.get_text_sum(clust, d1[com])

            images |= self.get_img_sum(clust)
            d1[com]['image_urls'] = list(set(d1[com]['image_urls']) | images)

            #Make Sure Time is Correct
            if clust['start_time_ms'] < d1[com]['start_time_ms']:
                d1[com]['start_time_ms'] = clust['start_time_ms']
            if clust['end_time_ms'] > d1[com]['end_time_ms']:
                d1[com]['end_time_ms'] = clust['end_time_ms']

        print "Information collected, formatting output"

        #Cleanup -> transform dicts to ordered lists, sets to lists for easy javascript comprehension
        for com in d1.keys():
            l_camps = []
            if d1[com]['campaigns']['total'] != 0:
                l_camps = [{k: 1. * v / float(d1[com]['campaigns']['total'])}
                           for k, v in d1[com]['campaigns']['ids'].iteritems()]
            d1[com]['campaigns'] = l_camps

            # l_tags = map(lambda x: x[0], sorted([(k, v) for k, v in d1[com]['hashtags'].iteritems()], key=iget(1)))
            l_tags = sorted(list(d1[com]['hashtags'].iteritems()), key=iget(1), reverse=1)
            d1[com]['hashtags'] = l_tags[:100]  # slice

            # l_terms = map(lambda x: x[0], sorted([(k, v) for k, v in d1[com]['keywords'].iteritems()], key=lambda x: x[1]))
            l_terms = sorted(list(d1[com]['keywords'].iteritems()), key=iget(1), reverse=1)
            d1[com]['keywords'] = l_terms[:100]  # slice

            d1[com]['urls'] = list(d1[com]['urls'])

            temp = []
            for k, v in d1[com]['location'].iteritems():
                dt = v
                dt['label'] = k
                temp.append(dt)
            d1[com]['location'] = temp

        return d1
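# --- Illustrative driver (assumption) -----------------------------------------
# A minimal way to exercise the Louvaine class above. The URLs, threshold and
# cluster payloads are made up for illustration; only the dict keys mirror what
# add_node, add_edge and get_communities actually read, and get_communities
# still requires the socialMediaPosts and geocoder endpoints to be reachable.
if __name__ == '__main__':
    lv = Louvaine('http://localhost:3003/api',       # assumed Loopy base URL
                  'http://localhost:3004/geocoder',  # assumed geocoder endpoint
                  0.4)                               # assumed location-weight cutoff

    clusters = [
        {'id': 'c1', 'term': '#flood', 'data_type': 'hashtag',
         'similar_post_ids': ['p1', 'p2'], 'start_time_ms': 1000, 'end_time_ms': 2000},
        {'id': 'c2', 'term': 'river rising fast', 'data_type': 'text',
         'similar_post_ids': ['p3'], 'start_time_ms': 1500, 'end_time_ms': 2500},
    ]
    for c in clusters:
        lv.add_node(c)
    lv.add_edge({'source': 'c1', 'target': 'c2', 'weight': 0.8})

    summaries = lv.get_communities()
    for com_id, summary in summaries.items():
        print com_id, summary['topic_message_count'], summary['hashtags']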
    job['data'] = []
    job['state'] = 'processed'
    return


if __name__ == '__main__':
    ar = argparse.ArgumentParser()
    ar.add_argument("-modelPath", help="Path to model (e.g. ./models)")
    ar.add_argument("-englishModel", help="Name of English model")
    ar.add_argument("-arabicModel", help="Name of Arabic model")
    args = ar.parse_args()
    print "Parsed args"

    global model_langs
    model_langs = ['en', 'ar']

    print "Making filter"
    global sent_filt
    sent_filt = SentimentFilter()

    global syntax_vectorizer
    syntax_vectorizer = {}
    if args.englishModel != '':
        syntax_vectorizer['en'] = SyntaxVectorizer(args.modelPath, args.englishModel)
    if args.arabicModel != '':
        syntax_vectorizer['ar'] = SyntaxVectorizer(args.modelPath, args.arabicModel)

    dispatcher = Dispatcher(redis_host='redis',
                            process_func=process_message,
                            queues=['genie:feature_txt'])
    dispatcher.start()
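# --- Hypothetical skeleton (assumption) ---------------------------------------
# Only the tail of process_message appears above. A skeleton consistent with
# that tail and with the globals wired up in __main__ might look like this; the
# (key, job) signature, the 'txt'/'lang' field names and the vectorizer call
# are all assumptions made for illustration, not the actual implementation.
def process_message(key, job):
    if 'lang' not in job or job['lang'] not in model_langs:
        job['data'] = []
        job['state'] = 'processed'
        return

    text = job.get('txt', '')
    lang = job['lang']
    if not sent_filt.is_scoreable(text, lang):   # reuse the filter built in __main__
        job['data'] = []
        job['state'] = 'processed'
        return

    # featurize with the per-language model (method name is illustrative only)
    job['data'] = syntax_vectorizer[lang].transform(text)
    job['state'] = 'processed'
    return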