def save_(tweet):
    """Turn a raw tweet dict into a flat document ready for persistence.

    English tweets ('lang' == 'en') yield a document carrying the tweet's
    identity/location fields, time-period fields derived from
    'created_at', and sentiment scores of the preprocessed text.
    Any other language yields an empty dict.
    """
    period = timeFormat.get_period(tweet['created_at'])
    if tweet['lang'] != 'en':
        # Non-English tweets are not scored; caller receives {}.
        return {}
    cleaned = preporcessing.pre(tweet['text'])
    scores = sentiment_analysis.get_sentiment_scores(cleaned)
    doc = {'_id': tweet['id_str']}
    for field in ('created_at', 'text', 'user', 'geo', 'coordinates', 'place'):
        doc[field] = tweet[field]
    for slot, label in enumerate(('weekday', 'month', 'day', 'hour', 'year')):
        doc[label] = period[slot]
    doc['negative'] = scores['neg']
    doc['positive'] = scores['pos']
    doc['neu'] = scores['neu']
    doc['compound'] = scores['compound']
    return doc
def update_db(db):
    """Backfill sentiment fields on English tweets that lack them.

    Iterates every document id in *db*; any English tweet without a
    'compound' field gets negative/positive/neu/compound scores computed
    from its preprocessed text and is saved back to the database.
    Documents that already carry 'compound', or are not English, are
    left untouched.
    """
    for doc_id in db:  # renamed from `id` to avoid shadowing the builtin
        tweet = db[doc_id]
        if 'compound' in tweet:
            # Already scored on a previous run.
            continue
        # (Removed a no-op bare expression `tweet['text']` that had no effect.)
        if tweet['lang'] != 'en':
            continue
        pred_text = preporcessing.pre(tweet['text'])
        sentiment = get_sentiment_scores(pred_text)
        tweet['negative'] = sentiment['neg']
        tweet['positive'] = sentiment['pos']
        tweet['neu'] = sentiment['neu']
        tweet['compound'] = sentiment['compound']
        db.save(tweet)
# NOTE(review): this top-level ingest fragment is truncated in this chunk —
# the `new_dic` literal is never closed (the sentiment keys, closing brace,
# and whatever persistence step follows are missing). It mirrors save_()
# above: reads a CouchDB JSON-lines dump line by line, parses each
# well-formed '{...}' line, derives time-period fields, resolves an area
# name from the tweet's geo coordinates (only for areas known to config),
# and builds a scored document for English tweets. Left byte-identical;
# recover the missing tail from version control before editing.
with open(file_path, 'r') as f: # with open('/Users/jiaqili/Desktop/project/melbourne2015-01-01_2015-01-03.json', 'r')as f: line = f.readline() while line: l = line.strip('\n, ') if l.startswith('{') and l.endswith('}'): l = json.loads(l) tweet = l['doc'] time_period = timeFormat.get_period(tweet['created_at']) if tweet["geo"] != 'null': name = get_area(tweet['geo']['coordinates']) if name in config.coordinates.keys( ) and name not in config.L_name: db_name = name if tweet['lang'] == 'en': pred_text = preporcessing.pre(tweet['text']) sentiment = sentiment_analysis.get_sentiment_scores( pred_text) new_dic = { '_id': tweet['id_str'], 'created_at': tweet['created_at'], 'text': tweet['text'], 'user': tweet['user'], 'geo': tweet['geo'], 'coordinates': tweet['coordinates'], 'place': tweet['place'], 'weekday': time_period[0], 'month': time_period[1], 'day': time_period[2], 'hour': time_period[3], 'year': time_period[4],
def _scored_tweet_doc(status):
    """Build a storable document from a tweepy status ``_json`` dict.

    Expects ``status['_id']`` to be set already; derives time-period
    fields from the status's own 'created_at' and sentiment scores from
    its preprocessed text. Shared by both harvesting loops below.
    """
    time_period = timeFormat.get_period(status['created_at'])
    pred_text = pre(status['text'])
    sentiment = get_sentiment_scores(pred_text)
    return {
        '_id': status['_id'],
        'created_at': status['created_at'],
        'text': status['text'],
        'user': status['user'],
        'geo': status['geo'],
        'coordinates': status['coordinates'],
        'place': status['place'],
        'weekday': time_period[0],
        'month': time_period[1],
        'day': time_period[2],
        'hour': time_period[3],
        'year': time_period[4],
        'negative': sentiment['neg'],
        'positive': sentiment['pos'],
        'neu': sentiment['neu'],
        'compound': sentiment['compound'],
    }


def get_user_timeline_tweets(db_raw, api, city_name):
    """Harvest timeline tweets for users flagged by the
    'original_tweets/username_not_used' view, plus their friends' tweets,
    score the English ones, and save those whose place name matches
    *city_name* back into *db_raw*. Each processed user's stored document
    is marked with username=True.

    Best-effort by design: per-user and per-friend API failures are
    swallowed so one bad account cannot stop the whole harvest.
    """
    result = db_raw.view('original_tweets/username_not_used')
    for res in result:
        doc_id = res['id']
        tweet = db_raw[doc_id]
        name = tweet['user']['screen_name']
        try:
            for friend in Cursor(api.friends, screen_name=name).items(200):
                friend_id = friend._json['id']
                try:
                    for friend_raw_tweet in Cursor(
                            api.user_timeline, user_id=friend_id).items(200):
                        status = friend_raw_tweet._json
                        status['_id'] = str(status['id'])
                        status['username'] = True
                        if city_name in city_list:
                            city_name = 'melbourne'
                        # BUG FIX: the original read `raw_tweet._json` here
                        # (and in the lang/text lookups below), a name not yet
                        # defined on the first pass — the resulting NameError
                        # was silently swallowed, so no friend tweet was ever
                        # saved. Use the friend's own status throughout.
                        area_name = status['place']['name']
                        new_dic = {}
                        if status['lang'] == 'en':
                            new_dic = _scored_tweet_doc(status)
                        try:
                            if new_dic and area_name.lower() == city_name:
                                db_raw.save(new_dic)
                        except couchdb.http.ResourceConflict:
                            # Already stored; skip duplicates quietly.
                            pass
                except Exception:
                    # NOTE(review): was a bare `except:`; narrowed so
                    # KeyboardInterrupt/SystemExit still propagate.
                    pass
        except Exception:
            pass
        try:
            for raw_tweet in Cursor(api.user_timeline,
                                    screen_name=name).items(200):
                status = raw_tweet._json
                status['_id'] = str(status['id'])
                status['username'] = True
                area_name = status['place']['name']
                if city_name in city_list:
                    city_name = 'melbourne'
                new_dic = {}
                if status['geo'] != 'null':
                    # BUG FIX: original used key 'coordinate'; every other
                    # call site in this file uses 'coordinates', and the
                    # KeyError aborted the rest of this user's timeline.
                    # Return value was unused in the original; kept that way.
                    get_area(status['geo']['coordinates'])
                if status['lang'] == 'en':
                    # BUG FIX: time-period fields were derived from the
                    # *stored user tweet's* created_at instead of this
                    # timeline tweet's, unlike every other block in the file.
                    new_dic = _scored_tweet_doc(status)
                try:
                    if new_dic and area_name.lower() == city_name:
                        db_raw.save(new_dic)
                except couchdb.http.ResourceConflict:
                    pass
            # Mark the source document so the view stops returning this user.
            doc = db_raw.get(doc_id)
            doc['username'] = True
            db_raw.save(doc)
        except Exception:
            pass
def get_topic():
    """Fit an LDA topic model over a local tweet dump and print the topics.

    Reads the hard-coded CouchDB JSON-lines dump, preprocesses each
    tweet's text, vectorizes with a 500-term English CountVectorizer,
    fits a 10-topic batch LDA (fixed random_state for reproducibility),
    and prints a {"Topic i:": "top words"} dict. Returns None.
    """
    print('start tf_vectorizer')
    n_features = 500
    tf_vectorizer = CountVectorizer(max_features=n_features,
                                    stop_words='english')

    text = []
    print('start getting texts')
    # NOTE(review): path is hard-coded to a developer machine; parameterize
    # before reuse.
    dump_path = ("/Users/jiaqili/Desktop/project/"
                 "melbourne2015-01-01_2015-01-03.json")
    with open(dump_path, 'r') as f:
        for raw_line in f:
            raw_line = raw_line.strip('\n, ')
            # Only well-formed '{...}' lines are tweet rows; the dump also
            # contains wrapper lines that must be skipped.
            if raw_line.startswith('{') and raw_line.endswith('}'):
                row = json.loads(raw_line)
                tweet = row['doc']
                text.append(preporcessing.pre(tweet['text']))

    print(len(text))
    print('start fit_transform')
    tf = tf_vectorizer.fit_transform(text)
    print(type(tf))
    n_topics = 10
    print(n_topics)
    print('start lda')
    lda = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=50,
        learning_method='batch',
        learning_offset=50.,
        random_state=0)
    print(type(lda))
    print('something')
    lda.fit(tf)
    print('something')

    def print_top_words(model, feature_names, n_top_words):
        # Map "Topic i:" -> space-joined top-n terms, heaviest weight first.
        temp = {}
        for topic_idx, topic in enumerate(model.components_):
            key = "Topic %d:" % topic_idx
            temp[key] = " ".join(
                feature_names[i]
                for i in topic.argsort()[:-n_top_words - 1:-1])
        return print(temp)  # prints and returns None (original behavior)

    n_top_words = 10
    print(n_top_words)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # newer environments need get_feature_names_out() instead.
    tf_feature_names = tf_vectorizer.get_feature_names()
    print(tf_feature_names)
    return print_top_words(lda, tf_feature_names, n_top_words)