def get_all_dataFrame(data_path):
    """
    Build one DataFrame per hashed file and store all DataFrames in a dict
    keyed by file name.
    :param data_path: the path where the hashed files are stored.
    :return: the dict holding the DataFrames.
    """
    file_name_list = get_dirlist(path=data_path, key_word_list=['hash_qianlong'])
    dataFrame_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
                  information=str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...')
        print time.ctime(), str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...'
        data = pd.read_csv(data_path + file_name, header=None)
        data.columns = ['tweet_id', 'origin_tweet_id', 'from_user', 'from_user_id',
                        'to_user', 'to_user_id', 'tweet_time', 'origin_tweet_time', 'type']
        # keep only rows with a known origin time, and drop mentions.
        data = data[data.origin_tweet_time != 'null']
        data = data[data.type != 'mention']
        del data['from_user']
        del data['from_user_id']
        del data['to_user']
        del data['to_user_id']
        data.index = data.tweet_id
        dataFrame_dict[file_name] = data
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
              information='tweet_dataFrame has been built, total number: ' + str(len(dataFrame_dict)))
    return dataFrame_dict
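# A minimal usage sketch (the path below is illustrative, not authoritative):
# the returned dict maps each source file name to a DataFrame indexed by
# tweet_id with columns ['tweet_id', 'origin_tweet_id', 'tweet_time',
# 'origin_tweet_time', 'type'].
def _demo_get_all_dataFrame():
    dataFrame_dict = get_all_dataFrame(data_path='D:/LiuQL/eHealth/twitter/data/data_hash/')
    for file_name in dataFrame_dict.keys():
        print file_name, len(dataFrame_dict[file_name].index)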
def get_dubai_actor_dict(file_name_list, file_path):
    """
    Collect every Twitter actor in the Dubai data; the result is used to judge
    whether an originTweet is a Dubai tweet.
    :param file_name_list: the list of all file names.
    :param file_path: the path of the directory where all files are saved.
    :return: a dict mapping each file name to its list of Dubai actors.
    """
    actor_dict = {}
    index = 0
    for file_name in file_name_list:
        actor_dict[file_name] = list()
    for file_name in file_name_list:
        index = index + 1
        actor_list = []
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(index) + ': BUILDING actor list of Dubai, file: ' + file_name + ' is being processed...')
        print str(index), ': BUILDING actor dict of Dubai, file:', file_name, 'is being processed...'
        file = open(file_path + file_name, 'r')
        for line in file:
            row = json.loads(line)
            if row['actor']['id'] not in actor_list:
                actor_list.append(row['actor']['id'])
        file.close()
        actor_dict[file_name] = actor_list
    return actor_dict
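# Design note: the `not in actor_list` membership test above is O(n) per
# tweet, which makes each file quadratic in its number of lines. A sketch of
# the same collection built with a set (assumption: callers only test
# membership and take len(), so element order does not matter):
def get_dubai_actor_dict_sketch(file_name_list, file_path):
    actor_dict = {}
    for file_name in file_name_list:
        actor_set = set()
        file = open(file_path + file_name, 'r')
        for line in file:
            actor_set.add(json.loads(line)['actor']['id'])  # O(1) dedup
        file.close()
        actor_dict[file_name] = list(actor_set)
    return actor_dict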
def main():
    file_path = 'D:/LiuQL/eHealth/twitter/visualization/node_edge/'
    node_file_name = 'labeled_community_nodes_2000.csv'
    cohesion_type = 'hash'
    edge_file_name = 'community_edges_2000.csv'
    edge_file_path = 'D:/test/edge_hash/'
    edge_file_key_word_list = ['edge_hash_', '.csv']

    # Dubai server
    # file_path = '/pegasus/harir/Qianlong/data/network/'
    # node_file_name = 'total_node_degree.csv'
    # cohesion_type = 'hash'
    # edge_file_name = 'total_edge_weight.csv'
    # edge_file_path = '/pegasus/harir/Qianlong/data/network/edge_hash/'
    # edge_file_key_word_list = ['edge_hash_', '.csv']

    write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),
              information='*' * 50 + 'Starting Calculate cohesion' + '*' * 50)
    cohesion_file = 'cohesion_community_thread.json'
    # truncate the result file before the calculation starts.
    file = open(os.getcwd() + '/' + cohesion_file, 'wb')
    file.close()
    calculate_cohesion_whole_network(file_path=file_path, node_file_name=node_file_name,
                                     cohesion_type=cohesion_type, cohesion_file=cohesion_file,
                                     edge_file_name=None, edge_file_path=edge_file_path,
                                     edge_file_key_word_list=edge_file_key_word_list)
    write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),
              information='*' * 20 + 'Program Done' + '*' * 20 + '\n' * 4)
def main():
    write_log(log_file_name='find_verified_user.log', log_file_path=os.getcwd(),
              information='###################program start#####################.')
    path_data = 'D:/LiuQL/eHealth/twitter/data/data_origin/'
    path_save_to = 'D:/LiuQL/eHealth/twitter/data/data_origin/'
    # path_data = '/pegasus/twitter-p-or-t-uae-201603.json.dxb/'
    # path_save_to = '/pegasus/harir/Qianlong/data/network/'
    duplicate_user_file = 'user_contain_duplicates.txt'
    all_user_file = 'user_all.txt'
    verified_user_file = 'user_verified.txt'
    find_user(path_data=path_data, path_save_to=path_save_to, file_save_to=duplicate_user_file)
    drop_duplicate_user(path_data=path_save_to, path_save_to=path_save_to,
                        actor_file=duplicate_user_file, all_user_file=all_user_file,
                        verified_user_file=verified_user_file)
    write_log(log_file_name='find_verified_user.log', log_file_path=os.getcwd(),
              information='###################program finished#####################.' + '\n' * 5)
def run(self):
    if self.number_of_edges == 0:
        self.get_community_edges()
        self.calculate_degree()
    user_id_list = list(set(list(self.node_dataFrame.user_id)))
    temp_number_i_j = 0
    for index_first in range(0, len(user_id_list), 1):
        for index_second in range(0, len(user_id_list), 1):
            if index_first != index_second:
                self.number_i_j = self.number_i_j + 1
                node_i = user_id_list[index_first]
                node_j = user_id_list[index_second]
                node_i_out_degree = self.node_dataFrame.out_degree[node_i]
                node_j_in_degree = self.node_dataFrame.in_degree[node_j]
                wether_edge = self.wether_interaction_between_nodes(node_i=node_i, node_j=node_j)
                # modularity-style contribution of the ordered pair (i, j).
                self.cohesion = self.cohesion + (wether_edge - float(node_i_out_degree) * node_j_in_degree / self.number_of_edges) / self.number_of_edges
                if wether_edge == 1.0:
                    self.number_i_j_edge = self.number_i_j_edge + 1
                print '\n' + '*' * 140
                print 'community id:', self.community_id, '; node_i:', node_i, '; node_j:', node_j, '; cohesion:', self.cohesion, '; index_first:', index_first, '; index_second:', index_second
                print 'node i out_degree:', node_i_out_degree, '; node j in_degree:', node_j_in_degree, '; whether edge exists:', wether_edge, '; number_i_j_edge:', self.number_i_j_edge, '; number_i_j:', self.number_i_j
                if self.number_i_j - temp_number_i_j >= 100000:
                    temp_number_i_j = self.number_i_j
                    write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),
                              information='Calculating cohesion. Cohesion:' + str(self.cohesion) + '; community_id:' + str(self.community_id) + ' Number_i_j:' + str(self.number_i_j))
    self.lock.acquire()
    cohesion_dict = dict(
        community_id=int(self.community_id),
        cohesion=self.cohesion,
        number_of_edges=int(self.number_of_edges),
        number_i_j=int(self.number_i_j),
        number_i_j_edge=int(self.number_i_j_edge)
    )
    write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),
              information='################ Community_id: ' + str(self.community_id) + ' is over.' + str(cohesion_dict))
    print cohesion_dict
    row = json.dumps(cohesion_dict) + '\n'
    file = open(os.getcwd() + '/' + self.cohesion_file, 'a+')
    file.write(row)
    file.close()
    self.lock.release()
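# For reference: the loop above accumulates a directed, modularity-style
# cohesion score. With m = number_of_edges, A_ij = 1 when an interaction from
# i to j exists, k_i_out the out-degree of i and k_j_in the in-degree of j:
#
#     cohesion = (1 / m) * sum over ordered pairs i != j of
#                ( A_ij - k_i_out * k_j_in / m )
#
# A minimal standalone sketch of the same quantity (cohesion_sketch is a
# hypothetical helper, not part of the original module; it takes plain dicts
# of degrees and a set of directed edges):
def cohesion_sketch(out_degree, in_degree, edges, m):
    nodes = list(out_degree.keys())
    total = 0.0
    for i in nodes:
        for j in nodes:
            if i == j:
                continue
            a_ij = 1.0 if (i, j) in edges else 0.0
            total += (a_ij - float(out_degree[i]) * in_degree[j] / m) / m
    return total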
def add_origin_tweet_to_dataFrame(file_name_list, file_path, tweet_dataFrame_dict, actor_dict):
    """
    Add each originTweet to the matching tweet DataFrame.
    :param file_name_list: the list of all file names.
    :param file_path: the path of the directory where all files are.
    :param tweet_dataFrame_dict: the dict of tweet DataFrames, keyed by file name.
    :param actor_dict: the dict of all Dubai actors, keyed by file name.
    :return: the updated tweet_dataFrame_dict.
    """
    index = 0
    columns = ['tweet_id', 'start_time', 'end_time', 'reply_count', 'retweet_count']
    for file_name in file_name_list:
        index = index + 1
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(index) + ': Adding originTweet to tweet_dataFrame file: ' + file_name + ' is being processed...')
        print str(index), ': Adding originTweet to tweet_dataFrame file:', file_name, 'is being processed...'
        file = open(file_path + file_name, 'r')
        for line in file:
            row = json.loads(line)
            if row['type'] == 'retweet':
                origin_actor_id = row['originActor']['id']
                origin_tweet_id = row['originTweet']['id']
                actor_index = whether_in_dict(type_dict='actor_dict', str=origin_actor_id, dictionary=actor_dict)
                tweet_index = whether_in_dict(type_dict='index_dict', str=origin_tweet_id,
                                              dictionary=get_tweet_dataFrame_index_dict(tweet_dataFrame_dict))
                # only add the origin tweet if its actor is a Dubai actor and
                # the tweet is not already in any DataFrame.
                if actor_index != None and tweet_index == None:
                    new_line = pd.DataFrame(data=[[row['originTweet']['id'], row['originTweet']['postedTime'],
                                                   row['originTweet']['postedTime'], 0.0, 1.0]],
                                            index=[row['originTweet']['id']], columns=columns)
                    tweet_dataFrame_dict[file_name] = tweet_dataFrame_dict[file_name].append(new_line)
        file.close()
    return tweet_dataFrame_dict
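# whether_in_dict is used above but defined elsewhere in the repo. A minimal
# sketch consistent with how it is called here (return the key of the
# sub-collection that contains `str`, or None when nothing matches) might
# look like the following; the body is an assumption, not the original:
def whether_in_dict_sketch(str, dictionary, type_dict=None):
    for key in dictionary.keys():
        if str in dictionary[key]:
            return key
    return None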
def calculate_lifecycle():
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
              information='############################## start program ################################')
    data_path = 'D:/LiuQL/eHealth/twitter/data/data_hash/'
    # path_save_to = 'D:/LiuQL/eHealth/twitter/data/data_hash/result/'
    path_save_to = 'D:/LiuQL/eHealth/twitter/data/'
    file_name_save_to = 'tweet_originTweet_error.csv'
    # data_path = '/pegasus/harir/Qianlong/data/hash/'
    # path_save_to = '/pegasus/harir/Qianlong/data/project_data/twitter_hash_dataFrame/'
    dataFrame_dict = get_all_dataFrame(data_path=data_path)
    print 'tweet_dataFrame has been built.'
    build_tweet(dataFrame_dict=dataFrame_dict, path_save_to=path_save_to, file_name_save_to=file_name_save_to)
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
              information='############################## program end ################################' + '\n' * 4)
def build_tweet(dataFrame_dict, path_save_to, file_name_save_to):
    """
    Find the root tweet of every tweet and save the result in the format
    (tweet_id, tweet_time, root_tweet_id, root_tweet_time). One output file is
    written per DataFrame in dataFrame_dict, named after its key, i.e. after
    the source file.
    :param dataFrame_dict: the dict of DataFrames holding the source data.
    :param path_save_to: the path where the results are saved.
    :param file_name_save_to: the file name the error records are saved to.
    :return: nothing.
    """
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
              information='Finding root tweet for each tweet')
    column = ['tweet_id', 'tweet_time', 'origin_tweet_id', 'origin_tweet_time']  # output field order
    count = 0
    temp_count = 0
    total_number = 0
    for key in dataFrame_dict.keys():
        total_number = total_number + len(dataFrame_dict[key])
    file_dict = {}
    file_list = []
    for key in dataFrame_dict.keys():
        file = open(path_save_to + key, 'wb')
        file_dict[key] = csv.writer(file)
        file_list.append(file)
    file_error = open(path_save_to + file_name_save_to, 'wb')
    error_writer = csv.writer(file_error)
    file_list.append(file_error)
    key_number = 0
    for key in dataFrame_dict.keys():
        key_number = key_number + 1
        tweet_dataFrame = dataFrame_dict[key]
        for index in tweet_dataFrame.index:
            tweet_id = tweet_dataFrame.tweet_id[index]
            tweet_time = tweet_dataFrame.tweet_time[index]
            origin_tweet_id, origin_tweet_time, depth = find_root_tweet(
                dataFrame_dict=dataFrame_dict, tweet_id=tweet_dataFrame.origin_tweet_id[index], depth=0)
            if origin_tweet_id != None and origin_tweet_id != False:
                file_dict[key].writerow([tweet_id, tweet_time, origin_tweet_id, origin_tweet_time])
                print 'key_number:', key_number, 'number:', count, 'total_number:', total_number, 'depth:', depth, tweet_id, tweet_time, origin_tweet_id, origin_tweet_time
            elif origin_tweet_id == False:
                error_writer.writerow([tweet_id, tweet_time, origin_tweet_id, origin_tweet_time])
                print 'key_number:', key_number, 'Error!! number:', count, 'total_number:', total_number, 'depth:', depth, tweet_id, tweet_time, origin_tweet_id, origin_tweet_time
            count += 1
            if count - temp_count >= 10000:
                write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
                          information='key_number:' + str(key_number) + ' Finding root tweet, total_number:' + str(total_number) + ', finished_number:' + str(count))
                temp_count = count
    for file in file_list:
        file.close()
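# find_root_tweet is called above but defined elsewhere. A minimal recursive
# sketch consistent with its observed contract -- walk origin_tweet_id links
# until a tweet has no parent record in any DataFrame, return
# (root_id, root_time, depth), and return False as the id on an error such as
# a cycle or runaway chain -- could look like this. It is an assumption, not
# the original implementation (the caller also guards against None ids):
def find_root_tweet_sketch(dataFrame_dict, tweet_id, tweet_time=None, depth=0, max_depth=100):
    if depth > max_depth:
        return False, tweet_time, depth  # guard against cycles / runaway chains
    for key in dataFrame_dict.keys():
        frame = dataFrame_dict[key]
        if tweet_id in frame.index:
            # tweet_id is itself a retweet/reply: follow its parent link
            return find_root_tweet_sketch(dataFrame_dict,
                                          frame.origin_tweet_id[tweet_id],
                                          frame.origin_tweet_time[tweet_id],
                                          depth + 1, max_depth)
    # tweet_id has no parent record anywhere: it is the root
    return tweet_id, tweet_time, depth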
def run(self):
    user_id_list = list(set(list(self.node_dataFrame.user_id)))
    temp_number_i_j = 0
    for index_first in range(0, len(user_id_list), 1):
        for index_second in range(0, len(user_id_list), 1):
            if index_first != index_second:
                self.number_i_j = self.number_i_j + 1
                node_i = user_id_list[index_first]
                node_j = user_id_list[index_second]
                node_i_out_degree = self.node_dataFrame.out_degree[node_i]
                node_j_in_degree = self.node_dataFrame.in_degree[node_j]
                # kwarg spelling 'edge_dataFrame_idct' follows the method's own signature.
                wether_edge = self.wether_interaction_between_nodes(
                    node_i=node_i, node_j=node_j, edge_dataFrame_idct=self.edge_dataFrame,
                    hash_size=100, cohesion_type=self.cohesion_type)
                self.cohesion = self.cohesion + (wether_edge - float(node_i_out_degree) * node_j_in_degree / self.number_of_edges) / self.number_of_edges
                if wether_edge == 1.0:
                    self.number_i_j_edge = self.number_i_j_edge + 1
                print '\n' + '*' * 140
                print 'community id:', self.community_id, '; node_i:', node_i, '; node_j:', node_j, '; cohesion:', self.cohesion, '; index_first:', index_first, '; index_second:', index_second
                print 'node i out_degree:', node_i_out_degree, '; node j in_degree:', node_j_in_degree, '; whether edge exists:', wether_edge, '; number_i_j_edge:', self.number_i_j_edge, '; number_i_j:', self.number_i_j
                if self.number_i_j - temp_number_i_j >= 100000:
                    temp_number_i_j = self.number_i_j
                    write_log(log_file_name='network_cohesion_whole_thread.log', log_file_path=os.getcwd(),
                              information='Calculating cohesion. Cohesion:' + str(self.cohesion) + '; community_id:' + str(self.community_id) + ' Number_i_j:' + str(self.number_i_j))
    self.lock.acquire()
    write_log(log_file_name='network_cohesion_whole_thread.log', log_file_path=os.getcwd(),
              information='################ Community_id: ' + str(self.community_id) + ' is over. cohesion is: ' + str(self.cohesion) + '#####################')
    self.lock.release()
def merge_tweet_dataFrame(tweet_dataFrame_dict):
    """
    Merge all tweet DataFrames into one DataFrame.
    :param tweet_dataFrame_dict: the dict holding all tweet DataFrames.
    :return: the merged DataFrame.
    """
    tweet_dataFrame = pd.DataFrame()
    index = 0
    for key in tweet_dataFrame_dict.keys():
        index = index + 1
        print index, ': MERGING DATAFRAME...', key, 'is being merged...'
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(index) + ': MERGING DATAFRAME... ' + str(key) + ' is being merged...')
        tweet_dataFrame = tweet_dataFrame.append(tweet_dataFrame_dict[key])
    return tweet_dataFrame
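# Design note: DataFrame.append copies the accumulated frame on every call, so
# this loop is quadratic in the total number of rows (and append was removed
# in pandas 2.0). An equivalent single-pass merge is one concat over the
# dict's values:
#
#   tweet_dataFrame = pd.concat(list(tweet_dataFrame_dict.values()))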
def find_user(path_data, path_save_to, file_save_to):
    file_name_list = get_dirlist(path=path_data, key_word_list=['part-r', '.json', '33b34d49'],
                                 no_key_word_list=['crc'])
    print len(file_name_list)
    time.sleep(40)
    file_save = open(path_save_to + file_save_to, 'wb')
    file_writer = csv.writer(file_save)
    print file_name_list
    file_index = 0
    for file_name in file_name_list:
        file_index = file_index + 1
        file = open(path_data + file_name, 'r')
        write_log(log_file_name='find_verified_user.log', log_file_path=os.getcwd(),
                  information='file index:' + str(file_index) + ' is being processed.')
        for line in file:
            try:
                print len(line)
                row = json.loads(line, object_pairs_hook=OrderedDict)
                actor = [row['actor']['id'], row['actor']['verified'], row['actor']['preferredUsername']]
                file_writer.writerow(actor)
                print 'file index:', file_index, actor
                # a retweet also carries the original author: record it too.
                if row['type'] == 'retweet':
                    origin_actor = [row['originActor']['id'], row['originActor']['verified'],
                                    row['originActor']['preferredUsername']]
                    file_writer.writerow(origin_actor)
            except:
                # skip lines that are not valid JSON or miss the expected fields.
                print file_index, '*' * 100
        file.close()
    file_save.close()
def calculate_lifecycle_for_each_tweet(tweet_dataFrame_dict, file_save_to_name, path_save_to):
    """
    Calculate the lifecycle of each tweet in the DataFrames as the difference
    between its end time and start time.
    :param tweet_dataFrame_dict: the dict holding all tweet DataFrames.
    :param file_save_to_name: the file that the per-tweet results are saved to.
    :param path_save_to: the path of file_save_to_name.
    :return: the updated tweet_dataFrame_dict.
    """
    file_save = open(path_save_to + file_save_to_name, 'wb')
    dataFrame_index = 0
    for key in tweet_dataFrame_dict.keys():
        dataFrame_index = dataFrame_index + 1
        print dataFrame_index, ': CALCULATING LIFECYCLE...', key, 'is being calculated......'
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(dataFrame_index) + ': CALCULATING LIFECYCLE... ' + str(key) + ' is being calculated......')
        tweet_dataFrame_dict[key]['lifecycle'] = 0
        for tweet_id in tweet_dataFrame_dict[key].index:
            start_time = tweet_dataFrame_dict[key].start_time[tweet_id]
            end_time = tweet_dataFrame_dict[key].end_time[tweet_id]
            # lifecycle in seconds: end_time - start_time.
            tweet_dataFrame_dict[key].loc[[tweet_id], ['lifecycle']] = (
                time.mktime(time.strptime(end_time, '%Y-%m-%dT%H:%M:%S.000Z'))
                - time.mktime(time.strptime(start_time, '%Y-%m-%dT%H:%M:%S.000Z')))
            tweet_dict = dict(tweet_dataFrame_dict[key].loc[tweet_id])
            tweet_dict['file_name'] = key
            line = json.dumps(tweet_dict) + '\n'
            file_save.write(line)
    file_save.close()
    return tweet_dataFrame_dict
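# A quick worked example of the timestamp arithmetic above (the sample
# timestamps are illustrative):
#
#   start = time.mktime(time.strptime('2016-03-01T10:00:00.000Z', '%Y-%m-%dT%H:%M:%S.000Z'))
#   end   = time.mktime(time.strptime('2016-03-01T10:30:00.000Z', '%Y-%m-%dT%H:%M:%S.000Z'))
#   end - start  # -> 1800.0 seconds, i.e. a 30-minute lifecycle
#
# time.mktime interprets the struct_time in local time; since only the
# difference of two identically parsed stamps is used, the offset cancels
# (up to DST edge cases).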
def build_tweet_dataFrame_dict(file_path):
    """
    Build the tweet_dataFrame_dict: one DataFrame per file, all of them stored
    in one dict keyed by file name.
    :param file_path: the path of the directory the files are in.
    :return: the tweet_dataFrame_dict and the number of Dubai actors.
    """
    tweet_dataFrame_dict = {}
    file_name_list = get_dirlist(file_path)
    for file_name in file_name_list:
        tweet_dataFrame_dict[file_name] = list()
    dubai_actor_dict = get_dubai_actor_dict(file_name_list=file_name_list, file_path=file_path)
    index = 0
    for file_name in file_name_list:
        index = index + 1
        print index, ': BUILDING TWEET DATAFRAME according to file:', file_name
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(index) + ': BUILDING TWEET DATAFRAME according to file: ' + str(file_name))
        tweet_dataFrame = build_tweet_dataFrame(file_name=file_name, file_path=file_path)
        tweet_dataFrame_dict[file_name] = tweet_dataFrame
    tweet_dataFrame_dict = add_origin_tweet_to_dataFrame(file_name_list=file_name_list, file_path=file_path,
                                                         tweet_dataFrame_dict=tweet_dataFrame_dict,
                                                         actor_dict=dubai_actor_dict)
    actor_number = 0
    for key in dubai_actor_dict.keys():
        actor_number = actor_number + len(dubai_actor_dict[key])
    return tweet_dataFrame_dict, actor_number
def update_tweet(file_path, tweet_dataFrame_dict):
    """
    Update the info of each tweet in the DataFrames according to the other tweets.
    :param file_path: the path of the directory all files are in.
    :param tweet_dataFrame_dict: the dict holding the tweet DataFrames.
    :return: the updated tweet_dataFrame_dict.
    """
    tweet_dataFrame_index_dict = get_tweet_dataFrame_index_dict(tweet_dataFrame_dict)
    file_name_list = get_dirlist(file_path)
    file_index = 0
    for file_name in file_name_list:
        file_index = file_index + 1
        print file_index, 'UPDATING INFO OF TWEET...', file_name, 'is being processed......'
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(file_index) + ': UPDATING INFO OF TWEET...' + str(file_name) + ' is being processed......')
        data_file = open(file_path + file_name, 'r')
        index = 0
        for line in data_file:
            index += 1
            row = json.loads(line)
            tweet_body = row['tweet']['body']
            # 'reply' type: update the info of the tweet that the reply replies to.
            if row['type'] == 'reply':
                tweet_id = '00' + row['tweet']['inReplyTo']
                tweet_index = whether_in_dict(str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index != None:
                    temp_time = compare_time(origin_time=tweet_dataFrame_dict[tweet_index].end_time[tweet_id],
                                             new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['reply_count']] += 1
            # 'tweet' type:
            # if the user retweeted someone's tweet and attached his own words, the body
            # embeds a twitter.com/.../status/... URL: update the retweeted tweet when it
            # is in a DataFrame. A tweet that only contains original content needs no update.
            elif row['type'] == 'tweet' and '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                tweet_body_content_list = tweet_body.split('://twitter.com/')
                tweet_id_content = [content.split('/status/')[1] for content in tweet_body_content_list
                                    if '/status/' in content][0]
                tweet_id = '00' + tweet_id_content[:18]
                tweet_index = whether_in_dict(str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index != None:
                    temp_time = compare_time(origin_time=tweet_dataFrame_dict[tweet_index].end_time[tweet_id],
                                             new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['retweet_count']] += 1
            # 'retweet' type: update the origin tweet, then also check the body
            # for an embedded status URL.
            elif row['type'] == 'retweet':
                origin_tweet_id = row['originTweet']['id']
                origin_tweet_index = whether_in_dict(str=origin_tweet_id, dictionary=tweet_dataFrame_index_dict)
                if origin_tweet_index != None:
                    temp_time = compare_time(origin_time=tweet_dataFrame_dict[origin_tweet_index].end_time[origin_tweet_id],
                                             new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[origin_tweet_index].loc[[origin_tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[origin_tweet_index].loc[[origin_tweet_id], ['retweet_count']] += 1
                if '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                    tweet_body_content_list = tweet_body.split('://twitter.com/')
                    tweet_id_content = [content.split('/status/')[1] for content in tweet_body_content_list
                                        if '/status/' in content][0]
                    tweet_id = '00' + tweet_id_content[:18]
                    tweet_index = whether_in_dict(str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                    if tweet_index != None:
                        temp_time = compare_time(origin_time=tweet_dataFrame_dict[tweet_index].end_time[tweet_id],
                                                 new_time=row['tweet']['postedTime'])
                        tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['end_time']] = temp_time
                        tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['retweet_count']] += 1
        data_file.close()
    return tweet_dataFrame_dict
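# A quick trace of the status-URL extraction used above (the body text is
# illustrative): splitting on '://twitter.com/' and then on '/status/' leaves
# the id plus any trailing path, and the first 18 characters are kept with a
# '00' prefix to match the id format used in the DataFrames.
#
#   body = 'RT check this https://twitter.com/someuser/status/705911462682371072/photo/1'
#   parts = body.split('://twitter.com/')  # ['RT check this https', 'someuser/status/705911462682371072/photo/1']
#   raw = [p.split('/status/')[1] for p in parts if '/status/' in p][0]
#   tweet_id = '00' + raw[:18]             # '00705911462682371072'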
def calculate_lifecycle():
    """
    Calculate the lifecycle of each tweet.
    :return: nothing.
    """
    path_data = 'D:/LiuQL/eHealth/twitter/data/data_dubai/'
    path_save_to = 'D:/LiuQL/eHealth/twitter/data/'
    # path_data = '/pegasus/harir/Qianlong/data/data_dubai/'
    # path_save_to = '/pegasus/harir/Qianlong/data/'
    file_save_to_name = 'tweet_lifecycle_apart.json'
    # path_data = raw_input('Please input the path which contains the data:')
    # path_save_to = raw_input('Please input the path of the directory where you want the RESULT FILE saved to:')
    # file_save_to_name = raw_input('Please input the file name that you want the result saved to (eg: result.json):')
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='######################## restart the program of calculating lifecycle. ########################')

    # calculate the lifecycle of each tweet.
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print 'the dataFrame of tweets is being built... please wait a moment.'
    tweet_dataFrame_dict, actor_number = build_tweet_dataFrame_dict(file_path=path_data)
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='tweet_dataFrame_dict is being saved to file.')
    pandas_dataFrame_to_file(operation_type='tweet_dataFrame', file_path=path_save_to,
                             dataFrame_dict=tweet_dataFrame_dict)
    print 'updating the "end time, retweet count, reply count" of each tweet... please wait a moment.'
    tweet_dataFrame_dict = update_tweet(file_path=path_data, tweet_dataFrame_dict=tweet_dataFrame_dict)
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='updated tweet_dataFrame_dict is being saved to file')
    pandas_dataFrame_to_file(operation_type='updated_tweet_dataFrame', file_path=path_save_to,
                             dataFrame_dict=tweet_dataFrame_dict)
    print 'calculating the lifecycle of each tweet... please wait a moment.'
    tweet_dataFrame_dict = calculate_lifecycle_for_each_tweet(tweet_dataFrame_dict=tweet_dataFrame_dict,
                                                              file_save_to_name=file_save_to_name,
                                                              path_save_to=path_save_to)
    tweet_dataFrame = merge_tweet_dataFrame(tweet_dataFrame_dict=tweet_dataFrame_dict)
    # delete variables that are no longer used.
    del tweet_dataFrame_dict

    # output the result.
    describe_dataFrame = tweet_dataFrame.describe()
    print '=================================================================\ndescribe of the result'
    print describe_dataFrame
    print '=================================================================\nlifecycle > 0:'
    print tweet_dataFrame[tweet_dataFrame['lifecycle'] > 0]
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print '================================================================='
    print 'start_time:', start_time
    print 'end_time:', end_time
    print 'total number of tweets:', str(len(tweet_dataFrame.index))
    print "total number of Dubai's actors:", actor_number
    print 'total number of tweets that have been replied to: ' + str(len(tweet_dataFrame[tweet_dataFrame['reply_count'] > 0].index))
    print 'total number of tweets that have been retweeted: ' + str(len(tweet_dataFrame[tweet_dataFrame['retweet_count'] > 0].index))
    print 'average reply count:', str(describe_dataFrame.reply_count['mean'])
    print 'average retweet count:', str(describe_dataFrame.retweet_count['mean'])
    print 'average lifecycle of tweets (seconds):', str(describe_dataFrame.lifecycle['mean'])
    print '================================================================='

    # save the result into a file.
    info_file = open(os.getcwd().replace('process', '') + 'calculate_lifecycle_info_apart.txt', 'wb')
    info_file.write('start time:' + str(start_time) + '\n')
    info_file.write('end time:' + str(end_time) + '\n')
    info_file.write('total number of tweets:' + str(len(tweet_dataFrame.index)) + '\n')
    info_file.write("total number of Dubai's actors:" + str(actor_number) + '\n')
    info_file.write('total number of tweets that have been replied to:' + str(len(tweet_dataFrame[tweet_dataFrame['reply_count'] > 0].index)) + '\n')
    info_file.write('total number of tweets that have been retweeted:' + str(len(tweet_dataFrame[tweet_dataFrame['retweet_count'] > 0].index)) + '\n')
    info_file.write('average reply count:' + str(describe_dataFrame.reply_count['mean']) + '\n')
    info_file.write('average retweet count:' + str(describe_dataFrame.retweet_count['mean']) + '\n')
    info_file.write('average lifecycle of tweets:' + str(describe_dataFrame.lifecycle['mean']) + ' seconds\n')
    info_file.close()

    # write the result into the log file.
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='start time:' + str(start_time))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='end time:' + str(end_time))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='total number of tweets:' + str(len(tweet_dataFrame.index)))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information="total number of Dubai's actors:" + str(actor_number))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='total number of tweets that have been replied to:' + str(len(tweet_dataFrame[tweet_dataFrame['reply_count'] > 0].index)))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='total number of tweets that have been retweeted:' + str(len(tweet_dataFrame[tweet_dataFrame['retweet_count'] > 0].index)))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='average reply count:' + str(describe_dataFrame.reply_count['mean']))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='average retweet count:' + str(describe_dataFrame.retweet_count['mean']))
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='average lifecycle of tweets:' + str(describe_dataFrame.lifecycle['mean']) + ' seconds')
    print '############## the result has been saved in:', os.getcwd().replace('process', '') + 'calculate_lifecycle_info_apart.txt'
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='The result has been saved in: ' + os.getcwd().replace('process', '') + 'calculate_lifecycle_info_apart.txt')
    write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
              information='************************ Successfully calculated the lifecycle for tweets. *********************\n' + '*' * 100 + '\n' + '*' * 100 + '\n' + '*' * 100 + '\n\n')