Example #1
def get_all_dataFrame(data_path):
    """
    Build one DataFrame per hashed file and collect all the DataFrames in a dict keyed by file name.
    :param data_path: the path where the hashed files are stored.
    :return: the dict holding the DataFrames.
    """
    file_name_list = get_dirlist(path=data_path, key_word_list=['hash_qianlong'])
    dataFrame_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(), information=str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...')
        print time.ctime(), str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...'
        data = pd.read_csv(data_path + file_name, header = None)
        data.columns = ['tweet_id', 'origin_tweet_id', 'from_user','from_user_id','to_user','to_user_id', 'tweet_time', 'origin_tweet_time', 'type']
        data = data[data.origin_tweet_time != 'null']
        data = data[data.type != 'mention']
        del data['from_user']
        del data['from_user_id']
        del data['to_user']
        del data['to_user_id']
        data.index = data.tweet_id
        dataFrame_dict[file_name] = data
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(), information='tweet_dataFrame has been built, total number: ' + str(len(dataFrame_dict)))

    return dataFrame_dict
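
A minimal usage sketch for this loader (hypothetical; it reuses a data path that appears in Example #7 and assumes get_dirlist and write_log from the same module are importable):

data_path = 'D:/LiuQL/eHealth/twitter/data/data_hash/'  # placeholder path
dataFrame_dict = get_all_dataFrame(data_path=data_path)
for name in dataFrame_dict.keys():
    # each value is a DataFrame of retweet/reply rows indexed by tweet_id
    print name, len(dataFrame_dict[name].index)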
Example #2
def get_dubai_actor_dict(file_name_list, file_path):
    """
    Collect the actors of each Dubai file into a per-file list, used later to judge whether an originTweet was posted by a Dubai actor.
    :param file_name_list: the list of all file names.
    :param file_path: the path of the directory where all files are saved.
    :return: a dict mapping each file name to the list of its actor ids.
    """
    actor_dict = {}
    index = 0
    for file_name in file_name_list:
        actor_dict[file_name] = list()

    for file_name in file_name_list:
        index = index + 1
        actor_list = []
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(index) +
                  ': BUILDING actor list of Dubai, file:' + file_name +
                  ' is being processed...')
        print str(index), ': BUILDING actor dict of Dubai, file:' + file_name + ' is being processed...'
        file = open(file_path + file_name, 'r')
        for line in file:
            row = json.loads(line)
            if row['actor']['id'] not in actor_list:
                actor_list.append(row['actor']['id'])
            else:
                pass
        file.close()
        actor_dict[file_name] = actor_list
    return actor_dict
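
The `not in actor_list` membership test above is linear in the length of the list; a set-based sketch of the same per-file collection (assuming the same one-JSON-object-per-line input format):

import json

def get_actor_set(file_name, file_path):
    # set membership is O(1), so large files avoid a quadratic scan
    actor_set = set()
    data_file = open(file_path + file_name, 'r')
    for line in data_file:
        actor_set.add(json.loads(line)['actor']['id'])
    data_file.close()
    return actor_set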
Example #3
def main():
    file_path = 'D:/LiuQL/eHealth/twitter/visualization/node_edge/'
    node_file_name = 'labeled_community_nodes_2000.csv'
    cohesion_type = 'hash'
    edge_file_name = 'community_edges_2000.csv'
    edge_file_path = 'D:/test/edge_hash/'
    edge_file_key_word_list = ['edge_hash_', '.csv']

    # Dubai server
    # file_path ='/pegasus/harir/Qianlong/data/network/'
    # node_file_name ='total_node_degree.csv'
    # cohesion_type = 'hash'
    # edge_file_name = 'total_edge_weight.csv'
    # edge_file_path = '/pegasus/harir/Qianlong/data/network/edge_hash/'
    # edge_file_key_word_list = ['edge_hash_','.csv']

    write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),
              information='*' * 50 + 'Starting Calculate cohesion' + '*' * 50)
    cohesion_file = 'cohesion_community_thread.json'
    file = open(os.getcwd() + '/' + cohesion_file, 'wb')
    file.close()
    calculate_cohesion_whole_network(file_path=file_path, node_file_name=node_file_name,
                                     cohesion_type=cohesion_type, cohesion_file=cohesion_file,
                                     edge_file_name=None, edge_file_path=edge_file_path,
                                     edge_file_key_word_list=edge_file_key_word_list)
    write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),
              information='*' * 20 + 'Program Done' + '*' * 20 + '\n' * 4)
Example #4
def main():
    write_log(
        log_file_name='find_verified_user.log',
        log_file_path=os.getcwd(),
        information='###################program start#####################.')
    path_data = 'D:/LiuQL/eHealth/twitter/data/data_origin/'
    path_save_to = 'D:/LiuQL/eHealth/twitter/data/data_origin/'
    # path_data = '/pegasus/twitter-p-or-t-uae-201603.json.dxb/'
    # path_save_to ='/pegasus/harir/Qianlong/data/network/'

    duplicate_user_file = 'user_contain_duplicates.txt'
    all_user_file = 'user_all.txt'
    verified_user_file = 'user_verified.txt'
    find_user(path_data=path_data,
              path_save_to=path_save_to,
              file_save_to=duplicate_user_file)
    drop_duplicate_user(path_data=path_save_to,
                        path_save_to=path_save_to,
                        actor_file=duplicate_user_file,
                        all_user_file=all_user_file,
                        verified_user_file=verified_user_file)
    write_log(
        log_file_name='find_verified_user.log',
        log_file_path=os.getcwd(),
        information='###################program finished#####################.'
        + '\n' * 5)
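
write_log is called in every example but is not itself shown; a minimal reconstruction consistent with its call sites (a hypothetical sketch, not the original helper):

import os
import time

def write_log(log_file_name, log_file_path, information):
    # append one timestamped line per call
    log_file = open(os.path.join(log_file_path, log_file_name), 'a')
    log_file.write(time.ctime() + ' ' + information + '\n')
    log_file.close()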
Example #5
    def run(self):
        if self.number_of_edges == 0:
            self.get_community_edges()
            self.calculate_degree()
        else:
            pass
        user_id_list = list(set(list(self.node_dataFrame.user_id)))
        temp_number_i_j = 0
        for index_first in range(0, len(user_id_list), 1):
            for index_second in range(0, len(user_id_list), 1):
                if index_first != index_second:
                    self.number_i_j = self.number_i_j + 1
                    node_i = user_id_list[index_first]
                    node_j = user_id_list[index_second]
                    node_i_out_degree = self.node_dataFrame.out_degree[node_i]
                    node_j_in_degree = self.node_dataFrame.in_degree[node_j]
                    wether_edge = self.wether_interaction_between_nodes(node_i=node_i, node_j=node_j)
                    self.cohesion = self.cohesion + (wether_edge - float(
                        node_i_out_degree) * node_j_in_degree / self.number_of_edges) / self.number_of_edges
                    if wether_edge == 1.0:
                        self.number_i_j_edge = self.number_i_j_edge + 1
                    print '\n' + '*' * 140
                    print 'community id:', self.community_id, '; node_i:', node_i, '; node_j:', node_j, '; cohesion:', self.cohesion, '; index first:', index_first, '; index_second:', index_second
                    print 'node i out_degree:', node_i_out_degree, '; node j in_degree:', node_j_in_degree, '; whether edge exists:', wether_edge, '; number_i_j_edge:', self.number_i_j_edge, '; number_i_j:', self.number_i_j
                    if self.number_i_j - temp_number_i_j >= 100000:
                        temp_number_i_j = self.number_i_j
                        write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),
                                  information='Calculating cohesion. Cohesion:' + str(
                                      self.cohesion) + '; community_id:' + str(
                                      self.community_id) + ' Number_i_j:' + str(self.number_i_j))
                else:
                    pass
        self.lock.acquire()
        cohesion_dict = dict(
            community_id=int(self.community_id),
            cohesion=self.cohesion,
            number_of_edges=int(self.number_of_edges),
            number_i_j=int(self.number_i_j),
            number_i_j_edge=int(self.number_i_j_edge)
        )
        write_log(log_file_name='network_cohesion_community_thread.log', log_file_path=os.getcwd(),information='################ Community_id: ' + str(self.community_id) + ' is over.' + str(cohesion_dict))

        print cohesion_dict
        row = json.dumps(cohesion_dict) + '\n'
        file = open(os.getcwd() + '/' + self.cohesion_file, 'a+')
        file.write(row)
        file.close()
        self.lock.release()
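
The double loop accumulates a directed, modularity-style score: cohesion = (1/m) * sum over all i != j of (A_ij - d_out(i) * d_in(j) / m), where m is the number of edges. A vectorized sketch of the same quantity (hypothetical; assumes the community's graph is available as a dense numpy adjacency matrix):

import numpy as np

def cohesion_vectorized(A):
    # A[i, j] = 1.0 if there is an edge from node i to node j, else 0.0
    m = float(A.sum())             # number of directed edges
    out_degree = A.sum(axis=1)     # d_out(i)
    in_degree = A.sum(axis=0)      # d_in(j)
    Q = (A - np.outer(out_degree, in_degree) / m) / m
    np.fill_diagonal(Q, 0.0)       # the loop skips i == j
    return Q.sum()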
Example #6
def add_origin_tweet_to_dataFrame(file_name_list, file_path,
                                  tweet_dataFrame_dict, actor_dict):
    """
    Add each originTweet to the corresponding tweet DataFrame.
    :param file_name_list: the list of all file names.
    :param file_path: the path of the directory where all files are.
    :param tweet_dataFrame_dict: the dict of tweet DataFrames, keyed by file name.
    :param actor_dict: the dict mapping each file name to its list of Dubai actor ids.
    :return: the updated tweet_dataFrame_dict.
    """
    index = 0
    columns = [
        'tweet_id', 'start_time', 'end_time', 'reply_count', 'retweet_count'
    ]
    for file_name in file_name_list:
        index = index + 1
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(index) +
                  ': Adding originTweet to tweet_dataFrame file:' + file_name +
                  ' is being processed...')
        print str(index), ': Adding originTweet to tweet_dataFrame file:' + file_name + ' is being processed...'
        file = open(file_path + file_name, 'r')

        for line in file:
            row = json.loads(line)
            if row['type'] == 'retweet':
                origin_actor_id = row['originActor']['id']
                origin_tweet_id = row['originTweet']['id']
                actor_index = whether_in_dict(type_dict='actor_dict',
                                              str=origin_actor_id,
                                              dictionary=actor_dict)
                tweet_index = whether_in_dict(
                    type_dict='index_dict',
                    str=origin_tweet_id,
                    dictionary=get_tweet_dataFrame_index_dict(
                        tweet_dataFrame_dict))
                if actor_index is not None and tweet_index is None:
                    new_line = pd.DataFrame(data=[[
                        row['originTweet']['id'],
                        row['originTweet']['postedTime'],
                        row['originTweet']['postedTime'], 0.0, 1.0
                    ]],
                                            index=[row['originTweet']['id']],
                                            columns=columns)
                    tweet_dataFrame_dict[file_name] = tweet_dataFrame_dict[
                        file_name].append(new_line)
            else:
                pass
        file.close()
    return tweet_dataFrame_dict
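
whether_in_dict is not among these examples; judging from its call sites here and in Example #14, it returns the key of the dict entry whose list contains the given value, or None when nothing matches. A hypothetical reconstruction:

def whether_in_dict(str, dictionary, type_dict=None):
    # `str` and `type_dict` mirror the keywords used at the call sites;
    # type_dict only labels the kind of dict and is not needed for the lookup.
    for key in dictionary:
        if str in dictionary[key]:
            return key
    return None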
Example #7
def calculate_lifecycle():
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
              information='############################## start program ################################')
    data_path = 'D:/LiuQL/eHealth/twitter/data/data_hash/'
    path_save_to = 'D:/LiuQL/eHealth/twitter/data/'
    file_name_save_to = 'tweet_originTweet_error.csv'
    # data_path = '/pegasus/harir/Qianlong/data/hash/'
    # path_save_to = '/pegasus/harir/Qianlong/data/project_data/twitter_hash_dataFrame/'
    dataFrame_dict = get_all_dataFrame(data_path=data_path)
    print 'tweet_dataFrame has been built.'
    build_tweet(dataFrame_dict=dataFrame_dict,path_save_to=path_save_to, file_name_save_to=file_name_save_to)
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
              information='############################## program end ################################' + '\n' * 4)
Example #8
def build_tweet(dataFrame_dict,path_save_to,file_name_save_to):
    """
    Find the root tweet of every tweet and save the result as (tweet_id, tweet_time, root_tweet_id, root_tweet_time) rows. One file is written per DataFrame in dataFrame_dict, named after its key, i.e. the same as the source file.
    :param dataFrame_dict: the dataFrame_dict containing the source file data.
    :param path_save_to: the path where the results are saved.
    :param file_name_save_to: the file name that erroneous rows are saved to.
    :return: nothing is returned.
    """
    # lifecycle_dataFrame = pd.DataFrame()
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),information='Finding root tweet for each tweet')
    column = ['tweet_id', 'tweet_time', 'origin_tweet_id', 'origin_tweet_time']
    count = 0
    temp_count = 0
    total_number = 0
    for key in dataFrame_dict.keys():
        total_number = total_number + len(dataFrame_dict[key])
    file_dict = {}
    file_list = []
    for key in dataFrame_dict.keys():
        file = open(path_save_to + key,'wb')
        file_dict[key] = csv.writer(file)
        file_list.append(file)
    file_error = open(path_save_to + file_name_save_to,'wb')
    error_writer  = csv.writer(file_error)
    file_list.append(file_error)
    key_number = 0
    for key in dataFrame_dict.keys():
        key_number = key_number + 1
        tweet_dataFrame = dataFrame_dict[key]
        for index in tweet_dataFrame.index:
            tweet_id = tweet_dataFrame.tweet_id[index]
            tweet_time = tweet_dataFrame.tweet_time[index]
            origin_tweet_id, origin_tweet_time, depth = find_root_tweet(dataFrame_dict=dataFrame_dict, tweet_id=tweet_dataFrame.origin_tweet_id[index], depth=0)
            # if origin_tweet_id != None and tweet_id != origin_tweet_id:
            if origin_tweet_id is not None and origin_tweet_id is not False:
                # line = pd.DataFrame(data = [[tweet_id,tweet_time,origin_tweet_id, origin_tweet_time]],index = [index],columns=column)
                # lifecycle_dataFrame = lifecycle_dataFrame.append(line,ignore_index=False)
                file_dict[key].writerow([tweet_id, tweet_time, origin_tweet_id, origin_tweet_time])
                print 'key_number:', key_number, 'number:', count, 'total_number:', total_number, 'depth:', depth, tweet_id, tweet_time, origin_tweet_id, origin_tweet_time
            elif origin_tweet_id is False:
                error_writer.writerow([tweet_id, tweet_time, origin_tweet_id, origin_tweet_time])
                print 'key_number:', key_number, 'Error!! number:', count, 'total_number:', total_number, 'depth:', depth, tweet_id, tweet_time, origin_tweet_id, origin_tweet_time
            count += 1
            if count - temp_count >= 10000:
                write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(), information='key_number:' + str(key_number) + ' Finding root tweet, total_number:' + str(total_number) + ', finished_number:' + str(count))
                temp_count = count
    for file in file_list:
        file.close()
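
find_root_tweet itself is not shown. From the call above it apparently follows origin_tweet_id links until a tweet is its own origin, returning (root_id, root_time, depth), with False values signalling a chain that leaves the loaded data. A hypothetical reconstruction under those assumptions:

def find_root_tweet(dataFrame_dict, tweet_id, depth, max_depth=100):
    # walk up the origin chain; each DataFrame is indexed by tweet_id
    for key in dataFrame_dict.keys():
        frame = dataFrame_dict[key]
        if tweet_id in frame.index:
            parent_id = frame.origin_tweet_id[tweet_id]
            if parent_id == tweet_id or depth >= max_depth:
                return tweet_id, frame.tweet_time[tweet_id], depth
            return find_root_tweet(dataFrame_dict, parent_id, depth + 1)
    # the chain points outside the loaded data
    return False, False, depth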
Example #9
    def run(self):
        user_id_list = list(set(list(self.node_dataFrame.user_id)))
        temp_number_i_j = 0
        for index_first in range(0, len(user_id_list), 1):
            for index_second in range(0, len(user_id_list), 1):
                if index_first != index_second:
                    self.number_i_j = self.number_i_j + 1
                    node_i = user_id_list[index_first]
                    node_j = user_id_list[index_second]
                    node_i_out_degree = self.node_dataFrame.out_degree[node_i]
                    node_j_in_degree = self.node_dataFrame.in_degree[node_j]
                    wether_edge = self.wether_interaction_between_nodes(
                        node_i=node_i,
                        node_j=node_j,
                        edge_dataFrame_idct=self.edge_dataFrame,
                        hash_size=100,
                        cohesion_type=self.cohesion_type)
                    self.cohesion = self.cohesion + (
                        wether_edge -
                        float(node_i_out_degree) * node_j_in_degree /
                        self.number_of_edges) / self.number_of_edges
                    if wether_edge == 1.0:
                        self.number_i_j_edge = self.number_i_j_edge + 1
                    print '\n' + '*' * 140
                    print 'community id:', self.community_id, '; node_i:', node_i, '; node_j:', node_j, '; cohesion:', self.cohesion, '; index first:', index_first, '; index_second:', index_second
                    print 'node i out_degree:', node_i_out_degree, '; node j in_degree:', node_j_in_degree, '; whether edge exists:', wether_edge, '; number_i_j_edge:', self.number_i_j_edge, '; number_i_j:', self.number_i_j
                    if self.number_i_j - temp_number_i_j >= 100000:
                        temp_number_i_j = self.number_i_j
                        write_log(
                            log_file_name='network_cohesion_whole_thread.log',
                            log_file_path=os.getcwd(),
                            information='Calculating cohesion. Cohesion:' +
                            str(self.cohesion) + '; community_id:' +
                            str(self.community_id) + ' Number_i_j:' +
                            str(self.number_i_j))
                else:
                    pass
        self.lock.acquire()
        write_log(log_file_name='network_cohesion_whole_thread.log',
                  log_file_path=os.getcwd(),
                  information='################ Community_id: ' +
                  str(self.community_id) + ' is over. cohesion is: ' +
                  str(self.cohesion) + '#####################')
        self.lock.release()
Example #10
def merge_tweet_dataFrame(tweet_dataFrame_dict):
    """
    Merge all tweet DataFrames into one DataFrame.
    :param tweet_dataFrame_dict: the dict holding all tweet DataFrames.
    :return: the merged DataFrame.
    """
    tweet_dataFrame = pd.DataFrame()
    index = 0
    for key in tweet_dataFrame_dict.keys():
        index = index + 1
        print index, 'MERGING DATAFRAME...', index, 'dataFrame is being merged...'
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(index) + ': MERGING DATAFRAME... ' +
                  str(index) + ' dataFrame is being merged...')
        tweet_dataFrame = tweet_dataFrame.append(tweet_dataFrame_dict[key])
    return tweet_dataFrame
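
Appending inside a loop copies the accumulated frame on every iteration (and DataFrame.append was removed in pandas 2.x); a sketch of the usual alternative, collecting the frames and concatenating once:

import pandas as pd

def merge_tweet_dataFrame_concat(tweet_dataFrame_dict):
    # one concatenation instead of repeated appends; row indexes are preserved
    return pd.concat(list(tweet_dataFrame_dict.values()))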
Example #11
def find_user(path_data, path_save_to, file_save_to):
    file_name_list = get_dirlist(path=path_data,
                                 key_word_list=['part-r', '.json', '33b34d49'],
                                 no_key_word_list=['crc'])
    print len(file_name_list)
    time.sleep(40)
    file_save = open(path_save_to + file_save_to, 'wb')
    file_writer = csv.writer(file_save)
    print file_name_list
    file_index = 0
    for file_name in file_name_list:
        file_index = file_index + 1
        file = open(path_data + file_name, 'r')
        write_log(log_file_name='find_verified_user.log',
                  log_file_path=os.getcwd(),
                  information='file index:' + str(file_index) +
                  ' is being processed.')
        for line in file:
            try:
                print len(line)
                row = json.loads(line, object_pairs_hook=OrderedDict)
                actor = [
                    row['actor']['id'], row['actor']['verified'],
                    row['actor']['preferredUsername']
                ]
                file_writer.writerow(actor)
                print 'file index:', file_index, actor
                if row['type'] == 'retweet':
                    origin_actor = [
                        row['originActor']['id'],
                        row['originActor']['verified'],
                        row['originActor']['preferredUsername']
                    ]
                    file_writer.writerow(origin_actor)
                else:
                    pass
            except Exception:
                # malformed line or missing field: skip this line
                print file_index, '*' * 100
        file.close()
    file_save.close()
Example #12
def calculate_lifecycle_for_each_tweet(tweet_dataFrame_dict, file_save_to_name,
                                       path_save_to):
    """
    Calculate the lifecycle of each tweet in the DataFrames according to its end time and start time.
    :param tweet_dataFrame_dict: the dict holding all tweet DataFrames.
    :param file_save_to_name: the file that the per-tweet results are saved to.
    :param path_save_to: the path of file_save_to_name.
    :return: the updated tweet_dataFrame_dict.
    """
    file_save = open(path_save_to + file_save_to_name, 'wb')
    dataFrame_index = 0
    for key in tweet_dataFrame_dict.keys():
        dataFrame_index = dataFrame_index + 1
        print dataFrame_index, ': CALCULATING LIFECYCLE...', dataFrame_index, 'dataFrame is being calculated......'
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(dataFrame_index) +
                  ': CALCULATING LIFECYCLE...   ' + str(dataFrame_index) +
                  ':dataFrame is being calculated......')
        tweet_dataFrame_dict[key]['lifecycle'] = 0
        for tweet_id in tweet_dataFrame_dict[key].index:
            start_time = tweet_dataFrame_dict[key].start_time[tweet_id]
            end_time = tweet_dataFrame_dict[key].end_time[tweet_id]
            tweet_dataFrame_dict[key].loc[
                [tweet_id], ['lifecycle']] = time.mktime(
                    time.strptime(end_time,
                                  '%Y-%m-%dT%H:%M:%S.000Z')) - time.mktime(
                                      time.strptime(start_time,
                                                    '%Y-%m-%dT%H:%M:%S.000Z'))
            tweet_dict = dict(tweet_dataFrame_dict[key].loc[tweet_id])
            tweet_dict['file_name'] = key
            line = json.dumps(tweet_dict) + '\n'
            # print index, 'CALCULATING LIFECYCLE...', index, 'were calculated and writen to file'
            file_save.write(line)
    file_save.close()
    return tweet_dataFrame_dict
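
Note that time.mktime interprets the parsed struct_time as local time even though the '...Z' timestamps are UTC; the offset cancels in the subtraction except across a DST boundary. A vectorized sketch of the same computation (hypothetical; assumes the start_time/end_time columns use the format above):

import pandas as pd

def add_lifecycle_column(frame):
    # parse the ISO-8601 UTC timestamps once, then subtract column-wise
    fmt = '%Y-%m-%dT%H:%M:%S.000Z'
    start = pd.to_datetime(frame['start_time'], format=fmt, utc=True)
    end = pd.to_datetime(frame['end_time'], format=fmt, utc=True)
    frame['lifecycle'] = (end - start).dt.total_seconds()
    return frame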
Example #13
def build_tweet_dataFrame_dict(file_path):
    """
    Build the tweet_dataFrame_dict: one DataFrame per file, holding that file's data, keyed by file name.
    :param file_path: the path of the directory the files are in.
    :return: the tweet_dataFrame_dict and the number of Dubai actors.
    """
    tweet_dataFrame_dict = {}
    file_name_list = get_dirlist(file_path)
    for file_name in file_name_list:
        tweet_dataFrame_dict[file_name] = list()

    dubai_actor_dict = get_dubai_actor_dict(file_name_list=file_name_list,
                                            file_path=file_path)
    index = 0
    for file_name in file_name_list:
        index = index + 1
        print index, ': BUILDING TWEET DATAFRAME according to file:', index, file_name
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(index) +
                  ': BUILDING TWEET DATAFRAME according to file: ' +
                  str(file_name))
        tweet_dataFrame = build_tweet_dataFrame(file_name=file_name,
                                                file_path=file_path)
        tweet_dataFrame_dict[file_name] = tweet_dataFrame

    tweet_dataFrame_dict = add_origin_tweet_to_dataFrame(
        file_name_list=file_name_list,
        file_path=file_path,
        tweet_dataFrame_dict=tweet_dataFrame_dict,
        actor_dict=dubai_actor_dict)

    actor_number = 0
    for key in dubai_actor_dict.keys():
        actor_number = actor_number + len(dubai_actor_dict[key])
    return tweet_dataFrame_dict, actor_number
Example #14
def update_tweet(file_path, tweet_dataFrame_dict):
    """
    Update the info of each tweet in the DataFrames according to the other tweets.
    :param file_path: the path of the directory all files are in.
    :param tweet_dataFrame_dict: the dict containing the tweet DataFrames.
    :return: the updated tweet_dataFrame_dict.
    """
    tweet_dataFrame_index_dict = get_tweet_dataFrame_index_dict(
        tweet_dataFrame_dict)
    file_name_list = get_dirlist(file_path)
    file_index = 0
    for file_name in file_name_list:
        file_index = file_index + 1
        print file_index, 'UPDATING INFO OF TWEET...', file_name, 'is being processed......'
        write_log(log_file_name='calculate_lifecycle_own_apart.log',
                  log_file_path=os.getcwd(),
                  information=str(file_index) + ': UPDATING INFO OF TWEET...' +
                  str(file_name) + 'is being processed......')
        data_file = open(file_path + file_name, 'r')
        index = 0
        for line in data_file:
            index += 1
            row = json.loads(line)
            tweet_body = row['tweet']['body']

            # 'reply' type: update the info of the tweet that the reply replies to.
            if row['type'] == 'reply':
                tweet_id = "00" + row['tweet']['inReplyTo']
                tweet_index = whether_in_dict(
                    str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index is not None:
                    temp_time = compare_time(
                        origin_time=tweet_dataFrame_dict[tweet_index].
                        end_time[tweet_id],
                        new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[
                        [tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id],
                                                          ['reply_count']] += 1
                    # print index, 'PROCESSING TWEET... tweet type:', row[ 'type'], 'inReplyTo in the dataFrame and update "reply_count and end_time', '00' + row['tweet']['inReplyTo']
                else:
                    pass

            # 'tweet' type.
            # if the user retweeted someone's tweet and attached his own words: update the info of the retweeted tweet if it is in the DataFrames.
            # if the user posted a new tweet containing only his own original content: do nothing.
            elif row['type'] == 'tweet' and '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                tweet_body_content_list = tweet_body.split('://twitter.com/')
                tweet_id_content = [
                    content.split('/status/')[1]
                    for content in tweet_body_content_list
                    if '/status/' in content
                ][0]
                tweet_id = '00' + tweet_id_content[:18]
                tweet_index = whether_in_dict(
                    str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index is not None:
                    temp_time = compare_time(
                        origin_time=tweet_dataFrame_dict[tweet_index].
                        end_time[tweet_id],
                        new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[
                        [tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[
                        [tweet_id], ['retweet_count']] += 1
                    # print index, 'PROCESSING TWEET... tweet type:', row['type'], 'update "end_time and retweet_count" of tweet:', tweet_id
                else:
                    # print index , 'PROCESSING TWEET... tweet type:', row['type'], 'tweet:', tweet_id,'not in the dataFrame'
                    pass
            # 'retweet' type
            elif row['type'] == 'retweet':
                origin_tweet_id = row['originTweet']['id']
                origin_tweet_index = whether_in_dict(
                    str=origin_tweet_id, dictionary=tweet_dataFrame_index_dict)
                if origin_tweet_index is not None:
                    temp_time = compare_time(
                        origin_time=tweet_dataFrame_dict[origin_tweet_index].
                        end_time[origin_tweet_id],
                        new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[origin_tweet_index].loc[
                        [origin_tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[origin_tweet_index].loc[
                        [origin_tweet_id], ['retweet_count']] += 1
                    # print index, 'PROCESSING TWEET... tweet type:', row['type'], 'originweet in the dataFrame and update "end_time and retweet_count" of tweet:', tweet_id
                else:
                    # print index , 'PROCESSING TWEET... tweet type:', row['type'], 'originTweet not in the dataFrame'
                    pass
                if '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                    tweet_body_content_list = tweet_body.split(
                        '://twitter.com/')
                    tweet_id_content = [
                        content.split('/status/')[1]
                        for content in tweet_body_content_list
                        if '/status/' in content
                    ][0]
                    tweet_id = '00' + tweet_id_content[:18]
                    tweet_index = whether_in_dict(
                        str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                    if tweet_index is not None:
                        temp_time = compare_time(
                            origin_time=tweet_dataFrame_dict[tweet_index].
                            end_time[tweet_id],
                            new_time=row['tweet']['postedTime'])
                        tweet_dataFrame_dict[tweet_index].loc[
                            [tweet_id], ['end_time']] = temp_time
                        tweet_dataFrame_dict[tweet_index].loc[
                            [tweet_id], ['retweet_count']] += 1
                        # print index, 'PROCESSING TWEET... tweet type:', row['type'], 'body has twitter url, and updata "end_time and retweet_count" of tweet:', tweet_id
                    else:
                        # print index,  'PROCESSING TWEET... tweet type:', row['type'], 'body has twitter url, but not in the dataFrmae '
                        pass

        data_file.close()
    return tweet_dataFrame_dict
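
The id extraction above splits on '://twitter.com/' and '/status/' and keeps 18 characters; a regex sketch of the same extraction (a hypothetical helper, keeping the '00' prefix convention used by the DataFrame index):

import re

STATUS_ID = re.compile(r'://twitter\.com/[^/]+/status/(\d+)')

def extract_status_id(tweet_body):
    match = STATUS_ID.search(tweet_body)
    if match is None:
        return None
    # keep the '00' prefix and 18-character width used above
    return '00' + match.group(1)[:18]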
Example #15
def calculate_lifecycle():
    """
    calculate lifecycle for each tweet.
    :return: Nothing to return.
    """
    path_data = 'D:/LiuQL/eHealth/twitter/data/data_dubai/'
    path_save_to = 'D:/LiuQL/eHealth/twitter/data/'

    # path_data = '/pegasus/harir/Qianlong/data/data_dubai/'
    # path_save_to = '/pegasus/harir/Qianlong/data/'

    file_save_to_name = 'tweet_lifecycle_apart.json'

    # path_data = raw_input('Please input the FILES which contain the data:')
    # path_save_to = raw_input('Please input the path of directory where you want the RESULT FILE saves to:')
    # file_save_to_name = raw_input('Please input the file name that you want the result saved to (eg:result.json):')

    write_log(
        log_file_name='calculate_lifecycle_own_apart.log',
        log_file_path=os.getcwd(),
        information=
        '######################## restart the program of calculating lifecycle. ########################'
    )

    # calculate lifecycle for each tweet.
    start_time = time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(time.time()))
    print 'the dataFrame of tweets is being built..., please wait a moment.'
    tweet_dataFrame_dict, actor_number = build_tweet_dataFrame_dict(
        file_path=path_data)

    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information='tweet_dataFrame_dict is being saved to file.')
    pandas_dataFrame_to_file(operation_type='tweet_dataFrame',
                             file_path=path_save_to,
                             dataFrame_dict=tweet_dataFrame_dict)

    print 'updating the "end time, retweet count, reply count" of each tweet..., please wait a moment.'
    tweet_dataFrame_dict = update_tweet(
        file_path=path_data, tweet_dataFrame_dict=tweet_dataFrame_dict)

    write_log(
        log_file_name='calculate_lifecycle_own_apart.log',
        log_file_path=os.getcwd(),
        information='updated tweet_dataFrame_dict is being saved to file')
    pandas_dataFrame_to_file(operation_type='updated_tweet_dataFrame',
                             file_path=path_save_to,
                             dataFrame_dict=tweet_dataFrame_dict)

    print 'calculating the lifecycle of each tweet..., please wait a moment.'
    tweet_dataFrame_dict = calculate_lifecycle_for_each_tweet(
        tweet_dataFrame_dict=tweet_dataFrame_dict,
        file_save_to_name=file_save_to_name,
        path_save_to=path_save_to)
    tweet_dataFrame = merge_tweet_dataFrame(
        tweet_dataFrame_dict=tweet_dataFrame_dict)

    # delete variables that are no longer needed
    del tweet_dataFrame_dict

    # output the result.
    describe_dataFrame = tweet_dataFrame.describe()
    print '=================================================================\ndescribe of the result'
    print describe_dataFrame
    print '=================================================================\nlifecycle > 0:'
    print tweet_dataFrame[tweet_dataFrame['lifecycle'] > 0]
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print '================================================================='
    print 'start_time:', start_time
    print 'end_time:', end_time
    print "total number of tweets:", str(len(tweet_dataFrame.index))
    print "total number of Dubai's actor:", actor_number
    print "total number of tweets that been replied:" + str(
        len(tweet_dataFrame[tweet_dataFrame['reply_count'] > 0].index))
    print "total number of tweets that been retweeded:" + str(
        len(tweet_dataFrame[tweet_dataFrame['retweet_count'] > 0].index))
    print "average reply count:", str(describe_dataFrame.reply_count['mean'])
    print "average retweet count:", str(
        describe_dataFrame.retweet_count['mean'])
    print "average lifecycle of tweets (seconds):", str(
        describe_dataFrame.lifecycle['mean'])
    print '================================================================='

    # save the result into file.
    info_file = open(
        os.getcwd().replace('process', '') +
        'calculate_lifecycle_info_apart.txt', 'wb')
    info_file.write("start time:" + str(start_time) + '\n')
    info_file.write("end time:" + str(end_time) + '\n')
    info_file.write("total number of tweets:" +
                    str(len(tweet_dataFrame.index)) + '\n')
    info_file.write("total number of Dubai's actor:" + str(actor_number) +
                    '\n')
    info_file.write(
        "total number of tweets that been replied:" +
        str(len(tweet_dataFrame[tweet_dataFrame['reply_count'] > 0].index)) +
        '\n')
    info_file.write(
        "total number of tweets that been retweeded:" +
        str(len(tweet_dataFrame[tweet_dataFrame['retweet_count'] > 0].index)) +
        '\n')
    info_file.write("average reply count:" +
                    str(describe_dataFrame.reply_count['mean']) + '\n')
    info_file.write("average retweet count:" +
                    str(describe_dataFrame.retweet_count['mean']) + '\n')
    info_file.write("average lifecycle of tweets:" +
                    str(describe_dataFrame.lifecycle['mean']) + ' seconds\n')
    info_file.close()

    # write the result into log file.
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information="start time:" + str(start_time))
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information="end time:" + str(end_time))
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information="total number of tweets:" +
              str(len(tweet_dataFrame.index)))
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information="total number of Dubai's actor:" + str(actor_number))
    write_log(
        log_file_name='calculate_lifecycle_own_apart.log',
        log_file_path=os.getcwd(),
        information="total number of tweets that have been replied to:" +
        str(len(tweet_dataFrame[tweet_dataFrame['reply_count'] > 0].index)))
    write_log(
        log_file_name='calculate_lifecycle_own_apart.log',
        log_file_path=os.getcwd(),
        information="total number of tweets that have been retweeted:" +
        str(len(tweet_dataFrame[tweet_dataFrame['retweet_count'] > 0].index)))
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information="average reply count:" +
              str(describe_dataFrame.reply_count['mean']))
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information="average retweet count:" +
              str(describe_dataFrame.retweet_count['mean']))
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information="average lifecycle of tweets:" +
              str(describe_dataFrame.lifecycle['mean']) + ' seconds')

    print '##############the result has been saved in:', os.getcwd().replace(
        'process', '') + 'calculate_lifecycle_info_apart.txt'
    write_log(log_file_name='calculate_lifecycle_own_apart.log',
              log_file_path=os.getcwd(),
              information='The result has been saved in:' +
              os.getcwd().replace('process', '') +
              'calculate_lifecycle_info_apart.txt')
    write_log(
        log_file_name='calculate_lifecycle_own_apart.log',
        log_file_path=os.getcwd(),
        information=
        '************************ Successfully calculated the lifecycle for tweets. *********************\n'
        + '*' * 100 + '\n' + '*' * 100 + '\n' + '*' * 100 + '\n\n')