def get_all_dataFrame(data_path):
    """
    Build a single DataFrame from all files. The file format is
    ['tweet_id', 'origin_tweet_id', 'from_user', 'from_user_id', 'to_user',
     'to_user_id', 'tweet_time', 'origin_tweet_time', 'type'].
    :param data_path: the directory where the files are stored.
    :return: the DataFrame built from all files.
    """
    file_name_list = get_dirlist(path=data_path, key_word_list=['hash_qianlong'])
    dataFrame_list = []
    tweet_id_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        print time.ctime(), str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...'
        data = pd.read_csv(data_path + file_name, header=None)
        data.columns = ['tweet_id', 'origin_tweet_id', 'from_user', 'from_user_id',
                        'to_user', 'to_user_id', 'tweet_time', 'origin_tweet_time', 'type']
        # drop the user columns, which are not needed downstream
        del data['from_user']
        del data['from_user_id']
        del data['to_user']
        del data['to_user_id']
        data.index = data.tweet_id
        tweet_id_dict[file_name] = list(data.tweet_id)
        dataFrame_list.append(data)
    tweet_dataFrame = pd.concat(dataFrame_list, ignore_index=False)
    tweet_dataFrame.index = tweet_dataFrame.tweet_id
    print tweet_dataFrame
    return tweet_dataFrame
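# A minimal usage sketch for get_all_dataFrame; the directory below is a
# hypothetical placeholder for wherever the 'hash_qianlong' CSV files live.
def _example_get_all_dataFrame():
    data_path = '/pegasus/harir/Qianlong/data/hash/'  # hypothetical path
    tweet_dataFrame = get_all_dataFrame(data_path=data_path)
    # the result is indexed by tweet_id, so single tweets can be looked up directly
    print tweet_dataFrame.head()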
def read_csv(data_path):
    """
    Build a dataFrame_dict from files whose rows have the format
    (tweet_id, tweet_time, root_tweet_id, root_tweet_time). Each file's data is
    stored in one DataFrame, keyed in the dict by the file name.
    :param data_path: the directory where the files are stored.
    :return: the resulting dataFrame_dict.
    """
    file_name_list = get_dirlist(path=data_path, key_word_list=['hash_qianlong'])
    dataFrame_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        print time.ctime(), str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...'
        data = pd.read_csv(data_path + file_name, header=None)
        data.columns = ['tweet_id', 'tweet_time', 'origin_tweet_id', 'origin_tweet_time']
        data = data[data.origin_tweet_time != 'null']
        # lifecycle = (time of this retweet/reply) - (time of the root tweet), in seconds
        data['lifecycle'] = data.tweet_time.apply(time_timestamp) - data.origin_tweet_time.apply(time_timestamp)
        del data['tweet_id']
        data.columns = ['end_time', 'tweet_id', 'start_time', 'lifecycle']
        data = data.drop_duplicates()
        dataFrame_dict[file_name] = data
    return dataFrame_dict
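# time_timestamp is assumed to be imported from the project's utility module
# and to turn a time string into epoch seconds, so that the subtraction above
# yields the lifecycle in seconds. A minimal sketch under that assumption
# (the format string here is a guess, not necessarily the project's actual one):
def _time_timestamp_sketch(time_str):
    return time.mktime(time.strptime(time_str, '%Y-%m-%d %H:%M:%S'))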
def get_all_dataFrame(data_path):
    """
    Build one DataFrame per hashed file and collect them all in a dict whose
    keys are the file names.
    :param data_path: the directory where the hashed files are stored.
    :return: the dict holding the DataFrames.
    """
    file_name_list = get_dirlist(path=data_path, key_word_list=['hash_qianlong'])
    dataFrame_dict = {}
    index = 0
    for file_name in file_name_list:
        index += 1
        write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
                  information=str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...')
        print time.ctime(), str(index) + ': Reading file to dataFrame: ' + file_name + ' is being read...'
        data = pd.read_csv(data_path + file_name, header=None)
        data.columns = ['tweet_id', 'origin_tweet_id', 'from_user', 'from_user_id',
                        'to_user', 'to_user_id', 'tweet_time', 'origin_tweet_time', 'type']
        # keep only rows that have a known root tweet and are not bare mentions
        data = data[data.origin_tweet_time != 'null']
        data = data[data.type != 'mention']
        del data['from_user']
        del data['from_user_id']
        del data['to_user']
        del data['to_user_id']
        data.index = data.tweet_id
        dataFrame_dict[file_name] = data
    write_log(log_file_name='find_root_tweet.log', log_file_path=os.getcwd(),
              information='tweet_dataFrame has been built, total number: ' + str(len(dataFrame_dict)))
    return dataFrame_dict
def calculate_cohesion_whole_network(file_path, node_file_name, cohesion_type, cohesion_file,
                                     edge_file_name=None, edge_file_path=None, edge_file_key_word_list=None):
    """
    Compute network cohesion. The edge file is hash-partitioned so edges can be
    looked up quickly. Since cohesion between different communities is 0, the
    node file is processed one community at a time.
    Two modes are supported: the whole network, whose edge file must be stored
    as hash partitions, and individual communities, which need no hashing.
    :param file_path: the directory of the node file (and, for individual communities, of the edge file as well).
    :param node_file_name: node file name; the first row holds the column names. Both modes
        use the same format (user_id, community_id, out_degree, in_degree, degree).
    :param cohesion_type: 'hash' to process the whole network from hash-partitioned edge
        files; any other value reads a single community edge file.
    :param edge_file_name: the edge file when processing individual communities; its first
        row holds the column names (source, target, number_of_interaction, weight).
    :param edge_file_path: the directory holding the hashed edge files when processing the whole network.
    :param edge_file_key_word_list: keywords selecting the hashed edge files when processing the whole network.
    :return: the computed result.
    """
    node_dataFrame = pd.read_csv(file_path + node_file_name, dtype={'user_id': np.str})
    node_dataFrame.index = node_dataFrame.user_id
    if cohesion_type == 'hash':
        edge_file_list = get_dirlist(path=edge_file_path, key_word_list=edge_file_key_word_list)
        print len(edge_file_list)
        print edge_file_list
        edge_dataFrame_dict = {}
        for edge_file_name in edge_file_list:
            # the partition number is encoded in the file name: ...hash_<number>.<ext>
            number = int((edge_file_name.split('hash_')[-1]).split('.')[0])
            edge_dataFrame_dict[number] = pd.read_csv(edge_file_path + edge_file_name,
                                                      dtype={'source': np.str, 'target': np.str})
    else:
        edge_dataFrame_dict = pd.read_csv(file_path + edge_file_name, header=0,
                                          dtype={'source': np.str, 'target': np.str})
    lock = threading.Lock()
    community_id_list = list(set(node_dataFrame.community_id))
    print 'number of community:', len(community_id_list)
    # one worker thread per community; the shared lock serializes result writes
    thread_list = []
    for community_id in community_id_list:
        community_node_dataFrame = node_dataFrame[node_dataFrame.community_id == community_id]
        thread = cohesionThread(edge_dataFrame=edge_dataFrame_dict,
                                node_dataFrame=community_node_dataFrame,
                                community_id=community_id, cohesion_file=cohesion_file, lock=lock)
        thread.start()
        thread_list.append(thread)
    for thread in thread_list:
        thread.join()
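# A usage sketch for calculate_cohesion_whole_network in 'hash' mode. All paths
# and file names below are hypothetical placeholders; the hashed edge files must
# be named so that 'hash_<number>' precedes the extension, since the partition
# number is parsed out of the file name above.
def _example_calculate_cohesion():
    calculate_cohesion_whole_network(
        file_path='/pegasus/harir/Qianlong/data/network/',                 # hypothetical
        node_file_name='community_nodes.csv',                              # hypothetical
        cohesion_type='hash',
        cohesion_file='cohesion_result.csv',                               # hypothetical
        edge_file_path='/pegasus/harir/Qianlong/data/network/edge_hash/',  # hypothetical
        edge_file_key_word_list=['hash_', '.csv'])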
def classify_data():
    path_data = raw_input('Please input the path of directory where the FILES NEEDED to be CLASSIFIED are:')
    path_save_to = raw_input('Please input the path of directory where you want the RESULT FILE saved to:')
    file_save_to = open(path_save_to + 'total_data.json', 'wb')
    file_name_list = get_dirlist(path=path_data, key_word_list=['part-', '.json'])
    index = 0
    for file_name in file_name_list:
        index += 1
        print index, file_name, 'is being classified......'
        read_file(file_name=file_name, path_data=path_data, file_save_to=file_save_to)
    file_save_to.close()
    print 'The result file has been saved to:', path_save_to + 'total_data.json'
def find_user(path_data, path_save_to, file_save_to):
    file_name_list = get_dirlist(path=path_data,
                                 key_word_list=['part-r', '.json', '33b34d49'],
                                 no_key_word_list=['crc'])
    print len(file_name_list)
    time.sleep(40)  # pause so the file list can be checked before processing starts
    file_save = open(path_save_to + file_save_to, 'wb')
    file_writer = csv.writer(file_save)
    print file_name_list
    file_index = 0
    for file_name in file_name_list:
        file_index += 1
        file = open(path_data + file_name, 'r')
        write_log(log_file_name='find_verified_user.log', log_file_path=os.getcwd(),
                  information='file index:' + str(file_index) + ' is being processed.')
        for line in file:
            try:
                print len(line)
                row = json.loads(line, object_pairs_hook=OrderedDict)
                actor = [row['actor']['id'], row['actor']['verified'],
                         row['actor']['preferredUsername']]
                file_writer.writerow(actor)
                print 'file index:', file_index, actor
                if row['type'] == 'retweet':
                    origin_actor = [row['originActor']['id'], row['originActor']['verified'],
                                    row['originActor']['preferredUsername']]
                    file_writer.writerow(origin_actor)
            except Exception:
                # skip lines that are not valid JSON or that miss the expected fields
                print file_index, '*' * 100
        file.close()
    file_save.close()
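# For reference, the fields find_user reads from each JSON line look roughly
# like this (illustrative values only; real records carry many more fields):
# {"type": "retweet",
#  "actor": {"id": "id:twitter.com:123", "verified": false, "preferredUsername": "alice"},
#  "originActor": {"id": "id:twitter.com:456", "verified": true, "preferredUsername": "bob"}}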
def __init__(self, topic_words_file_path, font_path=None, key_word_list=list(),
             no_key_words_list=list(), max_topic_number=4):
    """
    Initialize a topic word-cloud instance.
    :param topic_words_file_path: the directory holding the topic top-words files.
    :param font_path: font path; defaults to None, but must be set on Windows.
    :param key_word_list: keywords a file name must contain when reading topic-words files from the directory.
    :param no_key_words_list: keywords a file name must not contain when reading topic-words files from the directory.
    :param max_topic_number: the maximum number of topics to draw as word clouds per community.
    :return: nothing.
    """
    self.topic_words_filename_list = get_dirlist(topic_words_file_path,
                                                 key_word_list=key_word_list,
                                                 no_key_word_list=no_key_words_list)
    self.topic_words_file_path = topic_words_file_path
    self.community_topics = {}  # topics of each community; only read once that community is actually plotted
    self.community_file = {}  # topic-words files of each community; one community may have several files, so they are recorded here and read directly when plotting
    self.font_path = font_path  # some scripts, e.g. Arabic, require a specific font
    self.error_community_id_list = []
    self.full_width_community_id_list = []
    self.max_topic_number = max_topic_number
    community_id_list = []
    for file_name in self.topic_words_filename_list:
        community_id = int(file_name.split('-')[1])
        community_id_list.append(community_id)
    self.community_id_list = list(set(community_id_list))  # the ids of all communities
    for community_id in community_id_list:
        self.community_file[community_id] = []
    for file_name in self.topic_words_filename_list:
        community_id = int(file_name.split('-')[1])
        self.community_file[community_id].append(file_name)
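# A usage sketch, assuming this __init__ belongs to a topic word-cloud class
# (the class name below is hypothetical) and that topic file names carry the
# community id in their second '-'-separated field, as parsed above:
def _example_topic_word_cloud():
    cloud = TopicWordCloud(  # hypothetical class name
        topic_words_file_path='/pegasus/harir/Qianlong/data/topic_words/',  # hypothetical
        font_path=None,
        key_word_list=['topic-'],  # hypothetical file-name keyword
        no_key_words_list=['.crc'],
        max_topic_number=4)
    print 'communities found:', len(cloud.community_id_list)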
def filter_data(self, origin_file_path, reserved_data_save_to, filtered_data_save_to):
    self.reserved_data_save_to = reserved_data_save_to
    self.filtered_data_save_to = filtered_data_save_to
    self.origin_file_path = origin_file_path
    # collect the names of all original json files in the directory
    file_name_list = get_dirlist(origin_file_path,
                                 key_word_list=['part-r', '.json'],
                                 no_key_word_list=['crc'])
    index = 0
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    for file_name in file_name_list:
        index += 1
        print index, file_name, 'is being parsed......'
        self.__read_json__(file_name)
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    parse_info_file = open(os.getcwd() + '/filter_data_info.log', 'wb')
    parse_info_file.write('start time:' + str(start_time) + '\n')
    parse_info_file.write('end time:' + str(end_time) + '\n')
    parse_info_file.write('total number of files parsed:' + str(index) + '\n')
    parse_info_file.write('total number of Dubai tweets:' + str(self.dubai_data_number) + '\n')
    parse_info_file.write('total number of non-Dubai tweets:' + str(self.no_dubai_data_number) + '\n')
    parse_info_file.close()
    print '================================================================='
    print 'start_time:', start_time
    print 'end_time:', end_time
    print 'total number of Dubai tweets:', self.dubai_data_number
    print 'total number of non-Dubai tweets:', self.no_dubai_data_number
    print '================================================================='
def build_tweet_dataFrame_dict(file_path):
    """
    Build tweet_dataFrame_dict: each DataFrame holds the data of one file, and
    all DataFrames are put into one dict keyed by file name.
    :param file_path: the path of the directory the files are in.
    :return: the tweet_dataFrame_dict and the number of Dubai actors.
    """
    tweet_dataFrame_dict = {}
    file_name_list = get_dirlist(file_path)
    for file_name in file_name_list:
        tweet_dataFrame_dict[file_name] = list()
    dubai_actor_dict = get_dubai_actor_dict(file_name_list=file_name_list, file_path=file_path)
    index = 0
    for file_name in file_name_list:
        index += 1
        print index, ': BUILDING TWEET DATAFRAME according to file:', file_name
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(index) + ': BUILDING TWEET DATAFRAME according to file: ' + str(file_name))
        tweet_dataFrame = build_tweet_dataFrame(file_name=file_name, file_path=file_path)
        tweet_dataFrame_dict[file_name] = tweet_dataFrame
    tweet_dataFrame_dict = add_origin_tweet_to_dataFrame(file_name_list=file_name_list,
                                                         file_path=file_path,
                                                         tweet_dataFrame_dict=tweet_dataFrame_dict,
                                                         actor_dict=dubai_actor_dict)
    actor_number = 0
    for key in dubai_actor_dict.keys():
        actor_number += len(dubai_actor_dict[key])
    return tweet_dataFrame_dict, actor_number
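# A minimal usage sketch (hypothetical path): build the per-file DataFrames and
# report how many Dubai actors were collected along the way.
def _example_build_tweet_dataFrame_dict():
    file_path = '/pegasus/harir/Qianlong/data/json/'  # hypothetical
    tweet_dataFrame_dict, actor_number = build_tweet_dataFrame_dict(file_path=file_path)
    print 'files:', len(tweet_dataFrame_dict), 'Dubai actors:', actor_number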
def get_community_nodes(userId_communityId_file_path_name, path_save_to,
                        community_size_file, sep, names):
    nodes_dataFrame = pd.read_csv(userId_communityId_file_path_name, sep=sep,
                                  header=None, names=names,
                                  dtype={'user_id': np.str, 'community': np.int32})
    print 'get it'
    print nodes_dataFrame
    community_list = set(list(nodes_dataFrame.community))
    community_file = open(path_save_to + community_size_file, 'wb')
    writer = csv.writer(community_file)
    writer.writerow(['community_id', 'number_of_user'])
    for community in community_list:
        number = len(nodes_dataFrame[nodes_dataFrame.community == community])
        writer.writerow([community, number])
    community_file.close()


userId_communityId_file_path = '/pegasus/harir/yangjinfeng/commitresult/community2/'
path_save_to = '/pegasus/harir/Qianlong/data/network/community_size/'
userId_communityId_file_list = get_dirlist(userId_communityId_file_path,
                                           key_word_list=['node-com.txt'])
for file in userId_communityId_file_list:
    community_size_file_name = file.replace('.icpm.node-com.txt', '_communityId_size.csv')
    get_community_nodes(userId_communityId_file_path_name=userId_communityId_file_path + file,
                        path_save_to=path_save_to,
                        community_size_file=community_size_file_name,
                        sep=',',
                        names=['user_id', 'community'])
# for words_type in words_type_list:
#     path = top_words_path + directory_name + '/' + words_type + '/'
#     print path
#     words_file = path + get_dirlist(path, key_word_list=[words_type])[0]
#     print words_file
#     save_to = image_save_to + directory_name + '/' + words_type + '/'
#     cloud = commmunityTopWordCloud(top_words_path_file=words_file,
#                                    background_color=background_color, font_path=font_path)
#     print 'number of communities:', len(cloud.community_id_list)
#     cloud.plot_word_cloud(image_save_to=save_to, file_name_key_word=words_type,
#                           number_of_community=number_of_community,
#                           community_id_list=[], full_width_community=False)
#     print directory_name, words_type, cloud.full_width_community_id_list

# for April
for directory_name in directory_list:
    for words_type in words_type_list:
        path = top_words_path + directory_name + '/' + words_type + '/'
        print path
        words_file = path + get_dirlist(path, key_word_list=['all-langs'])[0]
        print words_file
        save_to = image_save_to + directory_name + '/'
        cloud = commmunityTopWordCloud(top_words_path_file=words_file,
                                       background_color=background_color,
                                       font_path=font_path)
        print 'number of communities:', len(cloud.community_id_list)
        cloud.plot_word_cloud(image_save_to=save_to,
                              file_name_key_word=words_type,
                              number_of_community=number_of_community,
                              community_id_list=[],
                              full_width_community=False)
        print directory_name, words_type, cloud.full_width_community_id_list
cycle_list = ['01_07', '08_14', '15_21', '22_28', '29_30']
community_file_path = '/pegasus/harir/yangjinfeng/commitresult4/community2/inoutOrder/'
overlap_user_file_path = '/pegasus/harir/yangjinfeng/commitresult4/community2/'
network_edge_file_path = '/pegasus/harir/yangjinfeng/commitresult4/network/'
path_community_node_edge_save_to = '/pegasus/harir/Qianlong/data/April/network/node_edge/'
id_label_file = network_edge_file_path + 'kloutScore_iDname.txt'
verified_user_file = network_edge_file_path + 'kloutScore_iDname.txt'
community_size = 2000
community_number = 8
number_of_top_users = 1000
label_users_number = 20

for cycle in cycle_list:
    community_user_ordered_file = get_dirlist(path=community_file_path,
                                              key_word_list=[cycle, 'icpm_ordered'])[0]
    edge_file = get_dirlist(path=network_edge_file_path,
                            key_word_list=[cycle, '-network_weighted'])[0]
    overlap_user_file = get_dirlist(path=overlap_user_file_path,
                                    key_word_list=[cycle, 'icpm.overlap.txt'])[0]
    print community_user_ordered_file
    print edge_file
    print overlap_user_file
    time.sleep(20)  # pause so the selected files can be checked
    print community_user_ordered_file + ' is being processed.'
    print '*' * 100
    save_node_file_name = community_user_ordered_file.replace('.icpm_ordered', '') \
        + '_nodes_top_' + str(number_of_top_users) + '_contain_verified' + '.csv'
def update_tweet(file_path, tweet_dataFrame_dict):
    """
    Update the info of each tweet in the DataFrames according to other tweets.
    :param file_path: the path of the directory that all files are in.
    :param tweet_dataFrame_dict: the dict containing the tweet DataFrames.
    :return: the updated tweet_dataFrame_dict.
    """
    tweet_dataFrame_index_dict = get_tweet_dataFrame_index_dict(tweet_dataFrame_dict)
    file_name_list = get_dirlist(file_path)
    file_index = 0
    for file_name in file_name_list:
        file_index += 1
        print file_index, 'UPDATING INFO OF TWEET...', file_name, 'is being processed......'
        write_log(log_file_name='calculate_lifecycle_own_apart.log', log_file_path=os.getcwd(),
                  information=str(file_index) + ': UPDATING INFO OF TWEET...' + str(file_name) + ' is being processed......')
        data_file = open(file_path + file_name, 'r')
        index = 0
        for line in data_file:
            index += 1
            row = json.loads(line)
            tweet_body = row['tweet']['body']
            if row['type'] == 'reply':
                # 'reply' type: update the info of the tweet the reply replies to.
                tweet_id = '00' + row['tweet']['inReplyTo']
                tweet_index = whether_in_dict(str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index != None:
                    temp_time = compare_time(origin_time=tweet_dataFrame_dict[tweet_index].end_time[tweet_id],
                                             new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['reply_count']] += 1
            elif row['type'] == 'tweet' and '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                # 'tweet' type: if the user retweeted someone's tweet and attached his own
                # words, the body embeds a twitter.com/.../status/ URL; update the info of the
                # retweeted tweet if it is in a DataFrame. A new tweet containing only the
                # user's own original content needs no update.
                tweet_body_content_list = tweet_body.split('://twitter.com/')
                tweet_id_content = [content.split('/status/')[1]
                                    for content in tweet_body_content_list
                                    if '/status/' in content][0]
                tweet_id = '00' + tweet_id_content[:18]
                tweet_index = whether_in_dict(str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                if tweet_index != None:
                    temp_time = compare_time(origin_time=tweet_dataFrame_dict[tweet_index].end_time[tweet_id],
                                             new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['retweet_count']] += 1
            elif row['type'] == 'retweet':
                # 'retweet' type: update the retweeted origin tweet, and also any tweet whose
                # status URL is embedded in the retweet's body.
                origin_tweet_id = row['originTweet']['id']
                origin_tweet_index = whether_in_dict(str=origin_tweet_id,
                                                     dictionary=tweet_dataFrame_index_dict)
                if origin_tweet_index != None:
                    temp_time = compare_time(origin_time=tweet_dataFrame_dict[origin_tweet_index].end_time[origin_tweet_id],
                                             new_time=row['tweet']['postedTime'])
                    tweet_dataFrame_dict[origin_tweet_index].loc[[origin_tweet_id], ['end_time']] = temp_time
                    tweet_dataFrame_dict[origin_tweet_index].loc[[origin_tweet_id], ['retweet_count']] += 1
                if '://twitter.com/' in tweet_body and '/status/' in tweet_body:
                    tweet_body_content_list = tweet_body.split('://twitter.com/')
                    tweet_id_content = [content.split('/status/')[1]
                                        for content in tweet_body_content_list
                                        if '/status/' in content][0]
                    tweet_id = '00' + tweet_id_content[:18]
                    tweet_index = whether_in_dict(str=tweet_id, dictionary=tweet_dataFrame_index_dict)
                    if tweet_index != None:
                        temp_time = compare_time(origin_time=tweet_dataFrame_dict[tweet_index].end_time[tweet_id],
                                                 new_time=row['tweet']['postedTime'])
                        tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['end_time']] = temp_time
                        tweet_dataFrame_dict[tweet_index].loc[[tweet_id], ['retweet_count']] += 1
        data_file.close()
    return tweet_dataFrame_dict
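# Standalone illustration of the embedded-tweet-id extraction used above: a
# body embedding '...://twitter.com/<user>/status/<id>...' yields the first 18
# digits of the id, which are prefixed with '00' to match the DataFrame index
# format.
def _extract_embedded_tweet_id(tweet_body):
    tweet_body_content_list = tweet_body.split('://twitter.com/')
    tweet_id_content = [content.split('/status/')[1]
                        for content in tweet_body_content_list
                        if '/status/' in content][0]
    return '00' + tweet_id_content[:18]

# e.g. _extract_embedded_tweet_id('RT https://twitter.com/alice/status/712345678901234567890')
# returns '00712345678901234567' (illustrative id).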
def main():
    if len(sys.argv) != 3:
        print "Unknown Option \n usage: python %s file.scel new.txt" % (sys.argv[0])
        exit(1)
    # If the scel path is a directory, every .scel file in it is processed and
    # the results are combined into one txt file.
    if os.path.isdir(sys.argv[1]):
        for fileName in glob.glob(sys.argv[1] + '*.scel'):
            print fileName
            generator = get_word_from_sogou_cell_dict(fileName)
            with open(sys.argv[2], "a") as f:
                store(generator, f)
    else:
        generator = get_word_from_sogou_cell_dict(sys.argv[1])
        with open(sys.argv[2], "w") as f:
            store(generator, f)


if __name__ == "__main__":
    # main()
    path = './../resource/dict_scel/'
    file_name_list = get_dirlist(path=path, key_word_list=['.scel'])
    print file_name_list
    for file in file_name_list:
        print type(file)
        print file
        generator = get_word_from_sogou_cell_dict(path + file)
        with open('./../resource/dict_txt/' + file.replace('.scel', '.txt'), "w") as f:
            store(generator, f)
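# Example command-line usage of main() above (the script name is a placeholder):
#   python scel_to_txt.py ./../resource/dict_scel/ combined.txt   # whole directory, results appended
#   python scel_to_txt.py some_dict.scel some_dict.txt            # single .scel file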
# label_users_number = 20
# save_node_file_name = 'community_nodes.csv'
# save_edge_file_name = 'community_edges.csv'
#
# community_network = CommunityNetwork(community_size=community_size, community_number=community_number)
# community_network.get_community_nodes(user_community_path_file=userId_communityId_file)
# community_network.get_community_top_nodes(number_of_top_users=number_of_top_users,
#                                           community_user_ordered_path_file=community_user_ordered_file,
#                                           filter_verified_user=True,
#                                           verified_user_path_file=verified_user_file)
# community_network.get_community_edges(total_edge_weight_path_file=total_edge_file, sep=',', wether_hash=False)
# community_network.filter_verified_user(verified_user_path_file=verified_user_file)
# community_network.label_nodes(top_node_size=label_users_number, label_path_file=id_label_file)
# community_network.community_nodes_dataFrame.to_csv(path_community_node_edge_save_to + save_node_file_name,
#                                                    index=False, header=True,
#                                                    columns=['id', 'community_id', 'label'])
# community_network.community_edges_dataFrame.to_csv(path_community_node_edge_save_to + save_edge_file_name,
#                                                    index=False, header=True,
#                                                    columns=['source', 'target', 'weight'])

community_file_path = '/pegasus/harir/yangjinfeng/commitresult/community2/'
community_user_ordered_file_list = get_dirlist(path=community_file_path,
                                               key_word_list=['icpm_ordered'])
print len(community_user_ordered_file_list)
print community_user_ordered_file_list
time.sleep(20)  # pause so the file list can be checked
path_community_node_edge_save_to = '/pegasus/harir/Qianlong/data/network/node_edge/'
qianlong_network_path = '/pegasus/harir/Qianlong/data/network/'
community_size = 2000
community_number = 8
number_of_top_users = 1000
label_users_number = 20
id_label_file = qianlong_network_path + 'user_all_yang.csv'
verified_user_file = qianlong_network_path + 'user_verified_long.csv'
total_edge_file = '/pegasus/harir/sunweiwei/weight/total/' + 'total_network_weight'
for community_user_ordered_file in community_user_ordered_file_list:
    print community_user_ordered_file + ' is being processed.'
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
from utility.functions import get_dirlist
import pandas as pd
import time
import numpy as np

file_path = 'D:/node_edge/'
file_name_list = get_dirlist(path=file_path,
                             key_word_list=['nodes', '2016-'],
                             no_key_word_list=['total'])
print len(file_name_list)
print file_name_list
time.sleep(10)  # pause so the file list can be checked

# one column per day, recording each node's presence on that day
node_dataFrame = pd.DataFrame()
node_dataFrame['id'] = None
node_dataFrame['label'] = None
node_dataFrame['2016-03-23'] = None
node_dataFrame['2016-03-24'] = None
node_dataFrame['2016-03-25'] = None
node_dataFrame['2016-03-26'] = None
node_dataFrame['2016-03-27'] = None
node_dataFrame['2016-03-28'] = None
node_dataFrame['2016-03-29'] = None
node_dataFrame['2016-03-30'] = None
node_dataFrame['2016-03-31'] = None
day_list = [
    '2016-03-23', '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
    '2016-03-28', '2016-03-29', '2016-03-30', '2016-03-31'
            return True
        else:
            return True
    else:
        return True


if __name__ == '__main__':
    origin_data_path = '/pegasus/twitter-p-or-t-uae-201603.json.dxb/'
    save_path = '/pegasus/harir/Qianlong/data/March/'
    # origin_data_path = 'F:/Twitter/April/'
    # save_path = 'F:/Twitter/April/'
    # file_name_list = get_dirlist(origin_data_path, key_word_list=['201604', '.json'])
    file_name_list = get_dirlist(origin_data_path,
                                 key_word_list=['f424-4f7c-b21c-33b34d491577', '.json'],
                                 no_key_word_list=['.crc'])
    # file_name_list = ['twitter-p-or-t-uae-201604.json']
    print len(file_name_list)
    print file_name_list
    time.sleep(10)  # pause so the file list can be checked
    total_number = 0
    no_dubai_number = 0
    dubai_strict_number = 0
    dubai_no_strict_number = 0
    for file_name in file_name_list:
        no_dubai_file = 'no_dubai_' + file_name
        dubai_strict_file = 'dubai_strict_' + file_name
        dubai_no_strict_file = 'dubai_no_strict_' + file_name
        filter_data = FilterData(origin_data_file=origin_data_path + file_name)