def main(category_path):
    """Score every baidu_baike_search category against the hierarchy levels.

    ``category_path`` looks like "<id>_..._<query_category>".  Writes one CSV
    row per category: "<category>,<score per level...>,<best_level>", where
    best_level is 1-based and -1 when all level scores are zero.
    """
    reload(sys)
    sys.setdefaultencoding("utf-8")  # legacy py2 hack for implicit str<->unicode
    category_path_list = category_path.split("_")
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode("utf-8")
    main_category_list = [query_category]
    file_utils.createDirs(["baidu_baike_search_hierarchy"])
    file_path_list = file_utils.getFilePathList(
        "../../scrapy/baidu_baike_search/clean_data/")
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list)
    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict,
                          category_hierarchy_score_dict, query_category)
    outfile = open("baidu_baike_search_hierarchy/" + str(category_path) + ".csv", "wb")
    try:
        for category in category_hierarchy_score_dict.keys():
            outlist = []
            # BUG FIX: iterate levels in sorted order.  Plain py2 dict key
            # order is arbitrary, which scrambled the per-level columns and
            # made best_level (= index + 1) meaningless.
            for level in sorted(category_hierarchy_score_dict[category].keys()):
                score_nomalize = 0
                if hierarchy_max_dict[level] != 0:
                    score_nomalize = (1.0 * category_hierarchy_score_dict[category][level]
                                      / hierarchy_max_dict[level])
                outlist.append(score_nomalize)
            best_level = -1
            # BUG FIX: guard the empty case — max([]) raises ValueError.
            if outlist and max(outlist) != 0:
                best_level = outlist.index(max(outlist)) + 1
            outfile.write(category + "," + ",".join([str(val) for val in outlist])
                          + "," + str(best_level) + "\r\n")
    finally:
        outfile.close()  # BUG FIX: handle was never closed
def main(category_path):
    """Run feature extraction for the category encoded in ``category_path``."""
    reload(sys)
    sys.setdefaultencoding('utf-8')  # legacy py2-wide implicit-encoding hack
    parts = category_path.split('_')
    category_id = int(parts[0])
    query_category = ""
    query_category = parts[-1].decode('utf-8')
    # Expand the query category into its set of relevant words.
    relevant_words = getMainCategoryRevelantWord(query_category)
    main_category_list = list(relevant_words)
    clean_paths = file_utils.getFilePathList(
        '../../scrapy/baidu_baike_search/clean_data/17/')
    category_info_dict = readCategoryInfo(clean_paths)
    file_utils.createDirs(['baidu_baike_search'])
    extractFeature(category_id, category_path, main_category_list,
                   category_info_dict)
def read(): print 'reading browse data' tag_fre_dict = {} main_sub_dict = {} file_path_list = file_utils.getFilePathList('data/browse_150708/') for file_path in file_path_list: if 'crc' in file_path or 'swp' in file_path or 'swo' in file_path: continue # print file_path infile = gzip.open(file_path) while True: row = infile.readline() if not row: break items = row.strip().split('\x01') logtime = items[0].decode('utf-8','ignore') ip = items[1].decode('utf-8','ignore') fm = items[2].decode('utf-8','ignore') # main_tag = "null" # for fm_item in fm.split('_'): # if 'tag' in fm_item and 'tagapplist' not in fm_item: # main_tag = fm_item # if 'tagapplist' in fm_item and main_tag != 'null': # main_sub_dict.setdefault(main_tag,{}).setdefault(fm_item,0) # main_sub_dict[main_tag][fm_item] += 1 pattern = re.compile(r'tag[^_]+') match_list = pattern.findall(fm) for item in match_list: tag_fre_dict.setdefault(item,0) tag_fre_dict[item] += 1 # break # for main_tag in main_sub_dict.keys(): # print main_tag # sorted_list = sorted(main_sub_dict[main_tag].items(),key=lambda p:p[1],reverse=True) # for val in sorted_list: # print ' '+val[0]+' '+str(val[1]) sorted_list = sorted(tag_fre_dict.items(),key=lambda p:p[1],reverse=True) for val in sorted_list: print val[0]+' '+str(val[1]) print len(sorted_list)
def main(category_path):
    """Feature-extraction entry point for one "<id>_..._<category>" path."""
    reload(sys)
    sys.setdefaultencoding('utf-8')  # py2 legacy default-encoding override
    pieces = category_path.split('_')
    category_id = int(pieces[0])
    query_category = ""
    query_category = pieces[-1].decode('utf-8')
    main_category_list = list(getMainCategoryRevelantWord(query_category))
    category_info_dict = readCategoryInfo(file_utils.getFilePathList(
        '../../scrapy/baidu_baike_search/clean_data/17/'))
    file_utils.createDirs(['baidu_baike_search'])
    extractFeature(category_id, category_path, main_category_list,
                   category_info_dict)
def read(): print 'reading browse data' file_path_list = file_utils.getFilePathList('data/search/') for file_path in file_path_list: if 'crc' in file_path or 'swp' in file_path or 'swo' in file_path: continue # print file_path infile = gzip.open(file_path) while True: row = infile.readline() if not row: break items = row.strip().split('\x01') log_time = items[0].decode('utf-8','ignore') ip = items[1].decode('utf-8','ignore') fm = items[2].decode('utf-8','ignore') inp = items[3].decode('gbk','ignore') print inp infile.close()
def main(category_path):
    """Extract features for one crawled baidu_baike category.

    ``category_path`` is "<id>_..._<query_category>"; the crawl's cleaned
    files are read from the directory named by the numeric id.
    """
    reload(sys)
    sys.setdefaultencoding('utf-8')  # py2 legacy implicit-encoding hack
    pieces = category_path.split('_')
    category_id = int(pieces[0])
    query_category = ""
    if len(pieces) >= 2:
        query_category = pieces[-1].decode('utf-8')
    main_category_list = [query_category]
    clean_dir = '../../scrapy/baidu_baike/crawl_data/' + str(category_id) + '/clean/'
    category_info_dict = readCategoryInfo(file_utils.getFilePathList(clean_dir))
    file_utils.createDirs(['baidu_baike'])
    sub_category_list = category_info_dict.keys()
    extractFeature(category_id, category_path, main_category_list,
                   sub_category_list, category_info_dict)
def main(category_path):
    """Score every baidu_baike_search category against the hierarchy levels.

    ``category_path`` looks like "<id>_..._<query_category>".  Writes one CSV
    row per category: "<category>,<score per level...>,<best_level>", where
    best_level is 1-based and -1 when all level scores are zero.
    """
    reload(sys)
    sys.setdefaultencoding('utf-8')  # legacy py2 hack for implicit str<->unicode
    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]
    file_utils.createDirs(['baidu_baike_search_hierarchy'])
    file_path_list = file_utils.getFilePathList(
        '../../scrapy/baidu_baike_search/clean_data/')
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list)
    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict,
                          category_hierarchy_score_dict, query_category)
    outfile = open(
        'baidu_baike_search_hierarchy/' + str(category_path) + '.csv', 'wb')
    try:
        for category in category_hierarchy_score_dict.keys():
            outlist = []
            # BUG FIX: iterate levels in sorted order.  Plain py2 dict key
            # order is arbitrary, which scrambled the per-level columns and
            # made best_level (= index + 1) meaningless.
            for level in sorted(category_hierarchy_score_dict[category].keys()):
                score_nomalize = 0
                if hierarchy_max_dict[level] != 0:
                    score_nomalize = (1.0 * category_hierarchy_score_dict[category][level]
                                      / hierarchy_max_dict[level])
                outlist.append(score_nomalize)
            best_level = -1
            # BUG FIX: guard the empty case — max([]) raises ValueError.
            if outlist and max(outlist) != 0:
                best_level = outlist.index(max(outlist)) + 1
            outfile.write(category + ',' + ','.join([str(val) for val in outlist])
                          + ',' + str(best_level) + '\r\n')
    finally:
        outfile.close()  # BUG FIX: handle was never closed
def main(category_path):
    """Score every crawled baidu_baike category against the hierarchy levels.

    ``category_path`` looks like "<id>_..._<query_category>".  Writes one CSV
    row per category: "<category>,<score per level...>,<best_level>", where
    best_level is 1-based and -1 when all level scores are zero.
    """
    reload(sys)
    sys.setdefaultencoding('utf-8')  # legacy py2 hack for implicit str<->unicode
    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]
    file_utils.createDirs(['baidu_baike_hierarchy'])
    file_path_list = file_utils.getFilePathList(
        '../../scrapy/baidu_baike/crawl_data/' + str(category_id) + '/clean/')
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list)
    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict,
                          category_hierarchy_score_dict, query_category)
    outfile = open('baidu_baike_hierarchy/' + str(category_path) + '.csv', 'wb')
    try:
        for category in category_hierarchy_score_dict.keys():
            outlist = []
            # BUG FIX: iterate levels in sorted order.  Plain py2 dict key
            # order is arbitrary, which scrambled the per-level columns and
            # made best_level (= index + 1) meaningless.
            for level in sorted(category_hierarchy_score_dict[category].keys()):
                score_nomalize = 0
                if hierarchy_max_dict[level] != 0:
                    score_nomalize = (1.0 * category_hierarchy_score_dict[category][level]
                                      / hierarchy_max_dict[level])
                outlist.append(score_nomalize)
            best_level = -1
            # BUG FIX: guard the empty case — max([]) raises ValueError.
            if outlist and max(outlist) != 0:
                best_level = outlist.index(max(outlist)) + 1
            outfile.write(category + ',' + ','.join([str(val) for val in outlist])
                          + ',' + str(best_level) + '\r\n')
    finally:
        outfile.close()  # BUG FIX: handle was never closed