def main(category_path):
    """Build a category hierarchy from crawled baidu_baike_search data and
    write per-category level scores to a CSV file.

    category_path -- string of the form "<id>" or "<id>_..._<category name>";
        the trailing component (when present) is the query category.
    Side effects: creates directory "baidu_baike_search_hierarchy" and writes
    "<category_path>.csv" inside it.
    """
    # Py2-only: allow implicit utf-8 coercion between str and unicode.
    reload(sys)
    sys.setdefaultencoding("utf-8")

    category_path_list = category_path.split("_")
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode("utf-8")
    main_category_list = [query_category]

    file_utils.createDirs(["baidu_baike_search_hierarchy"])
    file_path_list = file_utils.getFilePathList("../../scrapy/baidu_baike_search/clean_data/")
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list)

    # Accumulate per-level maxima and per-category level scores.
    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict, category_hierarchy_score_dict, query_category)

    # FIX: use a context manager so the output file is always closed
    # (the original leaked the file handle).
    with open("baidu_baike_search_hierarchy/" + str(category_path) + ".csv", "wb") as outfile:
        for category in category_hierarchy_score_dict.keys():
            outlist = []
            for level in category_hierarchy_score_dict[category].keys():
                score_nomalize = 0
                # Guard against division by zero when a level has no mass.
                if hierarchy_max_dict[level] != 0:
                    score_nomalize = 1.0 * category_hierarchy_score_dict[category][level] / hierarchy_max_dict[level]
                outlist.append(score_nomalize)
            # FIX: max() on an empty list raises ValueError; such categories
            # now get best_level = -1 instead of crashing the run.
            best_level = -1
            if outlist and max(outlist) != 0:
                best_level = outlist.index(max(outlist)) + 1
            outfile.write(category + "," + ",".join([str(val) for val in outlist]) + "," + str(best_level) + "\r\n")
def main(category_path):
    """Extract features for the category named by *category_path*.

    The path has the form "<id>_..._<category name>": the first component
    is the numeric category id, the last one the query category.
    """
    # Py2-only: make utf-8 the default str<->unicode codec.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    parts = category_path.split('_')
    category_id = int(parts[0])
    query_category = parts[-1].decode('utf-8')

    # Expand the query category into its set of relevant words.
    main_category_list = list(getMainCategoryRevelantWord(query_category))

    paths = file_utils.getFilePathList('../../scrapy/baidu_baike_search/clean_data/17/')
    category_info_dict = readCategoryInfo(paths)
    file_utils.createDirs(['baidu_baike_search'])

    extractFeature(category_id, category_path, main_category_list, category_info_dict)
# Exemplo n.º 3
# 0
def read():
	print 'reading browse data'
	tag_fre_dict = {}
	main_sub_dict = {}
	file_path_list = file_utils.getFilePathList('data/browse_150708/')
	for file_path in file_path_list:
		if 'crc' in file_path or 'swp' in file_path or 'swo' in file_path:
			continue
		# print file_path
		infile = gzip.open(file_path)
		while True:
			row = infile.readline()
			if not row: break
			items = row.strip().split('\x01')
			logtime = items[0].decode('utf-8','ignore')
			ip = items[1].decode('utf-8','ignore')
			fm = items[2].decode('utf-8','ignore')

			# main_tag = "null"
			# for fm_item in fm.split('_'):
			# 	if 'tag' in fm_item and 'tagapplist' not in fm_item:
			# 		main_tag = fm_item
			# 	if 'tagapplist' in fm_item and main_tag != 'null':
			# 		main_sub_dict.setdefault(main_tag,{}).setdefault(fm_item,0)
			# 		main_sub_dict[main_tag][fm_item] += 1

			pattern = re.compile(r'tag[^_]+')
			match_list = pattern.findall(fm)
			for item in match_list:
				tag_fre_dict.setdefault(item,0)
				tag_fre_dict[item] += 1

		# break

	# for main_tag in main_sub_dict.keys():
	# 	print main_tag
	# 	sorted_list = sorted(main_sub_dict[main_tag].items(),key=lambda p:p[1],reverse=True)
	# 	for val in sorted_list:
	# 		print '   '+val[0]+' '+str(val[1])

	sorted_list = sorted(tag_fre_dict.items(),key=lambda p:p[1],reverse=True)
	for val in sorted_list:
		print val[0]+' '+str(val[1])
	print len(sorted_list)
# Exemplo n.º 4
# 0
def main(category_path):
    """Run feature extraction for one category path ("<id>_..._<name>")."""
    # Py2-only: allow implicit utf-8 coercion between str and unicode.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    fields = category_path.split('_')
    category_id = int(fields[0])
    query_category = fields[-1].decode('utf-8')

    # The main-category list is the set of words relevant to the query.
    relevant_words = getMainCategoryRevelantWord(query_category)

    data_dir = '../../scrapy/baidu_baike_search/clean_data/17/'
    category_info_dict = readCategoryInfo(file_utils.getFilePathList(data_dir))
    file_utils.createDirs(['baidu_baike_search'])

    extractFeature(category_id, category_path, list(relevant_words), category_info_dict)
# Exemplo n.º 5
# 0
def read():
	print 'reading browse data'
	file_path_list = file_utils.getFilePathList('data/search/')
	for file_path in file_path_list:
		if 'crc' in file_path or 'swp' in file_path or 'swo' in file_path:
			continue
		# print file_path
		infile = gzip.open(file_path)
		while True:
			row = infile.readline()
			if not row: break
			items = row.strip().split('\x01')
			log_time = items[0].decode('utf-8','ignore')
			ip = items[1].decode('utf-8','ignore')
			fm = items[2].decode('utf-8','ignore')
			inp = items[3].decode('gbk','ignore')
			print inp

		infile.close()
# Exemplo n.º 6
# 0
def main(category_path):
    """Extract features from crawled baidu_baike data for one category.

    *category_path* is "<id>" or "<id>_..._<category name>"; the trailing
    component (when present) names the query category.
    """
    # Py2-only: default all str<->unicode conversions to utf-8.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    parts = category_path.split('_')
    category_id = int(parts[0])
    query_category = parts[-1].decode('utf-8') if len(parts) >= 2 else ""
    main_category_list = [query_category]

    clean_dir = '../../scrapy/baidu_baike/crawl_data/' + str(category_id) + '/clean/'
    category_info_dict = readCategoryInfo(file_utils.getFilePathList(clean_dir))
    file_utils.createDirs(['baidu_baike'])

    sub_category_list = category_info_dict.keys()
    extractFeature(category_id, category_path, main_category_list, sub_category_list, category_info_dict)
def main(category_path):
    """Build a category hierarchy from baidu_baike_search data and write
    per-category level scores to a CSV file.

    category_path -- "<id>" or "<id>_..._<category name>"; the trailing
        component (when present) is the query category.
    Side effects: creates directory "baidu_baike_search_hierarchy" and
    writes "<category_path>.csv" inside it.
    """
    # Py2-only: allow implicit utf-8 coercion between str and unicode.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]

    file_utils.createDirs(['baidu_baike_search_hierarchy'])
    file_path_list = file_utils.getFilePathList(
        '../../scrapy/baidu_baike_search/clean_data/')
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict,
                                       main_category_list)

    # Accumulate per-level maxima and per-category level scores.
    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict,
                          category_hierarchy_score_dict, query_category)

    # FIX: context manager guarantees the output file is closed
    # (the original leaked the file handle).
    with open('baidu_baike_search_hierarchy/' + str(category_path) + '.csv',
              'wb') as outfile:
        for category in category_hierarchy_score_dict.keys():
            outlist = []
            for level in category_hierarchy_score_dict[category].keys():
                score_nomalize = 0
                # Guard against division by zero when a level has no mass.
                if hierarchy_max_dict[level] != 0:
                    score_nomalize = 1.0 * category_hierarchy_score_dict[
                        category][level] / hierarchy_max_dict[level]
                outlist.append(score_nomalize)
            # FIX: max() on an empty list raises ValueError; such
            # categories now get best_level = -1 instead of crashing.
            best_level = -1
            if outlist and max(outlist) != 0:
                best_level = outlist.index(max(outlist)) + 1
            outfile.write(category + ',' +
                          ','.join([str(val) for val in outlist]) + ',' +
                          str(best_level) + '\r\n')
def main(category_path):
    """Build a category hierarchy from crawled baidu_baike data and write
    per-category level scores to a CSV file.

    category_path -- "<id>" or "<id>_..._<category name>"; the first
        component selects the crawl_data/<id>/clean/ input directory, the
        trailing component (when present) is the query category.
    Side effects: creates directory "baidu_baike_hierarchy" and writes
    "<category_path>.csv" inside it.
    """
    # Py2-only: allow implicit utf-8 coercion between str and unicode.
    reload(sys)
    sys.setdefaultencoding('utf-8')

    category_path_list = category_path.split('_')
    category_id = int(category_path_list[0])
    query_category = ""
    if len(category_path_list) >= 2:
        query_category = category_path_list[-1].decode('utf-8')
    main_category_list = [query_category]

    file_utils.createDirs(['baidu_baike_hierarchy'])
    file_path_list = file_utils.getFilePathList('../../scrapy/baidu_baike/crawl_data/' + str(category_id) + '/clean/')
    category_info_dict = readCategoryInfo(file_path_list)
    g = generateCategoryNetwork(main_category_list, category_info_dict)
    hierarchy_node_dict = getHierarchy(g, category_info_dict, main_category_list)

    # Accumulate per-level maxima and per-category level scores.
    hierarchy_max_dict = {}
    category_hierarchy_score_dict = {}
    for query_category in category_info_dict.keys():
        calculateRelation(g, hierarchy_node_dict, hierarchy_max_dict, category_hierarchy_score_dict, query_category)

    # FIX: context manager guarantees the output file is closed
    # (the original leaked the file handle).
    with open('baidu_baike_hierarchy/' + str(category_path) + '.csv', 'wb') as outfile:
        for category in category_hierarchy_score_dict.keys():
            outlist = []
            for level in category_hierarchy_score_dict[category].keys():
                score_nomalize = 0
                # Guard against division by zero when a level has no mass.
                if hierarchy_max_dict[level] != 0:
                    score_nomalize = 1.0 * category_hierarchy_score_dict[category][level] / hierarchy_max_dict[level]
                outlist.append(score_nomalize)
            # FIX: max() on an empty list raises ValueError; such
            # categories now get best_level = -1 instead of crashing.
            best_level = -1
            if outlist and max(outlist) != 0:
                best_level = outlist.index(max(outlist)) + 1
            outfile.write(category + ',' + ','.join([str(val) for val in outlist]) + ',' + str(best_level) + '\r\n')