예제 #1
0
def combine(category_id, category_list, category_wrapper_dict):
    """Collect the "wrapper" categories of each known category and dump tags.

    For every ordered pair ``(shorter, longer)`` of entries in
    ``category_list`` with ``len(shorter) < len(longer)``, ``longer`` is
    appended to ``category_wrapper_dict[shorter]`` when ``shorter`` is already
    a key of ``category_wrapper_dict`` and either

    * ``text_process.isSubset(shorter, longer)`` is truthy, or
    * the number of distinct elements shared by both equals ``len(shorter)``.

    Two files are then written:

    * ``combine/<category_id>.txt`` -- one ``category@wrapper1,wrapper2`` line
      per key of ``category_wrapper_dict``.
    * ``final/<category_id>.txt`` -- one tag per line, where the tags are all
      keys, all wrappers and the set returned by
      ``getCategoryEditorTag(category_id)``.

    Args:
        category_id: Identifier used to build the output file names and to
            look up the editor tags.
        category_list: Sequence of categories to compare pairwise.
        category_wrapper_dict: Mapping of category -> list of wrappers.
            It is modified in place.
    """
    # A parenthesised single argument prints identically with the Python 2
    # print statement.
    print('combing')
    tag_set = set([])
    editor_tag_set = getCategoryEditorTag(category_id)
    # Both files are opened up front (as before) so a missing output directory
    # fails before any work is done; the ``with`` block guarantees that they
    # are flushed and closed even if an error occurs later.
    with open('combine/' + str(category_id) + '.txt', 'wb') as outfile, \
            open('final/' + str(category_id) + '.txt', 'wb') as outfile2:
        for category_i in category_list:
            # Only categories that already have an entry collect wrappers, so
            # skip the inner scan entirely for the others.
            if category_i not in category_wrapper_dict:
                continue
            length_i = len(category_i)
            wrappers = category_wrapper_dict[category_i]
            for category_j in category_list:
                # The strict length comparison also rules out comparing an
                # entry with itself, so no separate index check is needed.
                if length_i >= len(category_j):
                    continue
                if (text_process.isSubset(category_i, category_j)
                        or len(set(category_i)
                               & set(category_j)) == length_i):
                    wrappers.append(category_j)

        for category in category_wrapper_dict:
            wrappers = category_wrapper_dict[category]
            tag_set.add(category)
            tag_set.update(wrappers)
            outfile.write(category + '@' + ','.join(wrappers) + '\r\n')

        tag_set = tag_set | editor_tag_set
        for tag in tag_set:
            outfile2.write(tag + '\r\n')
def inclusionRelation(category_id, category_set):
    """Write a normalised "inclusion" score for every category.

    The score of a category is the number of strictly longer categories in
    ``category_set`` for which ``text_process.isSubset(category, longer)`` is
    truthy. Categories of length 1 are never counted and keep a score of 0.
    Each score is divided by the largest score and the result is written to
    ``internal/<category_id>.csv`` as ``category,score`` lines, highest
    score first.

    When ``category_set`` is empty the file is created empty, and when no
    category is included in another one every score is written as ``0.0``
    (both cases previously raised an exception).

    Args:
        category_id: Identifier used to build the output file name.
        category_set: Iterable of categories to score.
    """
    # A parenthesised single argument prints identically with the Python 2
    # print statement.
    print('extracting inclusion relation feature')
    category_feature_dict = {}
    for category in category_set:
        category_feature_dict.setdefault(category, 0)
    # Take the key list once instead of rebuilding it for every pair.
    words = list(category_feature_dict)
    # The ``with`` block guarantees the file is flushed and closed.
    with open('internal/' + str(category_id) + '.csv', 'wb') as outfile:
        for word_i in words:
            length_i = len(word_i)
            # Single-element categories are excluded from counting.
            if length_i == 1:
                continue
            for word_j in words:
                # The strict length comparison also rules out comparing a
                # word with itself.
                if length_i < len(word_j) and text_process.isSubset(
                        word_i, word_j):
                    category_feature_dict[word_i] += 1

        # ``max`` of an empty sequence raises, so fall back to 0 for no input.
        if category_feature_dict:
            inclusion_max = max(category_feature_dict.values())
        else:
            inclusion_max = 0
        print('sorting')
        sorted_list = sorted(category_feature_dict.items(),
                             key=lambda p: p[1],
                             reverse=True)
        print('writing')
        for word, count in sorted_list:
            # A maximum of 0 means every count is 0; avoid dividing by zero.
            if inclusion_max:
                inclusion_normalize = 1.0 * count / inclusion_max
            else:
                inclusion_normalize = 0.0
            outfile.write(word + ',' + str(inclusion_normalize) + '\r\n')
예제 #3
0
def getMainCategoryKeywords(main_category_list,category_info_dict):
    """Return the keys of ``category_info_dict`` related to a main category.

    A key is selected for a main category when it is strictly longer than
    that main category and ``text_process.isSubset(main_category, key)`` is
    truthy. A key is emitted once per matching main category, so it can
    appear several times in the result.

    Args:
        main_category_list: Main categories to match against.
        category_info_dict: Mapping whose keys are the candidate categories.

    Returns:
        List of matching keys, grouped by key in the mapping's key order.
    """
    return [
        candidate
        for candidate in category_info_dict.keys()
        for main_category in main_category_list
        if len(candidate) > len(main_category)
        and text_process.isSubset(main_category, candidate)
    ]