def main():
    """Build a nested ``{domain: {term: weight}}`` table.

    The two mappings returned by ``load_train_ori`` drive the computation:
    ``domain_dict`` maps each outer key to a dict of per-item counts, and
    ``domain_count`` supplies the divisor for each outer key.  The key names
    "domain" and "term" are inferred from the variable names (tf / idf) --
    confirm against ``load_train_ori``.

    Returns:
        tuple: ``(weight, domain_dict)`` where ``weight[domain][term]`` is
        ``Decimal(count) / Decimal(domain_count[domain])`` multiplied by the
        value returned from ``cal_idf`` for that term and domain.
    """
    domain_dict, domain_count = load_train_ori()

    def score(domain, term, occurrences):
        # The frequency ratio is evaluated before cal_idf is called, so a
        # failing division never reaches cal_idf.
        term_freq = Decimal(occurrences) / Decimal(domain_count[domain])
        inverse_freq = cal_idf(term, domain_dict, domain_count, domain)
        return Decimal(term_freq) * Decimal(inverse_freq)

    weight = {}
    for domain, term_counts in domain_dict.items():
        # dict() over a generator of pairs: one entry per inner key.
        weight[domain] = dict(
            (term, score(domain, term, occurrences))
            for term, occurrences in term_counts.items()
        )
    return weight, domain_dict
def main():
    """Compute a weight for every inner key of every entry in ``domain_dict``.

    ``load_train_ori`` provides ``domain_dict`` (outer key -> dict of counts)
    and ``domain_count`` (outer key -> divisor).  Each count is divided by
    its outer key's divisor and the quotient is multiplied by the value
    ``cal_idf`` returns for that inner key.  The tf/idf variable names in the
    original suggest a TF-IDF style weighting -- confirm against ``cal_idf``.

    Returns:
        tuple: ``(weight, domain_dict)``; ``weight`` has the same two-level
        key structure as ``domain_dict`` with ``Decimal`` values.
    """
    domain_dict, domain_count = load_train_ori()
    weight = {}
    for domain, term_counts in domain_dict.items():
        scores = {}
        for term, occurrences in term_counts.items():
            # Share of this inner key relative to the outer key's divisor.
            share = Decimal(occurrences) / Decimal(domain_count[domain])
            rarity = cal_idf(term, domain_dict, domain_count, domain)
            scores[term] = Decimal(share) * Decimal(rarity)
        weight[domain] = scores
    return weight, domain_dict
return domain_dict, domain_count def write_has(filename, has_word): n = len(has_word) keyword = TopkHeap(n) for k, v in has_word.items(): keyword.Push((v, k)) keyword_data = keyword.TopK() with open('./topic_dict/%s_ori.csv' % filename, 'wb') as f: writer = csv.writer(f) for i in range(0, len(keyword_data)): if keyword_data[i][0] > 1: writer.writerow((keyword_data[i][0], keyword_data[i][1])) if __name__ == '__main__': domain_dict, domain_count = load_train_ori() for j in name_list: print '%s start...' % j new_dict, new_count = read_csv(domain_dict[j], domain_count[j], j) #更新类型字典 print '%s end...' % j write_has(j, new_dict) #将结果写入文件
domain_count = domain_count + 1 return domain_dict,domain_count def write_has(filename,has_word): n = len(has_word) keyword = TopkHeap(n) for k,v in has_word.items(): keyword.Push((v,k)) keyword_data = keyword.TopK() with open('./topic_dict/%s_ori.csv' % filename, 'wb') as f: writer = csv.writer(f) for i in range(0,len(keyword_data)): if keyword_data[i][0] > 1: writer.writerow((keyword_data[i][0],keyword_data[i][1])) if __name__ == '__main__': domain_dict,domain_count = load_train_ori() for j in name_list: print '%s start...' % j new_dict,new_count = read_csv(domain_dict[j],domain_count[j],j)#更新类型字典 print '%s end...' % j write_has(j,new_dict)#将结果写入文件