示例#1
0
def main():
    """Compute a tf * idf style weight for every entry of every domain.

    ``domain_dict`` and ``domain_count`` are obtained from
    ``load_train_ori()``. For each outer key of ``domain_dict`` the inner
    mapping is walked: each inner value is divided by
    ``domain_count[outer_key]`` (the "tf" factor) and multiplied by the
    result of ``cal_idf`` (the "idf" factor). All arithmetic goes through
    ``Decimal``.

    Returns:
        A ``(weight, domain_dict)`` tuple, where ``weight`` maps each outer
        key to a dict of ``inner key -> Decimal`` product.
    """
    domain_dict, domain_count = load_train_ori()

    weight = {}
    for domain, term_counts in domain_dict.items():
        # The left factor is evaluated before cal_idf is called, matching
        # the tf-then-idf order of a step-by-step computation.
        weight[domain] = {
            term: (Decimal(count) / Decimal(domain_count[domain]))
            * Decimal(cal_idf(term, domain_dict, domain_count, domain))
            for term, count in term_counts.items()
        }

    return weight, domain_dict
示例#2
0
def main():
    """Build the per-domain weight table from the loaded training data.

    Calls ``load_train_ori()`` to get ``domain_dict`` and ``domain_count``,
    then, for every inner entry of every outer key, stores
    ``(value / domain_count[outer_key]) * cal_idf(...)`` computed with
    ``Decimal`` arithmetic.

    Returns:
        ``(weight, domain_dict)``: the nested dict of Decimal weights and
        the unmodified ``domain_dict`` that was loaded.
    """
    domain_dict, domain_count = load_train_ori()

    weight = {}
    for domain, term_counts in domain_dict.items():
        # "tf" factor first, then the "idf" factor returned by cal_idf.
        weight[domain] = {
            term: (Decimal(count) / Decimal(domain_count[domain]))
            * Decimal(cal_idf(term, domain_dict, domain_count, domain))
            for term, count in term_counts.items()
        }

    return weight, domain_dict
示例#3
0
    return domain_dict, domain_count


def write_has(filename, has_word):
    """Write the entries of ``has_word`` to ``./topic_dict/<filename>_ori.csv``.

    Every ``(value, key)`` pair of ``has_word`` is pushed onto a
    ``TopkHeap`` created with capacity ``len(has_word)``. The sequence
    returned by its ``TopK()`` is then written out as ``value, key`` rows,
    skipping entries whose value is not greater than 1.

    Args:
        filename: Name fragment substituted into the output path.
        has_word: Mapping of key -> value.
            NOTE(review): values are presumably occurrence counts; confirm
            against the caller.
    """
    heap = TopkHeap(len(has_word))
    for word, count in has_word.items():
        heap.Push((count, word))
    ranked = heap.TopK()

    # Binary mode is what the Python 2 csv module expects for output files.
    with open('./topic_dict/%s_ori.csv' % filename, 'wb') as f:
        writer = csv.writer(f)
        for entry in ranked:
            if not entry[0] > 1:
                continue
            writer.writerow((entry[0], entry[1]))


if __name__ == '__main__':
    # Script entry point: load the training data once, then process every
    # name in name_list and write one CSV per name.
    domain_dict, domain_count = load_train_ori()

    for topic in name_list:
        print('%s start...' % topic)
        # Update the type dictionary for this name.
        new_dict, new_count = read_csv(domain_dict[topic],
                                       domain_count[topic], topic)
        print('%s end...' % topic)
        # Write the result to file.
        write_has(topic, new_dict)
示例#4
0
            domain_count = domain_count + 1

    return domain_dict,domain_count

def write_has(filename, has_word):
    """Dump ``has_word`` to ``./topic_dict/<filename>_ori.csv``.

    All ``(value, key)`` pairs of ``has_word`` are pushed onto a
    ``TopkHeap`` sized to ``len(has_word)``; the result of ``TopK()`` is
    written as one ``value, key`` CSV row per entry. Entries whose value is
    not greater than 1 are left out.

    Args:
        filename: Name fragment substituted into the output path.
        has_word: Mapping of key -> value.
            NOTE(review): values are presumably occurrence counts; confirm
            against the caller.
    """
    heap = TopkHeap(len(has_word))
    for word, count in has_word.items():
        heap.Push((count, word))
    ranked = heap.TopK()

    # Binary mode is what the Python 2 csv module expects for output files.
    with open('./topic_dict/%s_ori.csv' % filename, 'wb') as f:
        writer = csv.writer(f)
        for entry in ranked:
            if not entry[0] > 1:
                continue
            writer.writerow((entry[0], entry[1]))

if __name__ == '__main__':
    # Script entry point: load the training data once, then process every
    # name in name_list and write one CSV per name.
    domain_dict, domain_count = load_train_ori()

    for topic in name_list:
        print('%s start...' % topic)
        # Update the type dictionary for this name.
        new_dict, new_count = read_csv(domain_dict[topic],
                                       domain_count[topic], topic)
        print('%s end...' % topic)
        # Write the result to file.
        write_has(topic, new_dict)