def main(raw_data_f , data_label , drmodel_f , out_f) :
    '''
    Turn every line of raw_data_f into one output record and write it to out_f .
    input > raw_data_f : iterable of lines ; each line is handed to unordered_ngram_tokenizer
            data_label : forwarded unchanged to generate_data_in_svm_format
            drmodel_f  : forwarded to load_dr_model , which yields (trans_model , gram_n , cluster_num)
            out_f      : object whose write() receives each generated record
    return > None
    '''
    trans_model , gram_n , cluster_num = load_dr_model(drmodel_f)

    # Record the model parameters once , before any line is processed .
    gram_msg = "gram_n : %d " % gram_n
    logging.info(gram_msg)
    cluster_msg = "cluster_num : %d" % cluster_num
    logging.info(cluster_msg)

    for raw_line in raw_data_f :
        tokens = unordered_ngram_tokenizer(raw_line , gram_n)
        # Only the keys of the tokenizer result are used ; its values are ignored .
        record = generate_data_in_svm_format(tokens.keys() , trans_model ,
                                             cluster_num , data_label)
        out_f.write(record)
Пример #2
0
def main(raw_data_f, data_label, drmodel_f, out_f):
    '''
    Convert raw_data_f line by line and write each converted record to out_f .
    input > raw_data_f : iterable of lines , each passed to unordered_ngram_tokenizer
            data_label : passed through to generate_data_in_svm_format
            drmodel_f  : passed to load_dr_model to obtain (trans_model , gram_n , cluster_num)
            out_f      : destination object ; its write() is called once per input line
    return > None
    '''
    trans_model, gram_n, cluster_num = load_dr_model(drmodel_f)
    logging.info("gram_n : %d " % gram_n)
    logging.info("cluster_num : %d" % cluster_num)

    def convert(text):
        # Tokenize one line , then build its record from the token keys only .
        grams = unordered_ngram_tokenizer(text, gram_n)
        return generate_data_in_svm_format(grams.keys(), trans_model,
                                           cluster_num, data_label)

    for text in raw_data_f:
        converted = convert(text)
        out_f.write(converted)
def stat_class_info(class_f , ngram) :
    '''
    Count , for one class , in how many docs each tokenizer item appears (its DF) .
    input > class_f : the training corpus file of one class ; every line is treated as one [doc]
            ngram   : forwarded to unordered_ngram_tokenizer for every line
    return > a Counter : item -> number of lines whose tokenizer result contained that item
    '''
    doc_freq = Counter()
    for doc_line in class_f :
        # set() collapses repeats inside one doc , so a doc adds at most 1 per item .
        distinct_items = set(unordered_ngram_tokenizer(doc_line , ngram))
        for item in distinct_items :
            doc_freq[item] += 1
    return doc_freq
Пример #4
0
def stat_class_info(class_f, ngram):
    '''
    Build the document-frequency table of one class .
    input > class_f : the training corpus file of one class ; each line counts as one [doc]
            ngram   : passed to unordered_ngram_tokenizer together with each line
    return > a Counter : item -> how many lines produced that item at least once
    '''
    df_table = Counter()
    for doc in class_f:
        # De-duplicate per doc first : updating with a set adds exactly 1 per
        # distinct item , which makes the totals DF rather than raw frequency .
        df_table.update(set(unordered_ngram_tokenizer(doc, ngram)))
    return df_table