def main(raw_data_f, data_label, drmodel_f, out_f):
    """Convert raw text lines into SVM-format training data and write them out.

    raw_data_f : iterable of raw text lines, one document per line.
    data_label : class label attached to every generated SVM sample.
    drmodel_f  : file/path holding the dimension-reduction model.
    out_f      : writable file object that receives the SVM-format lines.
    """
    # The DR model bundles the transform, the n-gram order, and the
    # cluster count of the reduced feature space.
    trans_model, gram_n, cluster_num = load_dr_model(drmodel_f)
    logging.info("gram_n : %d " % (gram_n))
    logging.info("cluster_num : %d" % (cluster_num))
    for line in raw_data_f:
        c = unordered_ngram_tokenizer(line, gram_n)
        # Only the distinct n-gram terms (keys) are needed here,
        # not their in-document counts.
        svm_data = generate_data_in_svm_format(
            c.keys(), trans_model, cluster_num, data_label)
        out_f.write(svm_data)
def main(raw_data_f, data_label, drmodel_f, out_f):
    """Tokenize each raw line, project it via the DR model, and emit SVM data.

    Reads the dimension-reduction model once, then streams every input
    line through the n-gram tokenizer and writes the resulting
    SVM-format sample (tagged with ``data_label``) to ``out_f``.
    """
    trans_model, gram_n, cluster_num = load_dr_model(drmodel_f)
    logging.info("gram_n : %d " % (gram_n))
    logging.info("cluster_num : %d" % (cluster_num))
    for doc_line in raw_data_f:
        term_counter = unordered_ngram_tokenizer(doc_line, gram_n)
        out_f.write(
            generate_data_in_svm_format(
                term_counter.keys(), trans_model, cluster_num, data_label
            )
        )
def stat_class_info(class_f, ngram):
    """Collect document-frequency (DF) statistics for one class corpus.

    class_f : training corpus file of one class; every line in the file
              represents a single document.
    ngram   : the n-gram order used for tokenization.

    Returns a Counter mapping each n-gram term to the number of
    documents it occurs in (its DF).
    """
    df_counter = Counter()
    for line in class_f:
        doc_container = unordered_ngram_tokenizer(line, ngram)
        # Deduplicate terms within a document so each doc contributes
        # at most 1 to a term's DF, regardless of in-doc frequency.
        df_counter.update(set(doc_container))
    return df_counter
def stat_class_info(class_f, ngram):
    """Return a Counter of term -> document frequency for a class corpus.

    Every line of ``class_f`` is treated as one document. A term's count
    is the number of documents containing it (DF), not its total raw
    frequency across the corpus.
    """
    df = Counter()
    for doc in class_f:
        # Converting the tokenizer output to a set means each document
        # counts a given term at most once, which is exactly DF.
        df.update(set(unordered_ngram_tokenizer(doc, ngram)))
    #for key , df in c.items() :
    #    print "%s %d" %(key , df)
    return df