def get_section(sample):
    """Key callback: return the section label attached to *sample*."""
    return getattr(sample, "section")

# Pipeline: load news samples, filter by length/section, shuffle, split into
# train/test, vectorize the texts, build labels, and cluster the training
# matrix with k-means (one cluster per section in the filter).
print("Reading samples.. ")
news_samples = preprocessing.news_sample.get_samples_multithread(
    _news_dir, _max_thread, _max_sample_count)

print("Preprocessing.. ")
# Keep only sufficiently long samples from the sections of interest.
news_samples = [
    ns for ns in news_samples
    if ns.word_count >= _min_word_count and ns.section in _section_filter
]

random.shuffle(news_samples)
n_samples = len(news_samples)
_split = int(n_samples * _train_ratio)
train_samples = news_samples[:_split]
test_samples = news_samples[_split:]

print("Samples distribution:",
      preprocessing.samples_statistics(news_samples, _section_filter, get_section))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _section_filter, get_section))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _section_filter, get_section))

train_texts = [ns.text for ns in train_samples]
test_texts = [ns.text for ns in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False,
    reduction=_reduction, reduce_n_attr=_reduce_n_attr, stem_words=_stem_words)

print("Generating labels..")
train_labels = preprocessing.samples_to_label(train_samples, _section_filter, get_section)
test_labels = preprocessing.samples_to_label(test_samples, _section_filter, get_section)

print("Training..")
kmeans = KMeans(n_clusters=len(_section_filter))
reference_output = kmeans.fit_predict(train_matrix)
# ===== Example 2 =====
# NN parameters
_learning_rate = 1  # step size for weight updates — presumably consumed by the NN trainer; confirm against its constructor
_hidden_nodes = []  # hidden-layer sizes; empty list appears to mean no hidden layers — TODO confirm

def get_question(sample):
    """Key callback: return the question label attached to *sample*."""
    return getattr(sample, "question")

# Pipeline: load samples from _sample_folder, keep the target batch (with a
# non-null question), shuffle, split, vectorize, and — when the configured
# model is SVM — train on the train split and predict on the test split.
samples = preprocessing.tp_sample.get_samples(_sample_folder)
samples = [
    smp for smp in samples
    if smp.batch_name == _batch_name and smp.question is not None
]
random.shuffle(samples)
n_samples = len(samples)
_split = int(n_samples * _train_ratio)
train_samples = samples[:_split]
test_samples = samples[_split:]

print("Samples distribution:",
      preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [smp.text for smp in train_samples]
test_texts = [smp.text for smp in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes, get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)

    model = SVM()
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
# ===== Example 3 =====
def get_section(sample):
    """Return *sample*'s section label (used as a grouping/label key)."""
    section = sample.section
    return section

# Pipeline: load and filter news samples, shuffle, split into train/test,
# vectorize, and — when the configured model is SVM — build section labels
# and train the classifier.
print("Reading samples.. ")
news_samples = preprocessing.news_sample.get_samples_multithread(
    _news_dir, _max_thread, _max_sample_count)

print("Preprocessing.. ")
# Keep only sufficiently long samples from the sections of interest.
news_samples = [
    ns for ns in news_samples
    if ns.word_count >= _min_word_count and ns.section in _section_filter
]

random.shuffle(news_samples)
n_samples = len(news_samples)
_split = int(n_samples * _train_ratio)
train_samples = news_samples[:_split]
test_samples = news_samples[_split:]

print("Samples distribution:",
      preprocessing.samples_statistics(news_samples, _section_filter, get_section))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _section_filter, get_section))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _section_filter, get_section))

train_texts = [ns.text for ns in train_samples]
test_texts = [ns.text for ns in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False,
    reduction=_reduction, reduce_n_attr=_reduce_n_attr, stem_words=_stem_words)

print("Generating labels..")
if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _section_filter, get_section)
    test_labels = preprocessing.samples_to_label(test_samples, _section_filter, get_section)

    model = SVM()
    print("Training.. ")
    model.train(train_matrix, train_labels)
# ===== Example 4 =====
# Pipeline: shuffle samples, optionally remap sections through
# _section_group_map (deduplicating the grouped names in first-seen order),
# split into train/test, report distributions, and build a tf-idf vectorizer
# as the word source.
random.shuffle(news_samples)
n_samples = len(news_samples)
_sections = _section_filter
if _section_group_map is not None:
    # dict.fromkeys deduplicates while preserving insertion order —
    # equivalent to building a {name: True} dict and listing its keys.
    _sections = list(dict.fromkeys(_section_group_map.values()))
    print("Grouped sections:", _sections)
    for sample in news_samples:
        sample.section = _section_group_map[sample.section]

_split = int(n_samples * _train_ratio)
train_samples = news_samples[:_split]
test_samples = news_samples[_split:]

print("Samples distribution:",
      preprocessing.samples_statistics(news_samples, _sections, get_section))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _sections, get_section))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _sections, get_section))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

tfidf_vectorizer = get_tfidfVectorizer_of_essay_top_tf_words()
print("Vectorizer built..")
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts,
    test_texts,
    savedir=_save_dir,
    words_src=tfidf_vectorizer,
# ===== Example 5 =====
def get_question(sample):
    """Return *sample*'s question label (used as a statistics/label key)."""
    question = sample.question
    return question


# Pipeline: load samples from _sample_folder, keep the target batch (with a
# non-null question), shuffle, split, vectorize, and — when the configured
# model is SVM — build the train/test label vectors.
samples = preprocessing.tp_sample.get_samples(_sample_folder)
samples = [
    smp for smp in samples
    if smp.batch_name == _batch_name and smp.question is not None
]
random.shuffle(samples)
n_samples = len(samples)
_split = int(n_samples * _train_ratio)
train_samples = samples[:_split]
test_samples = samples[_split:]

print("Samples distribution:",
      preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [smp.text for smp in train_samples]
test_texts = [smp.text for smp in test_samples]
train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes, get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes, get_question)