from math import log

def test_naive_bayes(hypotheses):
    fp = FileProcessor(test_filepath, ' ')
    parsed_lines = fp.get_lines_as_array()
    results = []
    for row in parsed_lines:
        # The first column is the true label; classify using the remaining words.
        exclude_label = row[1:]
        max_sum = -float('Inf')
        max_label = -1
        for label in label_mappings:
            label_instance = hypotheses[label]
            log_prior = log(label_instance.get_prior(), 2)
            densities = label_instance.get_densities()
            # Sum log-probabilities rather than multiplying raw probabilities,
            # which would underflow to zero on long documents.
            log_sum = 0
            for word in exclude_label:
                log_sum += log(densities[word], 2)
            cur_sum = log_sum + log_prior
            if cur_sum > max_sum:
                max_sum = cur_sum
                max_label = label
        results.append(label_mappings[max_label])
    fp.generate_output(output_filepath, results)
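# Illustrative sketch (an assumption, not the project's actual class):
# test_naive_bayes only requires that each hypothesis object expose
# get_prior(), returning P(label), and get_densities(), returning a map of
# word -> P(word | label). ExampleHypothesis is a hypothetical name used
# here purely to document that interface.
class ExampleHypothesis:
    def __init__(self, prior, densities):
        self._prior = prior          # P(label)
        self._densities = densities  # word -> P(word | label)

    def get_prior(self):
        return self._prior

    def get_densities(self):
        return self._densities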
def create_vocabulary():
    fp = FileProcessor(vocabulary_filepath, delimiter)
    parsed_data = fp.get_lines_as_array()
    for row in parsed_data:
        word = row[0]
        # Fields come back as strings; store the frequency as an int.
        frequency = int(row[1])
        vocabulary[word] = frequency
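# Example (hypothetical contents, for illustration only): with delimiter ' '
# and a vocabulary file containing the lines
#   the 1061396
#   of 593677
# create_vocabulary() leaves vocabulary == {'the': 1061396, 'of': 593677}.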
def process_data(training_file, testing_file, training_output, testing_output, attribute_descriptors):
    delimiter = ','
    training_file_processor = FileProcessor(training_file, delimiter)
    testing_file_processor = FileProcessor(testing_file, delimiter)
    training_lines = training_file_processor.get_lines_as_array()
    testing_lines = testing_file_processor.get_lines_as_array()
    # Impute and normalize over the combined splits so both share the same statistics.
    all_lines = training_lines + testing_lines
    knn_processor = KNNProcess(all_lines, attribute_descriptors)
    # List comprehensions rather than map(), so the result is sliceable below.
    imputed_lines = [knn_processor.replace_missing_line(line_no) for line_no in range(len(all_lines))]
    normalized_lines = [knn_processor.normalize_line(line) for line in imputed_lines]
    # Write the processed rows back to their respective processors.
    for line_no, line in enumerate(normalized_lines[:len(training_lines)]):
        training_file_processor.set_line(line_no, line)
    for line_no, line in enumerate(normalized_lines[len(training_lines):]):
        testing_file_processor.set_line(line_no, line)
    if training_file_processor.generate_output(training_output) and testing_file_processor.generate_output(testing_output):
        print('Success!')
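# Hypothetical usage (paths and descriptor values invented for illustration;
# the real descriptor format is whatever KNNProcess expects):
#
#   process_data('train.csv', 'test.csv',
#                'train_clean.csv', 'test_clean.csv',
#                attribute_descriptors)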
def init_corpus_sizes():
    fp = FileProcessor(training_metadata_filepath, '/')
    parsed_lines = fp.get_lines_as_array()
    # Count how many training documents belong to each label.
    for row in parsed_lines:
        label = row[0]
        corpus_sizes[label] = corpus_sizes.get(label, 0) + 1
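# Illustrative helper (an assumption, not part of the original code): one way
# the class priors consumed by test_naive_bayes could fall out of corpus_sizes.
def example_prior(label):
    total_docs = sum(corpus_sizes.values())
    return corpus_sizes[label] / float(total_docs)  # P(label)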
def init_corpus():
    # Build a word -> frequency map from each label's training file.
    for label in label_mappings:
        training_filepath = get_training_filepath(label)
        fp = FileProcessor(training_filepath, ' ')
        parsed_lines = fp.get_lines_as_array()
        label_map = {}
        for row in parsed_lines:
            word = row[0]
            frequency = int(row[1])
            label_map[word] = frequency
        corpus[label] = label_map
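# Illustrative helper (an assumption; the project's actual estimator may
# differ): a Laplace-smoothed estimate of P(word | label) built from the
# counts in corpus, so that words absent from a label's training file still
# get a nonzero probability and log() in test_naive_bayes never sees zero.
def example_densities(label):
    label_map = corpus[label]
    total_words = sum(label_map.values())
    vocab_size = len(vocabulary)
    return {word: (label_map.get(word, 0) + 1) / float(total_words + vocab_size)
            for word in vocabulary}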