Exemplo n.º 1
0
def test_naive_bayes(hypothises):
    """Classify every test document with the trained Naive Bayes hypotheses.

    Each parsed test line carries its label in column 0 and the document's
    words after it.  For every line we score each label as
    log2(prior) + sum(log2(word density)) and keep the best one, then write
    the mapped predictions out via the file processor.

    hypothises -- mapping of label -> trained hypothesis object exposing
                  get_prior() and get_densities().
    """
    processor = FileProcessor(test_filepath, ' ')
    predictions = []

    for row in processor.get_lines_as_array():
        words = row[1:]  # column 0 is the label, so it is excluded
        best_score, best_label = -float('Inf'), -1

        for candidate in label_mappings:
            hypothesis = hypothises[candidate]
            # Start from the label's log-prior, then accumulate the
            # log-likelihood of every word in the document.
            score = log(hypothesis.get_prior(), 2)
            word_densities = hypothesis.get_densities()
            for word in words:
                score += log(word_densities[word], 2)

            if score > best_score:
                best_score, best_label = score, candidate

        predictions.append(label_mappings[best_label])

    processor.generate_output(output_filepath, predictions)
def create_vocabulary():
    """Populate the global vocabulary map from the vocabulary file.

    Each parsed row supplies a word (column 0) and its frequency
    (column 1); the frequency is stored exactly as parsed, without
    numeric conversion.
    """
    processor = FileProcessor(vocabulary_filepath, delimiter)
    for row in processor.get_lines_as_array():
        vocabulary[row[0]] = row[1]
Exemplo n.º 3
0
def process_data(training_file, testing_file, training_output, testing_output, attribute_descriptors):
    """Impute missing values and normalize the combined train/test data.

    Both CSV files are read and concatenated so KNN imputation can draw
    neighbors from the full dataset.  Every line is imputed then
    normalized, the processed lines are written back into the two file
    processors (train lines first, test lines after the split point), and
    both outputs are generated; prints 'Success!' when both writes report
    success.

    training_file / testing_file -- input CSV paths.
    training_output / testing_output -- output paths handed to generate_output.
    attribute_descriptors -- per-attribute metadata consumed by KNNProcess.
    """
    delimiter = ','
    training_file_processor = FileProcessor(training_file, delimiter)
    testing_file_processor = FileProcessor(testing_file, delimiter)
    training_lines = training_file_processor.get_lines_as_array()
    testing_lines = testing_file_processor.get_lines_as_array()
    all_lines = training_lines + testing_lines
    knn_processor = KNNProcess(all_lines, attribute_descriptors)
    # List comprehensions instead of map(lambda ...): the original sliced
    # the map() result, which only works on Python 2 where map returns a
    # list; a comprehension is sliceable (and clearer) on both versions.
    imputed_lines = [knn_processor.replace_missing_line(line_no)
                     for line_no in range(len(all_lines))]
    normalized_lines = [knn_processor.normalize_line(line) for line in imputed_lines]
    # Hoist the train/test boundary so both slices agree by construction.
    split = len(training_lines)
    for line_no, line in enumerate(normalized_lines[:split]):
        training_file_processor.set_line(line_no, line)
    for line_no, line in enumerate(normalized_lines[split:]):
        testing_file_processor.set_line(line_no, line)
    if training_file_processor.generate_output(training_output) and testing_file_processor.generate_output(testing_output):
        # Single-argument print(...) parses identically on Python 2 and 3.
        print('Success!')
Exemplo n.º 4
0
def init_corpus_sizes():
    """Tally training documents per label into the global corpus_sizes map.

    The label is the first '/'-delimited field of each metadata line.
    """
    metadata = FileProcessor(training_metadata_filepath, '/')
    for row in metadata.get_lines_as_array():
        label = row[0]
        # dict.get collapses the original if/else counting into one line.
        corpus_sizes[label] = corpus_sizes.get(label, 0) + 1
Exemplo n.º 5
0
def init_corpus():
    """Build the global corpus: one {word: frequency} map per label.

    For each label, the per-label training file is parsed and its rows
    (word in column 0, frequency in column 1) are collected into a dict;
    frequencies are stored as parsed, without numeric conversion.
    """
    for label in label_mappings:
        processor = FileProcessor(get_training_filepath(label), ' ')
        # dict(...) keeps the last occurrence of a duplicate word, matching
        # the original loop's overwrite behavior.
        corpus[label] = dict(
            (row[0], row[1]) for row in processor.get_lines_as_array()
        )