Example #1
import numpy as np

import common  # project-local helpers: N_DISASTER, categorize, is_certain, classify


def get_types(fullset):
    """Label each row: the winning class when certain, -99 when the features hold the -99 sentinel, UNCERTAIN_LABEL otherwise."""
    results = []
    for row in fullset:
        # Each row is laid out as [disaster probabilities | feature values].
        disaster_prob_vec = row[:common.N_DISASTER]
        feature_vec = row[common.N_DISASTER:]

        # Bucket each probability against the 0.4 / 0.6 thresholds.
        categorized_vec = common.categorize(disaster_prob_vec, 0.4, 0.6)
        if common.is_certain(categorized_vec):
            label = common.classify(categorized_vec)
            results.append(label)
        elif -99 in feature_vec:
            results.append(-99)
        else:
            results.append(UNCERTAIN_LABEL)

    return np.array(results)
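The common helpers used above are not shown on this page. As a rough, self-contained sketch of the same thresholding pattern, the stand-ins below bucket each probability against the low/high thresholds, treat a row as certain when nothing falls in the ambiguous band, and pick the winning class; this is only an assumption about what categorize, is_certain and classify do, not the real common API.

import numpy as np

# Hypothetical stand-ins for common.categorize / is_certain / classify (assumptions).
def categorize(probs, low, high):
    # -1 below the low threshold, +1 above the high threshold, 0 in between
    return np.where(probs >= high, 1, np.where(probs <= low, -1, 0))

def is_certain(categorized_vec):
    # Certain when no probability landed in the ambiguous middle band
    return 0 not in categorized_vec

def classify(categorized_vec):
    # Index of the highest category (the winning class)
    return int(np.argmax(categorized_vec))

probs = np.array([0.1, 0.2, 0.9])
cat = categorize(probs, 0.4, 0.6)       # -> array([-1, -1,  1])
print(is_certain(cat), classify(cat))   # True 2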
Example #2
import numpy as np

import common  # project-local helpers: N_DISASTER, N_CLASS, categorize, is_certain, classify


def get_codes(fullset):
    """Label each row: the winning class when certain, -99 when the features hold the -99 sentinel, common.N_CLASS otherwise."""
    results = []
    for row in fullset:
        disaster_prob_vec = row[:common.N_DISASTER]
        feature_vec = row[common.N_DISASTER:]

        categorized_vec = common.categorize(disaster_prob_vec, 0.4, 0.6)
        if common.is_certain(categorized_vec):
            label = common.classify(categorized_vec)
            results.append(label)
        elif -99 in feature_vec:
            results.append(-99)
        else:
            results.append(common.N_CLASS)

    return np.array(results)
Example #3
import numpy as np

import common  # project-local helpers: N_ROWS, N_COLS, N_DISASTER, categorize, is_certain, classify


def get_trainset(fullset, upward=False, with_id=False):
    """Build [features..., label] training rows, keeping only certain rows without the -99 sentinel."""
    result = []

    if upward:
        # Treat the flat dataset as an N_ROWS x N_COLS grid and flip it vertically.
        fullset = np.flipud(fullset.reshape(common.N_ROWS, common.N_COLS, -1)).reshape(-1, fullset.shape[-1])

    for row in fullset:
        disaster_prob_vec = row[:common.N_DISASTER]
        feature_vec = row[common.N_DISASTER:]

        categorized_vec = common.categorize(disaster_prob_vec, 0.4, 0.6)

        # Keep only rows with a certain label and no -99 sentinel in the features.
        if common.is_certain(categorized_vec) and -99 not in feature_vec:
            label = common.classify(categorized_vec)
            result.append(feature_vec.tolist() + [label])

    if with_id:
        # Prepend a running row id as the first column.
        return np.concatenate((np.arange(len(result))[:, np.newaxis], result), axis=1)
    return np.array(result)
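A small, self-contained demonstration of the upward branch above: reshaping the flat dataset into an N_ROWS x N_COLS grid of cells and applying np.flipud reverses the order of the grid rows while leaving each cell's values intact. The grid dimensions below are made up for illustration.

import numpy as np

N_ROWS, N_COLS = 2, 3                          # made-up grid dimensions
fullset = np.arange(2 * 3 * 4).reshape(6, 4)   # 6 cells, 4 values per cell
flipped = np.flipud(fullset.reshape(N_ROWS, N_COLS, -1)).reshape(-1, fullset.shape[-1])
print(flipped[0])   # [12 13 14 15]: first cell of what used to be the bottom grid row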
Example #4
    """
    data = np.loadtxt(f'{DATA_PATH}/{data_name}')
    labels = np.loadtxt(f'{DATA_PATH}/{labels_name}')

    # Initialize dict to map class label to feature vector
    class_data = defaultdict(list)

    for k in range(len(labels)):
        # Save class specific data
        class_data[int(labels[k])].append(data[k])
    
    for k in class_data:
        class_data[k] = np.array(class_data[k])

        print(f'shape of class entry {k}: ', class_data[k].shape)

    return class_data
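On a tiny in-memory example the same per-class grouping looks like this; the arrays below are made up, and read_data does the same thing after loading the two files from DATA_PATH.

import numpy as np
from collections import defaultdict

data = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
labels = np.array([0.0, 1.0, 0.0])

class_data = defaultdict(list)
for k in range(len(labels)):
    class_data[int(labels[k])].append(data[k])
class_data = {k: np.array(v) for k, v in class_data.items()}

print(class_data[0].shape)   # (2, 2): class 0 holds two samples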


class_data = read_data('X_combined.txt', 'Y_combined.txt')
#render(class_data)

priors, gaussians, variances = create_distributions(class_data)

#render_pca(class_data, variances)

labeled_data, classifications = classify(class_data, gaussians, priors)

assess_classification(class_data, labeled_data, classifications, priors)
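create_distributions, classify and assess_classification are defined elsewhere in the project. Judging only by the names and the values they pass around, create_distributions presumably estimates a prior plus Gaussian parameters per class; the sketch below shows the usual class-conditional-Gaussian recipe under that assumption and may well differ from the real implementation.

import numpy as np

# Hypothetical sketch: per-class prior, mean vector and covariance matrix.
def sketch_create_distributions(class_data):
    total = sum(len(samples) for samples in class_data.values())
    priors, gaussians, variances = {}, {}, {}
    for label, samples in class_data.items():
        priors[label] = len(samples) / total
        gaussians[label] = samples.mean(axis=0)
        variances[label] = np.cov(samples, rowvar=False)
    return priors, gaussians, variances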

Example #5
    if data_set == 'reuters':
        documents = common.retrieve_reuters_documents(max_documents=max_documents)
        print('Loaded reuters documents')
    elif data_set == 'imdb':
        documents = common.retrieve_imdb_movie_reviews(max_documents=max_documents)
        print('Loaded imdb reviews')
    elif data_set == 'newsgroups':
        documents = common.retrieve_newsgroup_articles(max_documents=max_documents)
        print('Loaded newsgroup articles')
    else:
        documents = []

    for feature_extraction_algorithm in feature_extraction_algorithms:
        print('using {} algorithm on data set: {}'.format(feature_extraction_algorithm, data_set))

        if feature_extraction_algorithm == 'doc2vec':
            doc2vec = common.create_or_load_doc2vec_model('model/{}-doc2vec-{}.bin'.format(data_set, len(documents)), documents)
            common.add_feature_vectors_doc2vec(documents, doc2vec)

        if feature_extraction_algorithm == 'word_count':
            word_count_vectorizer = common.create_word_count_vectorizer(documents)
            common.add_feature_vectors_text_vectorizer(documents, word_count_vectorizer)

        # Visualize
        png_file_name = 'fig/{}-tsne-{}-{}.png'.format(data_set, feature_extraction_algorithm, len(documents))
        common.visualize(documents, png_file_name)

        # Classify
        print('classify data_set: {}, feature_extraction_algorithm: {}, num_documents: {}'.format(data_set, feature_extraction_algorithm, len(documents)))
        common.classify(documents)
        print()
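The feature-extraction helpers are likewise project-local. If create_word_count_vectorizer wraps a bag-of-words vectorizer such as scikit-learn's CountVectorizer (an assumption, not something this snippet confirms), the word_count branch boils down to something like this:

from sklearn.feature_extraction.text import CountVectorizer

# Made-up mini corpus standing in for the reuters/imdb/newsgroups documents.
texts = ["crude oil prices rise", "oil futures fall", "grain exports rise"]
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(texts)   # sparse document-term count matrix
print(features.shape)                        # (3, 8): 3 documents, 8 distinct terms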