def main(input_, format_):
    """Predicts the Netflix ratings using a Neural Network model.

    More precisely, it preprocesses the data to make it compatible with the
    NN model, which is then trained, and finally postprocesses the result to
    give the predictions in the desired format.

    Args:
        input_: The samples
        format_: The submission format file

    Returns:
        np.array: The predictions of the ratings
    """
    # Preprocessing
    X_all, y, n_users, n_movies = preprocessing(input_)

    # Build the NN model and train it
    n_factors = 4
    model = NN_model(n_users + 1, n_movies + 1, n_factors)
    _ = model.fit(x=X_all, y=y, batch_size=128, epochs=10, verbose=1)

    # Generate the predictions
    print("Generating predictions ...")
    predictions = predict(model, format_)

    return predictions
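# Hedged side note: NN_model is defined elsewhere in the project. The sketch below is only
# one plausible embedding-based architecture consistent with the call
# NN_model(n_users + 1, n_movies + 1, n_factors), assuming X_all is an integer array of
# shape (n_samples, 2) holding (user_id, movie_id) pairs; it is not the repository's model.
from tensorflow.keras.layers import Dot, Embedding, Flatten, Input, Lambda
from tensorflow.keras.models import Model


def nn_model_sketch(n_users, n_movies, n_factors):
    ids = Input(shape=(2,), dtype='int32')
    user_id = Lambda(lambda t: t[:, 0])(ids)   # first column: user index
    movie_id = Lambda(lambda t: t[:, 1])(ids)  # second column: movie index
    u = Flatten()(Embedding(n_users, n_factors)(user_id))
    m = Flatten()(Embedding(n_movies, n_factors)(movie_id))
    rating = Dot(axes=1)([u, m])               # predicted rating = <u, m>
    model = Model(inputs=ids, outputs=rating)
    model.compile(optimizer='adam', loss='mse')
    return model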
def hierarchical(df, y, n_clusters=2, scaling=True, features=0, show_dendrogram=False):
    df = preprocessing(data=df, y=df[y], perform_scale=scaling)

    if features == 0:
        df = feature_selection(df=df, target=df[y], show_process=False)
    elif features == 1:
        df = pca(df.loc[:, df.columns.difference([y])], df[y], 2, show_result=False)
        print(df)

    x = df.loc[:, df.columns.difference([y])]

    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel(y)
    plt.ylabel('distance')
    if show_dendrogram:
        dendrogram(
            linkage(x, 'ward'),  # generate the linkage matrix
            leaf_font_size=8     # font size for the x axis labels
        )
        plt.axhline(y=8)
        plt.show()

    clusters = AgglomerativeClustering(linkage="ward", n_clusters=n_clusters)
    clusters.fit(x)
    print(clusters.labels_)

    colors = np.array(['darkgrey', 'lightsalmon', 'powderblue'])

    plt.subplot(2, 2, 1)
    for i in np.unique(clusters.labels_):
        plt.scatter(x.iloc[clusters.labels_ == i, 0],
                    x.iloc[clusters.labels_ == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Hierarchical Clustering')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])

    plt.subplot(2, 2, 2)
    for i in np.unique(df[y].values):
        plt.scatter(x.iloc[df[y].values == i, 0],
                    x.iloc[df[y].values == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Ground Truth Classification')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.show()
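# Hedged side note: the dendrogram above only draws a horizontal cut at distance 8. If flat
# cluster labels at that cut are wanted, scipy can derive them from the same Ward linkage.
# Minimal self-contained sketch on synthetic data (the threshold 8 and the 'ward' method
# mirror the plot above; the data here is purely illustrative):
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

demo_points = np.random.RandomState(0).rand(20, 2)       # stand-in for `x`
Z = linkage(demo_points, 'ward')                          # same linkage as the dendrogram
labels_at_cut = fcluster(Z, t=8, criterion='distance')    # one cluster id per sample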
def input_comprison(data, y, model=None, log_matrix=True, out_path=None):
    res = []

    df = preprocessing(data=data, y=data[y])
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[0].append('Label Encoded')

    df = preprocessing(data=data, y=data[y], perform_ohe=True)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[1].append('One Hot Encoded')

    df = preprocessing(data=data, y=data[y], perform_scale=True, perform_ohe=True)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[2].append('Standard Scaled')

    df = preprocessing(data=data, y=data[y], perform_scale=True)
    df = feature_selection(df=df, target=df[y], show_process=False)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[3].append('Feature Selection')

    df = preprocessing(data=data, y=data[y], perform_scale=False)
    df = pca(df.loc[:, df.columns.difference([y])], df[y], 0.9, show_result=True)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[4].append('PCA')

    df = pd.DataFrame(res, columns=[
        'Accuracy Score', 'Confusion Matrix', 'Training Time', 'Predict Time',
        'Processing'
    ])

    if log_matrix:
        print(model.__name__)
        print(df)

    if out_path is not None:
        df.to_csv(out_path)

    return df
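# Hedged note on the `model` argument of input_comprison: each return value gets one extra
# 'Processing' label appended and is then framed with the columns
# ['Accuracy Score', 'Confusion Matrix', 'Training Time', 'Predict Time', 'Processing'],
# so `model(X, y)` is expected to return a 4-element list in that order. A hypothetical
# compatible callable (decision tree chosen arbitrarily for illustration):
import time

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def decision_tree_example(X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    t0 = time.time()
    clf.fit(X_tr, y_tr)
    fit_time = time.time() - t0
    t0 = time.time()
    y_pred = clf.predict(X_te)
    pred_time = time.time() - t0
    return [accuracy_score(y_te, y_pred), confusion_matrix(y_te, y_pred), fit_time, pred_time]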
def dbscan(df, y, eps=0.5, min_samples=5, features=0):
    df = preprocessing(data=df, y=df[y], perform_scale=True)

    if features == 0:
        df = feature_selection(df=df, target=df[y], show_process=False)
    elif features == 1:
        df = pca(df.loc[:, df.columns.difference([y])], df[y], 0.8, show_result=False)

    x = df.loc[:, df.columns != y]

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(x)

    colors = np.array(['darkgrey', 'lightsalmon', 'powderblue'])

    plt.subplot(2, 2, 1)
    for i in np.unique(clusters):
        label = 'Outlier' if i == -1 else 'Cluster ' + str(i + 1)
        plt.scatter(x.iloc[clusters == i, 0],
                    x.iloc[clusters == i, 1],
                    color=colors[i % 3],
                    label=label)
    plt.legend()
    plt.title('DBSCAN Clustering')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.subplots_adjust(wspace=0.4)

    plt.subplot(2, 2, 2)
    for i in np.unique(df[y].values):
        plt.scatter(x.iloc[df[y].values == i, 0],
                    x.iloc[df[y].values == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Ground Truth Classification')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.show()
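# Hedged side note: the eps value is simply passed in by the caller; a common way to choose
# it (not part of this function) is a k-distance plot, where the "knee" of the sorted
# distances to the min_samples-th neighbour suggests a value. Sketch on illustrative data:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors

demo = np.random.RandomState(0).rand(200, 2)   # stand-in for the scaled feature matrix
min_samples = 5
dists, _ = NearestNeighbors(n_neighbors=min_samples).fit(demo).kneighbors(demo)
plt.plot(np.sort(dists[:, -1]))                # distance to the k-th neighbour (self counted first)
plt.xlabel('points sorted by distance')
plt.ylabel('%d-th neighbour distance' % min_samples)
plt.show()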
def kmeans(df, y, n_clusters=2, features=0, show_elbow=False):
    df = preprocessing(data=df, y=df[y], perform_scale=True)

    if features == 0:
        df = feature_selection(df=df, target=df[y], show_process=False)
    elif features == 1:
        df = pca(df.loc[:, df.columns.difference([y])], df[y], 0.8, show_result=False)

    x = df.loc[:, df.columns.difference([y])]

    # Applying kmeans to the dataset / Creating the kmeans classifier
    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(x.values)

    # 2D plot
    colors = np.array(['darkgrey', 'lightsalmon', 'powderblue'])

    plt.subplot(2, 2, 1)
    for i in np.unique(clusters):
        plt.scatter(x.iloc[clusters == i, 0],
                    x.iloc[clusters == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    # Plotting the centroids of the clusters
    plt.scatter(kmeans.cluster_centers_[:, 0],
                kmeans.cluster_centers_[:, 1],
                s=100, c='lightskyblue', label='Centroids')
    plt.legend()
    plt.title('K-Means Clustering')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])

    plt.subplot(2, 2, 2)
    for i in np.unique(df[y].values):
        plt.scatter(x.iloc[df[y].values == i, 0],
                    x.iloc[df[y].values == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Ground Truth Classification')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.show()

    # Part 2: Find the optimum number of clusters for k-means
    if show_elbow:
        wcss = []
        # Trying kmeans for k=1 to k=10
        for i in range(1, 11):
            kmeans = KMeans(n_clusters=i, init='k-means++')
            kmeans.fit(x)
            wcss.append(kmeans.inertia_)

        # Plotting the results onto a line graph, allowing us to observe 'The elbow'
        plt.plot(range(1, 11), wcss)
        plt.title('The elbow method')
        plt.xlabel('Number of clusters')
        plt.ylabel('WCSS')  # within cluster sum of squares
        plt.show()
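# Hedged side note: besides the WCSS elbow above, the silhouette score is another common way
# to compare candidate k values (not used in this function). Sketch on illustrative data:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

demo = np.random.RandomState(0).rand(200, 2)   # stand-in for the scaled feature matrix
for k in range(2, 7):
    labels = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0).fit_predict(demo)
    print(k, silhouette_score(demo, labels))   # higher is better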
def get_results(city, no):
    processing.preprocessing()
    pre = open('preprocess1.txt')

    # Read the preprocessed reviews, one per line
    train_set = []
    line = pre.readline()
    while line != '':
        train_set.append(line)
        line = pre.readline()

    # Load the sentiment lexicons
    pos = open('positive-words.txt')
    neg = open('negative-words.txt')
    positive = []
    negative = []
    for i in pos.read().split():
        positive.append(i)
    for j in neg.read().split():
        negative.append(j)

    stopWords = stopwords.words('english')
    vectorizer = CountVectorizer(stop_words=stopWords)
    transformer = TfidfTransformer()

    # Build the term-count matrix and its tf-idf weighting
    v = vectorizer.fit_transform(train_set)
    tfidf = transformer.fit_transform(v)

    # Inverse the vectorizer vocabulary to be able to map feature indices back to words
    feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print("\n")

    # Topic extraction with NMF on the full tf-idf matrix
    nmf = decomposition.NMF(n_components=3, init='random',
                            random_state=0).fit(tfidf.todense())
    topic_list = []
    l = int(len(feature_names) / 5)
    for topic_idx, topic in enumerate(nmf.components_):
        topic_list.append(topic.argsort()[:-l - 1:-1])

    # Assign each review to its dominant topic
    train_target = []
    for arr in v.toarray():
        train_target.append(calculate_Topic(arr, topic_list))

    # Keep the 2*l best features according to a chi-squared test
    ch2 = SelectKBest(chi2, k=l * 2)
    X_train = ch2.fit_transform(tfidf.todense(), train_target)
    cs = ch2.scores_.argsort()[::-1]
    cs_featurenames = []
    cs = cs[:l * 2]
    for x in cs:
        cs_featurenames.append(feature_names[x])
    print(cs_featurenames)
    print("\n")

    # Re-run NMF on the reduced feature space and print the top words per topic
    nmf1 = decomposition.NMF(n_components=3, init='random',
                             random_state=0).fit(X_train)
    topic_list = []
    l = int(len(feature_names) / 5)
    for topic_idx, topic in enumerate(nmf1.components_):
        z = topic.argsort()[:-l - 1:-1]
        topic_list.append(z)
        print("Topic #%d:---------------------------------------" % topic_idx)
        for y in z:
            print(cs_featurenames[y])

    # Re-assign each review to its dominant topic in the reduced space
    train_target = []
    for arr in X_train:
        train_target.append(calculate_Topic(arr, topic_list))

    # Count how many reviews fall into each topic
    train_count = [0] * 4
    for x in train_target:
        train_count[x] = train_count[x] + 1

    # Train the three classifiers on the selected features
    clf = MultinomialNB()
    clf2 = LinearSVC()
    clf1 = NearestCentroid()
    clf.fit(X_train, train_target)
    clf1.fit(X_train, train_target)
    clf2.fit(X_train, train_target)

    # Score every hotel of the city and keep the `no` best ones
    dic = {}
    hotels = read_hotels(city, dic)
    temp = []
    for each in hotels:
        temp.append(calculate(vectorizer, transformer, train_count, ch2, each,
                              clf, clf1, clf2, positive, negative, train_set))

    res = []
    temp1 = numpy.array(temp).argsort()[::-1]
    print("Top %d recommendations are as follows [in the FORMAT Index, (Hotel name, Location), Score]:\n" % no)
    for g in temp1[:no]:
        print(g, dic[g], temp[g])
        res.append(dic[g])
    return res
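# Hedged usage sketch: get_results expects a city name understood by read_hotels() plus the
# number of recommendations wanted; it prints the ranked hotels and returns them. The city
# below is purely illustrative, not a value taken from the project.
if __name__ == '__main__':
    top_hotels = get_results('jaipur', 5)
    print(top_hotels)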
# ---------------------------------------------------------------------------------------------------------------
# %% Paths
path_geotiff = '../Data/images/'
path_shp = '../Data/shapefiles/'
image_name_train = 'geotiffs/2018_12_29_Tai_bigger.tiff'
label_path_train = 'labels/Tai/segmentation.shp'

# ---------------------------------------------------------------------------------------------------------------
# %% Load plantation image
band, meta_train = load_geotiff(path_geotiff + image_name_train,
                                window=rasterio.windows.Window(1490, 4020, 530, 350))

# ---------------------------------------------------------------------------------------------------------------
# %% Generate new features
img_train = preprocessing(np.stack(band, axis=2))

# %% Visualize new features
Nband_kept = 10 - 1
fig, axs = plt.subplots(4, 4, figsize=(12, 10),
                        gridspec_kw={'hspace': 0.1, 'wspace': 0.1})
show_image(img_train[:, :, Nband_kept + 1], meta_train['transform'],
           ax=axs[0][0], cmap='PiYG')
axs[0][0].set_title('NDVI', fontsize=10)
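# Hedged side note: preprocessing() is defined elsewhere in the project; the NDVI band it
# appends is, by the usual definition, the normalized difference of the near-infrared and
# red bands. Sketch of that standard formula (the band indices here are assumptions for
# illustration, not the project's actual layout):
import numpy as np


def ndvi_sketch(img, red_idx=2, nir_idx=3):
    red = img[:, :, red_idx].astype('float32')
    nir = img[:, :, nir_idx].astype('float32')
    return (nir - red) / (nir + red + 1e-8)   # small epsilon avoids division by zero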
                temp_dict = dict()
                temp_dict['user_rating'] = curr_rat['user_rating']
                temp_dict['user_rating_date'] = curr_rat['user_rating_date']
                temp_dict['user_id'] = 'x' + curr_rat['user_id']
                curr_rats.append(temp_dict)
            corrected_ratings[x] = curr_rats
    ratings = corrected_ratings

    corrected_films = dict()
    for x in films.keys():
        if x not in ids_to_del:
            corrected_films[x] = films[x]
    films = corrected_films

    assert len(ratings) == len(films)

films, ratings_dict, compressed_test_ratings_dict, sims, movies_all_genres_matrix, \
    movies_all_directors_matrix, movies_all_actors_matrix = preprocessing(
        ratings, films, movielens_data)

start = time.time()

MUR = 0.1
MUG = 0.6
MUA = 0.1
MUD = 0.1

nr_predictions, accuracy, rmse, mae, precision, recall, f1 = predictions(
    MUR, MUG, MUA, MUD, films, compressed_test_ratings_dict, ratings_dict,
    sims, movies_all_genres_matrix, movies_all_directors_matrix,
    movies_all_actors_matrix, movielens_data)

# print results
print("Number of user-items pairs: %d" % nr_predictions)
print("Accuracy: %.2f " % accuracy)
sns.barplot(y=res['Processing'], x=res['Predict Time'], hue=res.model,
            data=res, orient='h', ax=axs[2])
plt.legend(fontsize='small')
plt.show()

# Regression: Test with wine dataset using linear regression and neural network
# Test with one hot encoded mushroom dataset using linear regression
df = pd.read_csv('data/wine/winequality-red.csv')
y = 'quality'
df = preprocessing(data=df, y=df[y], perform_scale=True)
print(linear_regression(df.loc[:, df.columns.difference(['quality'])],
                        df['quality'], log_result=False))
print(neural_network(df, df[y], is_regression=True, log_result=False, epochs=20))

df = pd.read_csv('data/mushroom/mushrooms.csv')
y = 'class'
df = df.dropna()
df = df.reset_index(drop=True)
df = preprocessing(data=df, y=df[y], perform_scale=True, perform_ohe=True,
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    # prob_threshold = args.prob_threshold

    ### Load the model through `infer_network` ###
    infer_network.load_model(args.model)
    # Get input shape
    input_shape = infer_network.get_input_shape()

    ### Handle the input stream ###
    input_stream = VideoCapture(args.input)
    input_stream.open(args.input)

    # Output stream setup
    fourcc = VideoWriter_fourcc(*'mp4v')
    frames = 24
    # Grab the shape of the input, since it's required for cv.VideoWriter;
    # without it, cv gets a buffer size error and crashes
    width = int(input_stream.get(3))
    height = int(input_stream.get(4))
    out = VideoWriter('/app/out/out.mp4', fourcc, frames, (width, height))

    ### Loop until stream is over ###
    while input_stream.isOpened():
        ### Read from the video capture ###
        flag, frame = input_stream.read()
        if not flag:
            break
        key_pressed = waitKey(60)

        ### Pre-process the image as needed ###
        preprocessed_frame = preprocessing(frame, input_shape)

        ### Start asynchronous inference for specified request ###
        infer_network.exec_net(preprocessed_frame, 0)

        ### Wait for the result ###
        # Parameter is the request number, not a wait time
        status = infer_network.wait(0)

        ### Get the results of the inference request ###
        if status == 0:
            output_shape = infer_network.get_output(0)
            drawn_frame = draw_boxes(frame, output_shape, args, width, height)
            out.write(drawn_frame)
            sys.stdout.buffer.write(drawn_frame)
            sys.stdout.flush()

        ### TODO: Extract any desired stats from the results ###

        ### TODO: Calculate and send relevant information on ###
        ### current_count, total_count and duration to the MQTT server ###
        ### Topic "person": keys of "count" and "total" ###
        ### Topic "person/duration": key of "duration" ###

        ### TODO: Send the frame to the FFMPEG server ###

        ### Write an output image if `single_image_mode` ###
        # imwrite('out/output.png', frame)

        # Break if escape key pressed
        if key_pressed == 27:
            break

    # Release the out writer, capture, and destroy any OpenCV windows
    out.release()
    input_stream.release()
    destroyAllWindows()
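# Hedged side note: preprocessing(frame, input_shape) is defined elsewhere in the project.
# For OpenVINO-style models the usual steps are a resize to the network's height/width, an
# HWC->CHW transpose, and adding a batch dimension; the sketch below shows that common
# pattern and is an assumption, not the repository's actual helper.
from cv2 import resize


def preprocessing_sketch(frame, input_shape):
    # input_shape is assumed to be (N, C, H, W)
    _, _, h, w = input_shape
    resized = resize(frame, (w, h))        # cv2 expects (width, height)
    chw = resized.transpose((2, 0, 1))     # HWC -> CHW
    return chw.reshape(1, *chw.shape)      # add the batch dimension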
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 17 20:23:57 2019

@author: pushap
"""
import numpy as np

from processing import preprocessing

path = "/home/pushap/newdata/capturedImages/"
x, y = preprocessing(path)
np.save('x.npy', x)
np.save('y.npy', y)
if data_type != 'netflix':
    # remove from ratings the missing films (that were missing info and hence were discarded)
    ids_to_del_rf = set(ratings.keys()).difference(set(films.keys()))
    ids_to_del_fr = set(films.keys()).difference(set(ratings.keys()))
    ids_to_del = ids_to_del_rf.union(ids_to_del_fr)

    corrected_ratings = dict()
    for x in ratings.keys():
        if x not in ids_to_del:
            curr_rats = []
            for curr_rat in ratings[x]:
                temp_dict = dict()
                temp_dict['user_rating'] = curr_rat['user_rating']
                temp_dict['user_rating_date'] = curr_rat['user_rating_date']
                temp_dict['user_id'] = 'x' + curr_rat['user_id']
                curr_rats.append(temp_dict)
            corrected_ratings[x] = curr_rats
    ratings = corrected_ratings

    corrected_films = dict()
    for x in films.keys():
        if x not in ids_to_del:
            corrected_films[x] = films[x]
    films = corrected_films

    assert len(ratings) == len(films)

_, ratings_dict, compressed_test_ratings_dict, _, _, _, _ = preprocessing(
    ratings, films, data_type)

run_baselines(ratings_dict, compressed_test_ratings_dict, data_type)
metas = []
bounds = []
for window, filename in zip(windows, filenames):
    print(f'>>> loading {window} from {filename}')
    band, meta = load_geotiff(path_geotiff + filename, window=window)
    img_adj.append(contrast_adjust(band, (0.5, 99.5)))
    img.append(np.stack(band, axis=2))
    metas.append(meta)
    bounds.append(rasterio.transform.array_bounds(meta['height'], meta['width'],
                                                  meta['transform']))

# %% -------------------------------------------------------------------------------------------------------------
# Preprocess images
img = [preprocessing(im) for im in img]

# %% -------------------------------------------------------------------------------------------------------------
# Reshape in 2D
X = [im.reshape(-1, im.shape[2]) for im in img]

# %% -------------------------------------------------------------------------------------------------------------
# Load the fitted KNN model
with open('../models/KNN_trained.pickle', 'rb') as src:
    fitted_model = pickle.load(src)

# %% -------------------------------------------------------------------------------------------------------------
# Predict all samples and post-process them
preds = []
for i, Xi in enumerate(X):
    print(f'>>> Detecting cocoa plantations on image {i+1}')