def main(input_, format_):
    """Predicts the Netflix ratings using a Neural Network model.

    More precisely, it preprocesses the data to make it compatible with the
    NN model, which is then trained, and finally postprocesses the result to
    give the predictions in the desired format.

    Args:
        input_: The samples
        format_: The submission format file

    Returns:
        np.array: The predictions of the ratings
    """
    # Preprocessing
    X_all, y, n_users, n_movies = preprocessing(input_)

    # Build the NN model and train it
    n_factors = 4
    model = NN_model(n_users + 1, n_movies + 1, n_factors)
    _ = model.fit(x=X_all, y=y, batch_size=128, epochs=10, verbose=1)

    # Generate the predictions
    print("Generating predictions ...")
    predictions = predict(model, format_)

    return predictions
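# Hedged side note: NN_model is defined elsewhere in the project. The sketch below is only
# one plausible embedding-based architecture consistent with the call
# NN_model(n_users + 1, n_movies + 1, n_factors), assuming X_all is an integer array of
# shape (n_samples, 2) holding (user_id, movie_id) pairs; it is not the repository's model.
from tensorflow.keras.layers import Dot, Embedding, Flatten, Input, Lambda
from tensorflow.keras.models import Model


def nn_model_sketch(n_users, n_movies, n_factors):
    ids = Input(shape=(2,), dtype='int32')
    user_id = Lambda(lambda t: t[:, 0])(ids)   # first column: user index
    movie_id = Lambda(lambda t: t[:, 1])(ids)  # second column: movie index
    u = Flatten()(Embedding(n_users, n_factors)(user_id))
    m = Flatten()(Embedding(n_movies, n_factors)(movie_id))
    rating = Dot(axes=1)([u, m])               # predicted rating = <u, m>
    model = Model(inputs=ids, outputs=rating)
    model.compile(optimizer='adam', loss='mse')
    return model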
def hierarchical(df, y, n_clusters=2, scaling=True, features=0, show_dendrogram=False):
    df = preprocessing(data=df, y=df[y], perform_scale=scaling)

    if features == 0:
        df = feature_selection(df=df, target=df[y], show_process=False)
    elif features == 1:
        df = pca(df.loc[:, df.columns.difference([y])], df[y], 2, show_result=False)
        print(df)

    x = df.loc[:, df.columns.difference([y])]

    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel(y)
    plt.ylabel('distance')
    if show_dendrogram:
        dendrogram(
            linkage(x, 'ward'),  # generate the linkage matrix
            leaf_font_size=8     # font size for the x axis labels
        )
        plt.axhline(y=8)
        plt.show()

    clusters = AgglomerativeClustering(linkage="ward", n_clusters=n_clusters)
    clusters.fit(x)
    print(clusters.labels_)

    colors = np.array(['darkgrey', 'lightsalmon', 'powderblue'])

    plt.subplot(2, 2, 1)
    for i in np.unique(clusters.labels_):
        plt.scatter(x.iloc[clusters.labels_ == i, 0],
                    x.iloc[clusters.labels_ == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Hierarchical Clustering')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])

    plt.subplot(2, 2, 2)
    for i in np.unique(df[y].values):
        plt.scatter(x.iloc[df[y].values == i, 0],
                    x.iloc[df[y].values == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Ground Truth Classification')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.show()
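# Hedged side note: the dendrogram above only draws a horizontal cut at distance 8. If flat
# cluster labels at that cut are wanted, scipy can derive them from the same Ward linkage.
# Minimal self-contained sketch on synthetic data (the threshold 8 and the 'ward' method
# mirror the plot above; the data here is purely illustrative):
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

demo_points = np.random.RandomState(0).rand(20, 2)       # stand-in for `x`
Z = linkage(demo_points, 'ward')                          # same linkage as the dendrogram
labels_at_cut = fcluster(Z, t=8, criterion='distance')    # one cluster id per sample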
def input_comprison(data, y, model=None, log_matrix=True, out_path=None):
    res = []

    df = preprocessing(data=data, y=data[y])
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[0].append('Label Encoded')

    df = preprocessing(data=data, y=data[y], perform_ohe=True)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[1].append('One Hot Encoded')

    df = preprocessing(data=data, y=data[y], perform_scale=True, perform_ohe=True)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[2].append('Standard Scaled')

    df = preprocessing(data=data, y=data[y], perform_scale=True)
    df = feature_selection(df=df, target=df[y], show_process=False)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[3].append('Feature Selection')

    df = preprocessing(data=data, y=data[y], perform_scale=False)
    df = pca(df.loc[:, df.columns.difference([y])], df[y], 0.9, show_result=True)
    res.append(model(df.loc[:, df.columns.difference([y])], df[y]))
    res[4].append('PCA')

    df = pd.DataFrame(res, columns=[
        'Accuracy Score', 'Confusion Matrix', 'Training Time', 'Predict Time',
        'Processing'
    ])

    if log_matrix:
        print(model.__name__)
        print(df)

    if out_path is not None:
        df.to_csv(out_path)

    return df
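# Hedged note on the `model` argument of input_comprison: each return value gets one extra
# 'Processing' label appended and is then framed with the columns
# ['Accuracy Score', 'Confusion Matrix', 'Training Time', 'Predict Time', 'Processing'],
# so `model(X, y)` is expected to return a 4-element list in that order. A hypothetical
# compatible callable (decision tree chosen arbitrarily for illustration):
import time

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def decision_tree_example(X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    t0 = time.time()
    clf.fit(X_tr, y_tr)
    fit_time = time.time() - t0
    t0 = time.time()
    y_pred = clf.predict(X_te)
    pred_time = time.time() - t0
    return [accuracy_score(y_te, y_pred), confusion_matrix(y_te, y_pred), fit_time, pred_time]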
def dbscan(df, y, eps=0.5, min_samples=5, features=0):
    df = preprocessing(data=df, y=df[y], perform_scale=True)

    if features == 0:
        df = feature_selection(df=df, target=df[y], show_process=False)
    elif features == 1:
        df = pca(df.loc[:, df.columns.difference([y])], df[y], 0.8, show_result=False)

    x = df.loc[:, df.columns != y]

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan.fit_predict(x)

    colors = np.array(['darkgrey', 'lightsalmon', 'powderblue'])

    plt.subplot(2, 2, 1)
    for i in np.unique(clusters):
        label = 'Outlier' if i == -1 else 'Cluster ' + str(i + 1)
        plt.scatter(x.iloc[clusters == i, 0],
                    x.iloc[clusters == i, 1],
                    color=colors[i % 3],
                    label=label)
    plt.legend()
    plt.title('DBSCAN Clustering')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.subplots_adjust(wspace=0.4)

    plt.subplot(2, 2, 2)
    for i in np.unique(df[y].values):
        plt.scatter(x.iloc[df[y].values == i, 0],
                    x.iloc[df[y].values == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Ground Truth Classification')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.show()
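# Hedged side note: the eps value is simply passed in by the caller; a common way to choose
# it (not part of this function) is a k-distance plot, where the "knee" of the sorted
# distances to the min_samples-th neighbour suggests a value. Sketch on illustrative data:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import NearestNeighbors

demo = np.random.RandomState(0).rand(200, 2)   # stand-in for the scaled feature matrix
min_samples = 5
dists, _ = NearestNeighbors(n_neighbors=min_samples).fit(demo).kneighbors(demo)
plt.plot(np.sort(dists[:, -1]))                # distance to the k-th neighbour (self counted first)
plt.xlabel('points sorted by distance')
plt.ylabel('%d-th neighbour distance' % min_samples)
plt.show()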
def kmeans(df, y, n_clusters=2, features=0, show_elbow=False):
    df = preprocessing(data=df, y=df[y], perform_scale=True)

    if features == 0:
        df = feature_selection(df=df, target=df[y], show_process=False)
    elif features == 1:
        df = pca(df.loc[:, df.columns.difference([y])], df[y], 0.8, show_result=False)

    x = df.loc[:, df.columns.difference([y])]

    # Applying kmeans to the dataset / Creating the kmeans classifier
    kmeans = KMeans(n_clusters=n_clusters)
    clusters = kmeans.fit_predict(x.values)

    # 2D plot
    colors = np.array(['darkgrey', 'lightsalmon', 'powderblue'])

    plt.subplot(2, 2, 1)
    for i in np.unique(clusters):
        plt.scatter(x.iloc[clusters == i, 0],
                    x.iloc[clusters == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    # Plotting the centroids of the clusters
    plt.scatter(kmeans.cluster_centers_[:, 0],
                kmeans.cluster_centers_[:, 1],
                s=100, c='lightskyblue', label='Centroids')
    plt.legend()
    plt.title('K-Means Clustering')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])

    plt.subplot(2, 2, 2)
    for i in np.unique(df[y].values):
        plt.scatter(x.iloc[df[y].values == i, 0],
                    x.iloc[df[y].values == i, 1],
                    color=colors[i % 3],
                    label='Cluster ' + str(i + 1))
    plt.legend()
    plt.title('Ground Truth Classification')
    plt.xlabel(x.columns[0])
    plt.ylabel(x.columns[1])
    plt.show()

    # Part 2: Find the optimum number of clusters for k-means
    if show_elbow:
        wcss = []
        # Trying kmeans for k=1 to k=10
        for i in range(1, 11):
            kmeans = KMeans(n_clusters=i, init='k-means++')
            kmeans.fit(x)
            wcss.append(kmeans.inertia_)

        # Plotting the results onto a line graph, allowing us to observe 'The elbow'
        plt.plot(range(1, 11), wcss)
        plt.title('The elbow method')
        plt.xlabel('Number of clusters')
        plt.ylabel('WCSS')  # within cluster sum of squares
        plt.show()
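# Hedged side note: besides the WCSS elbow above, the silhouette score is another common way
# to compare candidate k values (not used in this function). Sketch on illustrative data:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

demo = np.random.RandomState(0).rand(200, 2)   # stand-in for the scaled feature matrix
for k in range(2, 7):
    labels = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0).fit_predict(demo)
    print(k, silhouette_score(demo, labels))   # higher is better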
def get_results(city, no):
    processing.preprocessing()
    pre = open('preprocess1.txt')

    # Read the preprocessed reviews, one per line
    train_set = []
    line = pre.readline()
    while line != '':
        train_set.append(line)
        line = pre.readline()

    # Load the sentiment lexicons
    pos = open('positive-words.txt')
    neg = open('negative-words.txt')
    positive = []
    negative = []
    for i in pos.read().split():
        positive.append(i)
    for j in neg.read().split():
        negative.append(j)

    stopWords = stopwords.words('english')
    vectorizer = CountVectorizer(stop_words=stopWords)
    transformer = TfidfTransformer()

    # Build the term-count matrix and its tf-idf weighting
    v = vectorizer.fit_transform(train_set)
    tfidf = transformer.fit_transform(v)

    # Inverse the vectorizer vocabulary to be able to map feature indices back to words
    feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print("\n")

    # Topic extraction with NMF on the full tf-idf matrix
    nmf = decomposition.NMF(n_components=3, init='random',
                            random_state=0).fit(tfidf.todense())
    topic_list = []
    l = int(len(feature_names) / 5)
    for topic_idx, topic in enumerate(nmf.components_):
        topic_list.append(topic.argsort()[:-l - 1:-1])

    # Assign each review to its dominant topic
    train_target = []
    for arr in v.toarray():
        train_target.append(calculate_Topic(arr, topic_list))

    # Keep the 2*l best features according to a chi-squared test
    ch2 = SelectKBest(chi2, k=l * 2)
    X_train = ch2.fit_transform(tfidf.todense(), train_target)
    cs = ch2.scores_.argsort()[::-1]
    cs_featurenames = []
    cs = cs[:l * 2]
    for x in cs:
        cs_featurenames.append(feature_names[x])
    print(cs_featurenames)
    print("\n")

    # Re-run NMF on the reduced feature space and print the top words per topic
    nmf1 = decomposition.NMF(n_components=3, init='random',
                             random_state=0).fit(X_train)
    topic_list = []
    l = int(len(feature_names) / 5)
    for topic_idx, topic in enumerate(nmf1.components_):
        z = topic.argsort()[:-l - 1:-1]
        topic_list.append(z)
        print("Topic #%d:---------------------------------------" % topic_idx)
        for y in z:
            print(cs_featurenames[y])

    # Re-assign each review to its dominant topic in the reduced space
    train_target = []
    for arr in X_train:
        train_target.append(calculate_Topic(arr, topic_list))

    # Count how many reviews fall into each topic
    train_count = [0] * 4
    for x in train_target:
        train_count[x] = train_count[x] + 1

    # Train the three classifiers on the selected features
    clf = MultinomialNB()
    clf2 = LinearSVC()
    clf1 = NearestCentroid()
    clf.fit(X_train, train_target)
    clf1.fit(X_train, train_target)
    clf2.fit(X_train, train_target)

    # Score every hotel of the city and keep the `no` best ones
    dic = {}
    hotels = read_hotels(city, dic)
    temp = []
    for each in hotels:
        temp.append(calculate(vectorizer, transformer, train_count, ch2, each,
                              clf, clf1, clf2, positive, negative, train_set))

    res = []
    temp1 = numpy.array(temp).argsort()[::-1]
    print("Top %d recommendations are as follows [in the FORMAT Index, (Hotel name, Location), Score]:\n" % no)
    for g in temp1[:no]:
        print(g, dic[g], temp[g])
        res.append(dic[g])
    return res
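# Hedged usage sketch: get_results expects a city name understood by read_hotels() plus the
# number of recommendations wanted; it prints the ranked hotels and returns them. The city
# below is purely illustrative, not a value taken from the project.
if __name__ == '__main__':
    top_hotels = get_results('jaipur', 5)
    print(top_hotels)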
# ---------------------------------------------------------------------------------------------------------------
# %% Paths
path_geotiff = '../Data/images/'
path_shp = '../Data/shapefiles/'
image_name_train = 'geotiffs/2018_12_29_Tai_bigger.tiff'
label_path_train = 'labels/Tai/segmentation.shp'

# ---------------------------------------------------------------------------------------------------------------
# %% Load plantation image
band, meta_train = load_geotiff(path_geotiff + image_name_train,
                                window=rasterio.windows.Window(1490, 4020, 530, 350))

# ---------------------------------------------------------------------------------------------------------------
# %% Generate new features
img_train = preprocessing(np.stack(band, axis=2))

# %% Visualize new features
Nband_kept = 10 - 1
fig, axs = plt.subplots(4, 4, figsize=(12, 10),
                        gridspec_kw={'hspace': 0.1, 'wspace': 0.1})
show_image(img_train[:, :, Nband_kept + 1], meta_train['transform'],
           ax=axs[0][0], cmap='PiYG')
axs[0][0].set_title('NDVI', fontsize=10)
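# Hedged side note: preprocessing() is defined elsewhere in the project; the NDVI band it
# appends is, by the usual definition, the normalized difference of the near-infrared and
# red bands. Sketch of that standard formula (the band indices here are assumptions for
# illustration, not the project's actual layout):
import numpy as np


def ndvi_sketch(img, red_idx=2, nir_idx=3):
    red = img[:, :, red_idx].astype('float32')
    nir = img[:, :, nir_idx].astype('float32')
    return (nir - red) / (nir + red + 1e-8)   # small epsilon avoids division by zero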
                temp_dict = dict()
                temp_dict['user_rating'] = curr_rat['user_rating']
                temp_dict['user_rating_date'] = curr_rat['user_rating_date']
                temp_dict['user_id'] = 'x' + curr_rat['user_id']
                curr_rats.append(temp_dict)
            corrected_ratings[x] = curr_rats
    ratings = corrected_ratings

    corrected_films = dict()
    for x in films.keys():
        if x not in ids_to_del:
            corrected_films[x] = films[x]
    films = corrected_films

    assert len(ratings) == len(films)

films, ratings_dict, compressed_test_ratings_dict, sims, movies_all_genres_matrix, \
    movies_all_directors_matrix, movies_all_actors_matrix = preprocessing(
        ratings, films, movielens_data)

start = time.time()

MUR = 0.1
MUG = 0.6
MUA = 0.1
MUD = 0.1

nr_predictions, accuracy, rmse, mae, precision, recall, f1 = predictions(
    MUR, MUG, MUA, MUD, films, compressed_test_ratings_dict, ratings_dict,
    sims, movies_all_genres_matrix, movies_all_directors_matrix,
    movies_all_actors_matrix, movielens_data)

# print results
print("Number of user-items pairs: %d" % nr_predictions)
print("Accuracy: %.2f " % accuracy)
sns.barplot(y=res['Processing'], x=res['Predict Time'], hue=res.model,
            data=res, orient='h', ax=axs[2])
plt.legend(fontsize='small')
plt.show()

# Regression: Test with wine dataset using linear regression and neural network
# Test with one hot encoded mushroom dataset using linear regression
df = pd.read_csv('data/wine/winequality-red.csv')
y = 'quality'
df = preprocessing(data=df, y=df[y], perform_scale=True)
print(linear_regression(df.loc[:, df.columns.difference(['quality'])],
                        df['quality'], log_result=False))
print(neural_network(df, df[y], is_regression=True, log_result=False, epochs=20))

df = pd.read_csv('data/mushroom/mushrooms.csv')
y = 'class'
df = df.dropna()
df = df.reset_index(drop=True)
df = preprocessing(data=df, y=df[y], perform_scale=True, perform_ohe=True,
def infer_on_stream(args, client):
    """
    Initialize the inference network, stream video to network,
    and output stats and video.

    :param args: Command line arguments parsed by `build_argparser()`
    :param client: MQTT client
    :return: None
    """
    # Initialise the class
    infer_network = Network()
    # Set Probability threshold for detections
    # prob_threshold = args.prob_threshold

    ### Load the model through `infer_network` ###
    infer_network.load_model(args.model)
    # Get input shape
    input_shape = infer_network.get_input_shape()

    ### Handle the input stream ###
    input_stream = VideoCapture(args.input)
    input_stream.open(args.input)

    # Output stream setup
    fourcc = VideoWriter_fourcc(*'mp4v')
    frames = 24
    # Grab the shape of the input, since it's required for cv.VideoWriter;
    # without it, cv gets a buffer size error and crashes
    width = int(input_stream.get(3))
    height = int(input_stream.get(4))
    out = VideoWriter('/app/out/out.mp4', fourcc, frames, (width, height))

    ### Loop until stream is over ###
    while input_stream.isOpened():
        ### Read from the video capture ###
        flag, frame = input_stream.read()
        if not flag:
            break
        key_pressed = waitKey(60)

        ### Pre-process the image as needed ###
        preprocessed_frame = preprocessing(frame, input_shape)

        ### Start asynchronous inference for specified request ###
        infer_network.exec_net(preprocessed_frame, 0)

        ### Wait for the result ###
        # Parameter is the request number, not a wait time
        status = infer_network.wait(0)

        ### Get the results of the inference request ###
        if status == 0:
            output_shape = infer_network.get_output(0)
            drawn_frame = draw_boxes(frame, output_shape, args, width, height)
            out.write(drawn_frame)
            sys.stdout.buffer.write(drawn_frame)
            sys.stdout.flush()

        ### TODO: Extract any desired stats from the results ###

        ### TODO: Calculate and send relevant information on ###
        ### current_count, total_count and duration to the MQTT server ###
        ### Topic "person": keys of "count" and "total" ###
        ### Topic "person/duration": key of "duration" ###

        ### TODO: Send the frame to the FFMPEG server ###

        ### Write an output image if `single_image_mode` ###
        # imwrite('out/output.png', frame)

        # Break if escape key pressed
        if key_pressed == 27:
            break

    # Release the out writer, capture, and destroy any OpenCV windows
    out.release()
    input_stream.release()
    destroyAllWindows()
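# Hedged side note: preprocessing(frame, input_shape) is defined elsewhere in the project.
# For OpenVINO-style models the usual steps are a resize to the network's height/width, an
# HWC->CHW transpose, and adding a batch dimension; the sketch below shows that common
# pattern and is an assumption, not the repository's actual helper.
from cv2 import resize


def preprocessing_sketch(frame, input_shape):
    # input_shape is assumed to be (N, C, H, W)
    _, _, h, w = input_shape
    resized = resize(frame, (w, h))        # cv2 expects (width, height)
    chw = resized.transpose((2, 0, 1))     # HWC -> CHW
    return chw.reshape(1, *chw.shape)      # add the batch dimension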
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 17 20:23:57 2019

@author: pushap
"""
import numpy as np

from processing import preprocessing

path = "/home/pushap/newdata/capturedImages/"
x, y = preprocessing(path)
np.save('x.npy', x)
np.save('y.npy', y)
if data_type != 'netflix':
    # remove from ratings the missing films (that were missing info and hence were discarded)
    ids_to_del_rf = set(ratings.keys()).difference(set(films.keys()))
    ids_to_del_fr = set(films.keys()).difference(set(ratings.keys()))
    ids_to_del = ids_to_del_rf.union(ids_to_del_fr)

    corrected_ratings = dict()
    for x in ratings.keys():
        if x not in ids_to_del:
            curr_rats = []
            for curr_rat in ratings[x]:
                temp_dict = dict()
                temp_dict['user_rating'] = curr_rat['user_rating']
                temp_dict['user_rating_date'] = curr_rat['user_rating_date']
                temp_dict['user_id'] = 'x' + curr_rat['user_id']
                curr_rats.append(temp_dict)
            corrected_ratings[x] = curr_rats
    ratings = corrected_ratings

    corrected_films = dict()
    for x in films.keys():
        if x not in ids_to_del:
            corrected_films[x] = films[x]
    films = corrected_films

    assert len(ratings) == len(films)

_, ratings_dict, compressed_test_ratings_dict, _, _, _, _ = preprocessing(
    ratings, films, data_type)

run_baselines(ratings_dict, compressed_test_ratings_dict, data_type)
metas = []
bounds = []
for window, filename in zip(windows, filenames):
    print(f'>>> loading {window} from {filename}')
    band, meta = load_geotiff(path_geotiff + filename, window=window)
    img_adj.append(contrast_adjust(band, (0.5, 99.5)))
    img.append(np.stack(band, axis=2))
    metas.append(meta)
    bounds.append(rasterio.transform.array_bounds(meta['height'], meta['width'],
                                                  meta['transform']))

# %% -------------------------------------------------------------------------------------------------------------
# Preprocess images
img = [preprocessing(im) for im in img]

# %% -------------------------------------------------------------------------------------------------------------
# Reshape in 2D
X = [im.reshape(-1, im.shape[2]) for im in img]

# %% -------------------------------------------------------------------------------------------------------------
# Load the fitted KNN model
with open('../models/KNN_trained.pickle', 'rb') as src:
    fitted_model = pickle.load(src)

# %% -------------------------------------------------------------------------------------------------------------
# Predict all samples and post-process them
preds = []
for i, Xi in enumerate(X):
    print(f'>>> Detecting cocoa plantations on image {i+1}')