def efa(nfact, rotation="none", method="ml", cor=True, period_min=60):
    """Fit a factor analysis in R and export signatures, scores and loadings.

    Calls the R function ``fa`` through ``ro.r`` on the R-side matrix ``x``
    (``fa`` is presumably ``psych::fa`` -- confirm), stores the result in the
    R variable ``fit``, prints it, and then hands the loadings, scores and
    uniquenesses to the project's analysis/plotting helpers.

    Relies on names defined outside this function: ``ro``, ``np``,
    ``utilities``, ``label``, ``output_folder`` and the ``analyze_*`` /
    ``plot_*`` helpers.

    NOTE(review): a second ``efa`` with a different signature is defined
    later in this file; if both live in the same module the later one
    shadows this one -- confirm which is intended.

    Args:
        nfact: number of factors, interpolated into the R call as
            ``nfactors`` (a string or an int is accepted).
        rotation: value sent to R as ``rotate``.
        method: value sent to R as ``fm``.
        cor: when true the R call uses ``covar=F, cor="cor"``; otherwise
            ``covar=T, cor="cov"``.
        period_min: forwarded unchanged to
            ``utilities.plot_time_signatures`` and ``plot_data_by_factor``.

    Returns:
        Tuple ``(loadings_df, scores_df, uniquenesses_df)`` as produced by
        ``analyze_loadings``, ``analyze_scores`` and
        ``analyze_uniquenesses``.
    """
    run_tag = method + "_" + rotation
    signatures_dir = run_tag + "/signatures/"

    # A scree / parallel-analysis step for choosing nfact was sketched here
    # in the past (R: eigen, parallel, nScree, plotnScree); it is not run.

    print("\n\n\n***Performing factorial analysis with %s fitting method, "
          "%s rotational method...\n" % (method, rotation))
    # .format() (instead of "+") lets nfact be an int as well as a string.
    fit_cmd = (
        'fit <- fa(r=x,nfactors={nfact},n.iter=1, min.err = 0.001, max.iter = 50, '
        'rotate="{rotation}", scores="regression", '
        'residuals=TRUE, SMC=TRUE, missing=FALSE,impute="median",'
        'warnings=TRUE, fm="{method}",'
        'alpha=.1,p=.05,oblique.scores=FALSE,use="pairwise", '
        'covar= {covar}, cor = "{cor_kind}")'
    ).format(
        nfact=nfact,
        rotation=rotation,
        method=method,
        covar="F" if cor else "T",
        cor_kind="cor" if cor else "cov",
    )
    ro.r(fit_cmd)

    print("\nPrinting a summary of the fitting matrix with method %s and rotation %s...\n"
          % (method, rotation))
    ro.r('summary(fit)')  # print variance accounted for
    print("\n")
    print("\nPrinting the fitting matrix with method %s and rotation %s...\n"
          % (method, rotation))
    ro.r('print(fit)')
    print("\n")
    print("\nPrinting the loadings matrix with method %s and %s rotation...\n"
          % (method, rotation))
    ro.r('print(loadings(fit), digits=2, cutoff=0.4)')
    print("\n")

    loadings = np.asarray(ro.r('fit$loadings'))
    factor_names = np.asarray(ro.r('colnames(fit$loadings)'))
    services_names = np.asarray(ro.r('rownames(fit$loadings)'))

    traffic_matrix = np.asarray(ro.r('x'))
    traffic_snapshots = np.asarray(ro.r('rownames(x)'))
    traffic_services = np.asarray(ro.r('colnames(x)'))

    # One pass without a threshold, then one per loading threshold. Each pass
    # regenerates the signatures and plots them raw and with normalized_y.
    # The dictionary keys are the ones the original hard-coded blocks used.
    all_dict = {}
    for threshold in (None, 0.4, 0.6, 0.8):
        if threshold is None:
            th_kwargs = {}
            key_suffix = ""
        else:
            th_kwargs = {"threshold": threshold}
            key_suffix = "_th_%s" % threshold
        signatures, signatures_df, _ = utilities.generate_signatures(
            traffic_matrix, traffic_snapshots, traffic_services, loadings,
            factor_names, label, output_folder, run_tag, **th_kwargs)
        all_dict["signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label,
            output_folder, signatures_dir, period_min, **th_kwargs)
        all_dict["norm_signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label,
            output_folder, signatures_dir, period_min, normalized_y=True,
            **th_kwargs)

    # Analyzing scores
    scores = np.asarray(ro.r('fit$scores'))
    population_names = np.asarray(ro.r('rownames(fit$scores)'))
    score_factor_names = np.asarray(ro.r('colnames(fit$scores)'))
    scores_df = analyze_scores(scores, population_names, score_factor_names, run_tag)
    print("Plotting scores (%s and rotation %s)...\n" % (method, rotation))
    plot_factor_scores(factor_names, scores_df, run_tag, False)
    plot_factor_scores(factor_names, scores_df, run_tag, True)
    # .values replaces the deprecated DataFrame.as_matrix().
    all_dict["scores"] = utilities.plot_time_signatures(
        scores_df[factor_names].values, scores_df["snapshot_name"],
        factor_names, label, output_folder, run_tag + "/scores/", period_min)

    plot_data_by_factor(all_dict, factor_names, label, output_folder,
                        run_tag + "/summary_plots", period_min)

    # Analyzing loadings
    print("Saving loadings to file (%s and rotation %s)...\n" % (method, rotation))
    loadings_df = analyze_loadings(loadings, factor_names, services_names, run_tag)
    print("Plotting loadings (%s and rotation %s)...\n" % (method, rotation))
    plot_factor_loadings(factor_names, loadings_df, run_tag, False)
    plot_factor_loadings(factor_names, loadings_df, run_tag, True)

    # Analyzing uniqueness
    print("\nPrinting the uniqueness matrix with method %s and %s rotation...\n"
          % (method, rotation))
    ro.r('print(fit$uniquenesses)')
    print("\n")
    uniquenesses = np.asarray(ro.r('fit$uniquenesses'))
    uniquenesses_names = np.asarray(ro.r('names(fit$uniquenesses)'))
    print("Saving uniquenesses to file (%s and rotation %s)...\n" % (method, rotation))
    uniquenesses_df = analyze_uniquenesses(uniquenesses, uniquenesses_names, run_tag)
    print("Plotting uniquenesses (%s and rotation %s)...\n" % (method, rotation))
    plot_factor_loadings(["uniquenesses"], uniquenesses_df, run_tag, False, True)

    return loadings_df, scores_df, uniquenesses_df
def nnmf(nfact, method="scd", loss="mse", period_min=60):
    """Run a non-negative matrix factorization in R and export its results.

    Calls the R function ``nnmf`` through ``ro.r`` on the R-side matrix ``x``
    (presumably ``NNLM::nnmf`` -- confirm), stores the result in the R
    variable ``fit``, prints it, writes convergence and heatmap JPEGs under
    ``output_folder``, and passes ``t(fit$H)`` (loadings) and ``fit$W``
    (scores) to the project's analysis/plotting helpers.

    Relies on names defined outside this function: ``ro``, ``np``,
    ``utilities``, ``label``, ``output_folder`` and the ``analyze_*`` /
    ``plot_*`` helpers.

    Args:
        nfact: factorization rank, interpolated as the second argument of
            the R ``nnmf`` call (a string or an int is accepted).
        method: value sent to R as ``method``; also used as the prefix of
            the generated factor names and of the output names.
        loss: value sent to R as ``loss``; also read back as ``fit$<loss>``
            for the convergence plot.
        period_min: forwarded unchanged to
            ``utilities.plot_time_signatures`` and ``plot_data_by_factor``.

    Returns:
        Tuple ``(loadings_df, scores_df)`` as produced by
        ``analyze_loadings`` and ``analyze_scores``.
    """
    run_tag = method + "_" + loss
    signatures_dir = run_tag + "/signatures/"
    plot_prefix = output_folder + "/" + run_tag
    with_method_and_loss = " with method " + method + " and loss " + loss + "...\n"

    print("\n\n\n***Performing non-negative-matrix factorization with " + method
          + " method and " + loss + " loss function...\n")
    # .format() (instead of "+") lets nfact be an int as well as a string.
    ro.r(
        'fit<-nnmf(x, {nfact}, max.iter = 100000, rel.tol = -1, '
        'method = "{method}",inner.max.iter = 1, loss="{loss}", n.threads=20);'
        .format(nfact=nfact, method=method, loss=loss)
    )

    print("\nPrinting a summary of the fitting matrix" + with_method_and_loss)
    ro.r('summary(fit)')  # print variance accounted for
    print("\n")
    print("\nPrinting the fitting matrix" + with_method_and_loss)
    ro.r('print(fit)')
    print("\n")
    print("\nPrinting the score matrix" + with_method_and_loss)
    ro.r('print(fit$W, digits=2, cutoff=0.4)')
    print("\n")
    print("\nPrinting the factor matrix" + with_method_and_loss)
    ro.r('print(t(fit$H), digits=2, cutoff=0.4)')
    print("\n")

    print("\nPlotting convergence to epoch" + with_method_and_loss)
    ro.r('jpeg(file = "' + plot_prefix + '_convergence.jpeg")')
    ro.r('plot(NULL, xlim = c(1, 3000), ylim = c(0.15, 0.45), xlab = "Epochs", ylab = "'
         + loss + '");')
    ro.r('lines(cumsum(fit$average.epochs), fit$' + loss + ');')
    ro.r('dev.off()')
    print("\n")

    print("\nPrinting the loss and the cumsum" + with_method_and_loss)
    ro.r('print(fit$loss, digits=2, cutoff=0.4)')
    ro.r('print(cumsum(fit$average.epochs), digits=2, cutoff=0.4)')
    print("\n")

    print("\nPlotting the heatmaps" + with_method_and_loss)
    # NOTE(review): the xlab/ylab texts below are kept as they were; whether
    # they match the row/column orientation of W and H should be confirmed.
    ro.r('jpeg(file = "' + plot_prefix + '_heatmap_W.jpeg")')
    ro.r('heatmap(fit$W, Colv = NA, xlab = "samples", ylab = "factors", margins = c(2,2),'
         'labRow = "", labCol = "", scale = "column", col = cm.colors(100));')
    ro.r('dev.off()')
    ro.r('jpeg(file = "' + plot_prefix + '_heatmap_H.jpeg")')
    # Fix: the empty labels used to be written with single quotes inside a
    # single-quoted Python literal, which ended the literal early and sent
    # `labRow = , labCol = ,` to R. They are now real empty R strings, as in
    # the W heatmap above.
    ro.r('heatmap(fit$H, Rowv = NA, ylab = "variables", xlab = "factors", margins = c(2,2),'
         'labRow = "", labCol = "", scale = "row", col = cm.colors(100));')
    ro.r('dev.off()')
    print("\n")

    loadings = np.asarray(ro.r('t(fit$H)'))
    print("Loadings are: " + str(loadings))
    # Factors are named "<method><column index>", one per column of loadings.
    factor_names = np.asarray([method + str(i) for i in range(loadings.shape[1])])
    print("Factor names are: " + str(factor_names))
    services_names = np.asarray(ro.r('colnames(fit$H)'))
    print("Service names are: " + str(services_names))

    traffic_matrix = np.asarray(ro.r('x'))
    traffic_snapshots = np.asarray(ro.r('rownames(x)'))
    traffic_services = np.asarray(ro.r('colnames(x)'))

    # One pass without a threshold, then one per threshold. Thresholded
    # passes also set max_factor_value=True, exactly as the original
    # hard-coded blocks did; the dictionary keys are unchanged.
    all_dict = {}
    for threshold in (None, 0.4, 0.6, 0.8):
        if threshold is None:
            th_kwargs = {}
            key_suffix = ""
        else:
            th_kwargs = {"threshold": threshold, "max_factor_value": True}
            key_suffix = "_th_%s" % threshold
        signatures, signatures_df, _ = utilities.generate_signatures(
            traffic_matrix, traffic_snapshots, traffic_services, loadings,
            factor_names, label, output_folder, run_tag, **th_kwargs)
        all_dict["signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label,
            output_folder, signatures_dir, period_min, **th_kwargs)
        all_dict["norm_signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label,
            output_folder, signatures_dir, period_min, normalized_y=True,
            **th_kwargs)

    # Analyzing scores
    scores = np.asarray(ro.r('fit$W'))
    print("Scores are: " + str(scores))
    population_names = np.asarray(ro.r('rownames(fit$W)'))
    print("Population names are: " + str(population_names))
    score_factor_names = np.asarray([method + str(i) for i in range(loadings.shape[1])])
    print("Score factor names are: " + str(score_factor_names))
    scores_df = analyze_scores(scores, population_names, score_factor_names, run_tag)
    print("Plotting scores (" + method + " and loss " + loss + ")...\n")
    plot_factor_scores(factor_names, scores_df, run_tag, False)
    plot_factor_scores(factor_names, scores_df, run_tag, True)
    # .values replaces the deprecated DataFrame.as_matrix().
    all_dict["scores"] = utilities.plot_time_signatures(
        scores_df[factor_names].values, scores_df["snapshot_name"],
        factor_names, label, output_folder, run_tag + "/scores/", period_min)

    plot_data_by_factor(all_dict, factor_names, label, output_folder,
                        run_tag + "/summary_plots", period_min)

    # Analyzing loadings
    print("Saving loadings to file (" + method + " and loss " + loss + ")...\n")
    loadings_df = analyze_loadings(loadings, factor_names, services_names, run_tag)

    return loadings_df, scores_df
def efa(snapshot_filename, nfact, output_folder, rotation="none", method="ml", cor=True, period_min=60):
    """Fit a factor analysis in R and export per-category results.

    Calls the R function ``fa`` through ``ro.r`` on the R-side matrix ``x``
    (``fa`` is presumably ``psych::fa`` -- confirm) and stores the result in
    the R variable ``fit``. Then, for every entry ``c`` of the module-level
    ``CATEGORIES``, it selects the rows of ``x`` and of ``fit$scores`` whose
    row name contains ``",<c>,"`` and writes signatures, scores, loadings and
    uniquenesses under ``<output_folder>/<method>_<rotation>/<c>/``.
    Finally it writes ``all_loadings``, ``all_scores`` and
    ``all_uniquenesses`` CSV files under ``<output_folder>/<method>_<rotation>/``.

    Relies on names defined outside this function: ``ro``, ``np``, ``pd``,
    ``os``, ``utilities``, ``label``, ``CATEGORIES`` and the ``analyze_*`` /
    ``plot_*`` helpers.

    Args:
        snapshot_filename: not used by this function's body.
        nfact: number of factors, interpolated into the R call as
            ``nfactors`` (a string or an int is accepted).
        output_folder: root folder for every file written here.
        rotation: value sent to R as ``rotate``.
        method: value sent to R as ``fm``.
        cor: when true the R call uses ``covar=F, cor="cor"``; otherwise
            ``covar=T, cor="cov"``.
        period_min: forwarded unchanged to
            ``utilities.plot_time_signatures``.

    Returns:
        Tuple ``(loadings_df, full_scores_df, uniquenesses_df)``.
        ``full_scores_df`` stacks the per-category score frames with an
        added ``"category"`` column. If no category yields any row, nothing
        is exported and ``(None, <empty DataFrame>, None)`` is returned
        (previously this case failed with an unbound-variable error).
    """
    run_tag = method + "_" + rotation

    # A scree / parallel-analysis step for choosing nfact was sketched here
    # in the past (R: eigen, parallel, nScree, plotnScree); it is not run.

    print("\n\n\n***Performing factorial analysis with %s fitting method, "
          "%s rotational method...\n" % (method, rotation))
    # .format() (instead of "+") lets nfact be an int as well as a string.
    fit_cmd = (
        'fit <- fa(r=x,nfactors={nfact},n.iter=1, min.err = 0.001, max.iter = 50, '
        'rotate="{rotation}", scores="regression", '
        'residuals=TRUE, SMC=TRUE, missing=FALSE,impute="median",'
        'warnings=TRUE, fm="{method}",'
        'alpha=.1,p=.05,oblique.scores=FALSE,use="pairwise", '
        'covar= {covar}, cor = "{cor_kind}")'
    ).format(
        nfact=nfact,
        rotation=rotation,
        method=method,
        covar="F" if cor else "T",
        cor_kind="cor" if cor else "cov",
    )
    ro.r(fit_cmd)

    print("\nPrinting a summary of the fitting matrix with method %s and rotation %s...\n"
          % (method, rotation))
    ro.r('summary(fit)')  # print variance accounted for
    print("\n")
    print("\nPrinting the fitting matrix with method %s and rotation %s...\n"
          % (method, rotation))
    ro.r('print(fit)')
    print("\n")
    print("\nPrinting the loadings matrix with method %s and %s rotation...\n"
          % (method, rotation))
    ro.r('print(loadings(fit), digits=2, cutoff=0.4)')
    print("\n")

    loadings = np.asarray(ro.r('fit$loadings'))
    factor_names = np.asarray(ro.r('colnames(fit$loadings)'))
    cell_ids = np.asarray(ro.r('rownames(fit$loadings)'))

    traffic_matrix = np.asarray(ro.r('x'))
    traffic_snapshots = np.asarray(ro.r('rownames(x)'))
    traffic_base_stations = np.asarray(ro.r('colnames(x)'))

    # These reads do not depend on the category, so fetch them from R once
    # instead of once per loop iteration.
    scores = np.asarray(ro.r('fit$scores'))
    population_names = np.asarray(ro.r('rownames(fit$scores)'))
    score_factor_names = np.asarray(ro.r('colnames(fit$scores)'))
    uniquenesses = np.asarray(ro.r('fit$uniquenesses'))
    uniquenesses_names = np.asarray(ro.r('names(fit$uniquenesses)'))

    per_category_scores = []
    loadings_df = None
    uniquenesses_df = None

    for c in CATEGORIES:
        # Row names look like "2016-09-05.Monday,00:00:00,NI,"
        searched_category = ',' + c + ','
        subfolder = run_tag + "/" + c + "/"
        category_dir = os.path.dirname(output_folder + "/" + subfolder)
        if not os.path.exists(category_dir):
            os.makedirs(category_dir)

        indices, c_traffic_snapshots = _efa_rows_for_category(
            traffic_snapshots, searched_category)
        if not indices:
            print("*** PROBLEM HERE: no element for category " + c)
            continue
        c_traffic_matrix = traffic_matrix[indices]

        # One pass without a threshold, then one per loading threshold; each
        # pass plots the signatures raw and with normalized_y.
        for threshold in (None, 0.4, 0.6, 0.8):
            th_kwargs = {} if threshold is None else {"threshold": threshold}
            signatures, signatures_df, _ = utilities.generate_signatures(
                c_traffic_matrix, c_traffic_snapshots, traffic_base_stations,
                loadings, factor_names, label, output_folder, subfolder,
                **th_kwargs)
            utilities.plot_time_signatures(
                signatures, signatures_df["index"], factor_names, label,
                output_folder, subfolder, period_min, **th_kwargs)
            utilities.plot_time_signatures(
                signatures, signatures_df["index"], factor_names, label,
                output_folder, subfolder, period_min, normalized_y=True,
                **th_kwargs)

        # Analyzing scores
        score_indices, c_population_names = _efa_rows_for_category(
            population_names, searched_category)
        if not score_indices:
            print("*** PROBLEM HERE: no element for category " + c)
            continue
        c_scores = scores[score_indices]
        scores_df = analyze_scores(c_scores, c_population_names,
                                   score_factor_names, output_folder, subfolder)
        scores_df["category"] = c
        per_category_scores.append(scores_df)
        print("Plotting scores (%s and rotation %s)...\n" % (method, rotation))
        plot_factor_scores(factor_names, scores_df, output_folder, subfolder, False)
        plot_factor_scores(factor_names, scores_df, output_folder, subfolder, True)

        # Analyzing loadings
        print("Saving loadings to file (%s and rotation %s)...\n" % (method, rotation))
        loadings_df = analyze_loadings(loadings, factor_names, cell_ids,
                                       output_folder, subfolder)

        # Analyzing uniqueness
        print("\nPrinting the uniqueness matrix with method %s and %s rotation...\n"
              % (method, rotation))
        ro.r('print(fit$uniquenesses)')
        print("\n")
        print("Saving uniquenesses to file (%s and rotation %s)...\n" % (method, rotation))
        uniquenesses_df = analyze_uniquenesses(uniquenesses, uniquenesses_names,
                                               output_folder, subfolder)

    if loadings_df is None:
        # No category produced any data: there is nothing to export, and the
        # per-category frames the export needs were never created.
        print("*** PROBLEM HERE: no category produced results, nothing exported")
        return None, pd.DataFrame(), None

    # A single concat replaces the repeated DataFrame.append inside the loop.
    full_scores_df = pd.concat(per_category_scores, ignore_index=True)

    label_suffix = "_" + label if label is not None else ""
    export_prefix = output_folder + "/" + run_tag + "/all_"
    loadings_df.to_csv(export_prefix + "loadings" + label_suffix + ".csv",
                       header=True, sep=",", index=False)
    full_scores_df.to_csv(export_prefix + "scores" + label_suffix + ".csv",
                          header=True, sep=",", index=False)
    uniquenesses_df.to_csv(export_prefix + "uniquenesses" + label_suffix + ".csv",
                           header=True, sep=",", index=False)

    return loadings_df, full_scores_df, uniquenesses_df


def _efa_rows_for_category(names, marker):
    """Return (indices, short names) for the entries of names containing marker.

    Each short name is the first two comma-separated fields of the original
    name, re-joined with a comma.
    """
    indices = []
    short_names = []
    for position, name in enumerate(names):
        if marker in name:
            fields = name.split(",")
            indices.append(position)
            short_names.append(fields[0] + "," + fields[1])
    return indices, short_names