示例#1
0
def efa(nfact, rotation="none", method="ml", cor=True, period_min=60):
    """Fit an exploratory factor analysis in R and export its artefacts.

    Runs ``fit <- fa(...)`` on the R global ``x`` through ``ro.r``
    (presumably rpy2's ``robjects`` -- confirm against the file imports),
    prints the fit, then derives signatures, scores, loadings and
    uniquenesses through the project helpers.

    Besides its parameters, this function reads the module-level names
    ``label`` and ``output_folder`` and leaves the R global ``fit`` set.

    Args:
        nfact: Number of factors, inserted as ``nfactors=`` in the R call.
            Integers and numeric strings are both accepted.
        rotation: Value passed to ``fa`` as ``rotate=``; also used in the
            ``<method>_<rotation>`` tag that names output sub-folders.
        method: Value passed to ``fa`` as ``fm=``.
        cor: When true the R call uses ``covar=F, cor="cor"``; otherwise
            ``covar=T, cor="cov"``.
        period_min: Forwarded unchanged to the plotting helpers
            (presumably the snapshot period in minutes -- TODO confirm).

    Returns:
        Tuple ``(loadings_df, scores_df, uniquenesses_df)`` as returned by
        ``analyze_loadings``, ``analyze_scores`` and ``analyze_uniquenesses``.
    """
    # The number of factors is chosen by the caller; an eigenvalue/scree based
    # estimate (eigen, parallel, nScree) used to be sketched here in comments.
    run_tag = method + "_" + rotation
    signatures_subfolder = run_tag + "/signatures/"

    print("\n\n\n***Performing factorial analysis with " + method + " fitting method, " + rotation + " rotational method...\n")
    # str(nfact) lets callers pass an int; plain concatenation raised TypeError.
    fa_call = (
        'fit <- fa(r=x,nfactors=' + str(nfact) + ',n.iter=1, min.err = 0.001,  max.iter = 50, '
        'rotate="' + rotation + '", scores="regression", '
        'residuals=TRUE, SMC=TRUE, missing=FALSE,impute="median", '
        'warnings=TRUE, fm="' + method + '", '
        'alpha=.1,p=.05,oblique.scores=FALSE,use="pairwise", '
        'covar= ' + ("F" if cor else "T") + ', cor = "' + ("cor" if cor else "cov") + '")'
    )
    ro.r(fa_call)

    print("\nPrinting a summary of the fitting matrix with method " + method + " and rotation " + rotation + "...\n")
    ro.r('summary(fit)')  # print variance accounted for
    print("\n")

    print("\nPrinting the fitting matrix with method " + method + " and rotation " + rotation + "...\n")
    ro.r('print(fit)')
    print("\n")

    print("\nPrinting the loadings matrix with method " + method + " and " + rotation + " rotation...\n")
    ro.r('print(loadings(fit), digits=2, cutoff=0.4)')
    print("\n")

    loadings = np.asarray(ro.r('fit$loadings'))
    factor_names = np.asarray(ro.r('colnames(fit$loadings)'))
    services_names = np.asarray(ro.r('rownames(fit$loadings)'))

    traffic_matrix = np.asarray(ro.r('x'))
    traffic_snapshots = np.asarray(ro.r('rownames(x)'))
    traffic_services = np.asarray(ro.r('colnames(x)'))

    # One pass without a threshold (helper defaults apply), then one pass per
    # explicit threshold. Keys come out as "signatures", "norm_signatures",
    # "signatures_th_0.4", "norm_signatures_th_0.4", ...
    variants = [("", {})]
    variants.extend(("_th_" + str(th), {"threshold": th}) for th in (0.4, 0.6, 0.8))

    all_dict = {}
    for key_suffix, threshold_kwargs in variants:
        signatures, signatures_df, _ = utilities.generate_signatures(
            traffic_matrix, traffic_snapshots, traffic_services, loadings,
            factor_names, label, output_folder, run_tag, **threshold_kwargs)
        all_dict["signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label, output_folder,
            signatures_subfolder, period_min, **threshold_kwargs)
        all_dict["norm_signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label, output_folder,
            signatures_subfolder, period_min, normalized_y=True, **threshold_kwargs)

    # Analyzing scores
    scores = np.asarray(ro.r('fit$scores'))
    population_names = np.asarray(ro.r('rownames(fit$scores)'))
    score_factor_names = np.asarray(ro.r('colnames(fit$scores)'))
    scores_df = analyze_scores(scores, population_names, score_factor_names, run_tag)

    print("Plotting scores (" + method + " and rotation " + rotation + ")...\n")
    plot_factor_scores(factor_names, scores_df, run_tag, False)
    plot_factor_scores(factor_names, scores_df, run_tag, True)
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    all_dict["scores"] = utilities.plot_time_signatures(
        scores_df[factor_names].values, scores_df["snapshot_name"], factor_names,
        label, output_folder, run_tag + "/scores/", period_min)

    plot_data_by_factor(all_dict, factor_names, label, output_folder, run_tag + "/summary_plots", period_min)

    # Analyzing loadings
    print("Saving loadings to file (" + method + " and rotation " + rotation + ")...\n")
    loadings_df = analyze_loadings(loadings, factor_names, services_names, run_tag)

    print("Plotting loadings (" + method + " and rotation " + rotation + ")...\n")
    plot_factor_loadings(factor_names, loadings_df, run_tag, False)
    plot_factor_loadings(factor_names, loadings_df, run_tag, True)

    # Analyzing uniqueness
    print("\nPrinting the uniqueness matrix with method " + method + " and " + rotation + " rotation...\n")
    ro.r('print(fit$uniquenesses)')
    print("\n")

    uniquenesses = np.asarray(ro.r('fit$uniquenesses'))
    uniquenesses_names = np.asarray(ro.r('names(fit$uniquenesses)'))

    print("Saving uniquenesses to file (" + method + " and rotation " + rotation + ")...\n")
    uniquenesses_df = analyze_uniquenesses(uniquenesses, uniquenesses_names, run_tag)

    print("Plotting uniquenesses (" + method + " and rotation " + rotation + ")...\n")
    plot_factor_loadings(["uniquenesses"], uniquenesses_df, run_tag, False, True)

    return loadings_df, scores_df, uniquenesses_df
示例#2
0
def nnmf(nfact, method="scd", loss="mse", period_min=60):
    """Run a non-negative matrix factorisation in R and export its artefacts.

    Runs ``fit <- nnmf(x, nfact, ...)`` on the R global ``x`` through
    ``ro.r`` (presumably rpy2's ``robjects`` -- confirm against the file
    imports), prints the fit, writes a convergence plot and two heatmaps as
    JPEG files, then derives signatures, scores and loadings through the
    project helpers. ``t(fit$H)`` is used as the loadings matrix and
    ``fit$W`` as the scores matrix.

    Besides its parameters, this function reads the module-level names
    ``label`` and ``output_folder`` and leaves the R global ``fit`` set.

    Args:
        nfact: Factorisation rank, inserted as the second argument of the R
            ``nnmf`` call. Integers and numeric strings are both accepted.
        method: Value passed to ``nnmf`` as ``method=``; also the prefix of
            the generated factor names (``<method>0``, ``<method>1``, ...).
        loss: Value passed to ``nnmf`` as ``loss=``; also the name of the
            ``fit`` field plotted against the cumulative epochs.
        period_min: Forwarded unchanged to the plotting helpers
            (presumably the snapshot period in minutes -- TODO confirm).

    Returns:
        Tuple ``(loadings_df, scores_df)`` as returned by
        ``analyze_loadings`` and ``analyze_scores``.
    """
    run_tag = method + "_" + loss
    signatures_subfolder = run_tag + "/signatures/"
    image_prefix = output_folder + "/" + run_tag
    with_settings = " with method " + method + " and loss " + loss + "...\n"

    print("\n\n\n***Performing non-negative-matrix factorization with " + method + " method and " + loss + " loss function...\n")
    # str(nfact) lets callers pass an int; plain concatenation raised TypeError.
    ro.r('fit<-nnmf(x, ' + str(nfact) + ', max.iter = 100000, rel.tol = -1, method = "' + method
         + '",inner.max.iter = 1, loss="' + loss + '", n.threads=20);')

    print("\nPrinting a summary of the fitting matrix" + with_settings)
    ro.r('summary(fit)')  # print variance accounted for
    print("\n")

    print("\nPrinting the fitting matrix" + with_settings)
    ro.r('print(fit)')
    print("\n")

    print("\nPrinting the score matrix" + with_settings)
    ro.r('print(fit$W, digits=2, cutoff=0.4)')
    print("\n")

    print("\nPrinting the factor matrix" + with_settings)
    ro.r('print(t(fit$H), digits=2, cutoff=0.4)')
    print("\n")

    print("\nPlotting convergence to epoch" + with_settings)
    ro.r('jpeg(file = "' + image_prefix + '_convergence.jpeg")')
    # Axis limits are fixed; curves outside these ranges will not be visible.
    ro.r('plot(NULL, xlim = c(1, 3000), ylim = c(0.15, 0.45), xlab = "Epochs", ylab = "' + loss + '");')
    ro.r('lines(cumsum(fit$average.epochs), fit$' + loss + ');')
    ro.r('dev.off()')
    print("\n")

    print("\nPrinting the loss and the cumsum" + with_settings)
    ro.r('print(fit$loss, digits=2, cutoff=0.4)')
    ro.r('print(cumsum(fit$average.epochs), digits=2, cutoff=0.4)')
    print("\n")

    print("\nPlotting the heatmaps" + with_settings)
    ro.r('jpeg(file = "' + image_prefix + '_heatmap_W.jpeg")')
    ro.r('heatmap(fit$W, Colv = NA, xlab = "samples", ylab = "factors", margins = c(2,2),'
         'labRow = "", labCol = "", scale = "column", col = cm.colors(100));')
    ro.r('dev.off()')
    ro.r('jpeg(file = "' + image_prefix + '_heatmap_H.jpeg")')
    # The labels must be R empty strings written with double quotes. The
    # previous single-quoted '' ended the Python literal, so R received
    # `labRow = , labCol = ,` instead of empty labels.
    ro.r('heatmap(fit$H, Rowv = NA, ylab = "variables", xlab = "factors", margins = c(2,2),'
         'labRow = "", labCol = "", scale = "row", col = cm.colors(100));')
    ro.r('dev.off()')
    print("\n")

    loadings = np.asarray(ro.r('t(fit$H)'))
    print("Loadings are: " + str(loadings))
    generated_names = [method + str(i) for i in range(loadings.shape[1])]
    factor_names = np.asarray(generated_names)
    print("Factor names are: " + str(factor_names))
    services_names = np.asarray(ro.r('colnames(fit$H)'))
    print("Service names are: " + str(services_names))

    traffic_matrix = np.asarray(ro.r('x'))
    traffic_snapshots = np.asarray(ro.r('rownames(x)'))
    traffic_services = np.asarray(ro.r('colnames(x)'))

    # One pass without a threshold (helper defaults apply), then one pass per
    # explicit threshold with max_factor_value=True. Keys come out as
    # "signatures", "norm_signatures", "signatures_th_0.4", ...
    variants = [("", {})]
    variants.extend(
        ("_th_" + str(th), {"threshold": th, "max_factor_value": True})
        for th in (0.4, 0.6, 0.8)
    )

    all_dict = {}
    for key_suffix, threshold_kwargs in variants:
        signatures, signatures_df, _ = utilities.generate_signatures(
            traffic_matrix, traffic_snapshots, traffic_services, loadings,
            factor_names, label, output_folder, run_tag, **threshold_kwargs)
        all_dict["signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label, output_folder,
            signatures_subfolder, period_min, **threshold_kwargs)
        all_dict["norm_signatures" + key_suffix] = utilities.plot_time_signatures(
            signatures, signatures_df["index"], factor_names, label, output_folder,
            signatures_subfolder, period_min, normalized_y=True, **threshold_kwargs)

    # Analyzing scores
    scores = np.asarray(ro.r('fit$W'))
    print("Scores are: " + str(scores))
    population_names = np.asarray(ro.r('rownames(fit$W)'))
    print("Population names are: " + str(population_names))
    score_factor_names = np.asarray(generated_names)
    print("Score factor names are: " + str(score_factor_names))
    scores_df = analyze_scores(scores, population_names, score_factor_names, run_tag)

    print("Plotting scores (" + method + " and loss " + loss + ")...\n")
    plot_factor_scores(factor_names, scores_df, run_tag, False)
    plot_factor_scores(factor_names, scores_df, run_tag, True)
    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    all_dict["scores"] = utilities.plot_time_signatures(
        scores_df[factor_names].values, scores_df["snapshot_name"], factor_names,
        label, output_folder, run_tag + "/scores/", period_min)

    plot_data_by_factor(all_dict, factor_names, label, output_folder, run_tag + "/summary_plots", period_min)

    # Analyzing loadings
    print("Saving loadings to file (" + method + " and loss " + loss + ")...\n")
    loadings_df = analyze_loadings(loadings, factor_names, services_names, run_tag)

    return loadings_df, scores_df
示例#3
0
def _select_category_rows(names, category):
    """Return (row indices, "<field0>,<field1>" names) of entries tagged with *category*.

    An entry matches when it contains ``,<category>,``; the returned name
    keeps only its first two comma-separated fields
    (e.g. ``2016-09-05.Monday,00:00:00,NI,`` -> ``2016-09-05.Monday,00:00:00``).
    Both lists are empty when nothing matches.
    """
    marker = ',' + category + ','
    indices = []
    short_names = []
    for position, name in enumerate(names):
        if marker in name:
            indices.append(position)
            # A match guarantees at least two commas, hence two leading fields.
            short_names.append(",".join(name.split(",")[:2]))
    return indices, short_names


def efa(snapshot_filename, nfact, output_folder, rotation="none", method="ml", cor=True, period_min=60):
    """Fit an exploratory factor analysis in R and export per-category artefacts.

    Runs ``fit <- fa(...)`` on the R global ``x`` through ``ro.r``
    (presumably rpy2's ``robjects`` -- confirm against the file imports).
    For every entry of the module-level ``CATEGORIES`` it selects the rows of
    ``x`` and of ``fit$scores`` whose row name contains ``,<category>,``,
    then generates and plots signatures and scores into
    ``<output_folder>/<method>_<rotation>/<category>/``. Finally it writes
    ``all_loadings``, ``all_scores`` and ``all_uniquenesses`` CSV files into
    ``<output_folder>/<method>_<rotation>/``.

    Also reads the module-level name ``label`` and leaves the R global
    ``fit`` set.

    Args:
        snapshot_filename: Not used by this function; kept so existing
            callers keep working.
        nfact: Number of factors, inserted as ``nfactors=`` in the R call.
            Integers and numeric strings are both accepted.
        output_folder: Root folder under which all outputs are written.
        rotation: Value passed to ``fa`` as ``rotate=``.
        method: Value passed to ``fa`` as ``fm=``.
        cor: When true the R call uses ``covar=F, cor="cor"``; otherwise
            ``covar=T, cor="cov"``.
        period_min: Forwarded unchanged to the plotting helpers
            (presumably the snapshot period in minutes -- TODO confirm).

    Returns:
        Tuple ``(loadings_df, full_scores_df, uniquenesses_df)``;
        ``full_scores_df`` stacks the per-category score frames and adds a
        ``category`` column.
    """
    run_tag = method + "_" + rotation
    run_folder = output_folder + "/" + run_tag

    print("\n\n\n***Performing factorial analysis with " + method + " fitting method, " + rotation + " rotational method...\n")
    # str(nfact) lets callers pass an int; plain concatenation raised TypeError.
    fa_call = (
        'fit <- fa(r=x,nfactors=' + str(nfact) + ',n.iter=1, min.err = 0.001,  max.iter = 50, '
        'rotate="' + rotation + '", scores="regression", '
        'residuals=TRUE, SMC=TRUE, missing=FALSE,impute="median", '
        'warnings=TRUE, fm="' + method + '", '
        'alpha=.1,p=.05,oblique.scores=FALSE,use="pairwise", '
        'covar= ' + ("F" if cor else "T") + ', cor = "' + ("cor" if cor else "cov") + '")'
    )
    ro.r(fa_call)

    print("\nPrinting a summary of the fitting matrix with method " + method + " and rotation " + rotation + "...\n")
    ro.r('summary(fit)')  # print variance accounted for
    print("\n")

    print("\nPrinting the fitting matrix with method " + method + " and rotation " + rotation + "...\n")
    ro.r('print(fit)')
    print("\n")

    print("\nPrinting the loadings matrix with method " + method + " and " + rotation + " rotation...\n")
    ro.r('print(loadings(fit), digits=2, cutoff=0.4)')
    print("\n")

    loadings = np.asarray(ro.r('fit$loadings'))
    factor_names = np.asarray(ro.r('colnames(fit$loadings)'))
    cell_ids = np.asarray(ro.r('rownames(fit$loadings)'))

    traffic_matrix = np.asarray(ro.r('x'))
    traffic_snapshots = np.asarray(ro.r('rownames(x)'))
    traffic_base_stations = np.asarray(ro.r('colnames(x)'))

    # The scores do not depend on the category, so fetch them from R once
    # instead of on every loop iteration.
    scores = np.asarray(ro.r('fit$scores'))
    population_names = np.asarray(ro.r('rownames(fit$scores)'))
    score_factor_names = np.asarray(ro.r('colnames(fit$scores)'))

    full_scores_df = pd.DataFrame()
    last_category = None
    for c in CATEGORIES:
        last_category = c
        category_subfolder = run_tag + "/" + c + "/"
        category_folder = os.path.dirname(output_folder + "/" + category_subfolder)
        if not os.path.exists(category_folder):
            os.makedirs(category_folder)

        indices, c_traffic_snapshots = _select_category_rows(traffic_snapshots, c)
        if not indices:
            print("*** PROBLEM HERE: no element for category " + c)
            continue
        c_traffic_matrix = traffic_matrix[indices]

        # One pass without a threshold (helper defaults apply), then one pass
        # per explicit threshold; each pass plots raw and normalised series.
        for threshold_kwargs in ({}, {"threshold": 0.4}, {"threshold": 0.6}, {"threshold": 0.8}):
            signatures, signatures_df, _ = utilities.generate_signatures(
                c_traffic_matrix, c_traffic_snapshots, traffic_base_stations, loadings,
                factor_names, label, output_folder, category_subfolder, **threshold_kwargs)
            utilities.plot_time_signatures(
                signatures, signatures_df["index"], factor_names, label, output_folder,
                category_subfolder, period_min, **threshold_kwargs)
            utilities.plot_time_signatures(
                signatures, signatures_df["index"], factor_names, label, output_folder,
                category_subfolder, period_min, normalized_y=True, **threshold_kwargs)

        # Analyzing scores
        score_indices, c_population_names = _select_category_rows(population_names, c)
        if not score_indices:
            print("*** PROBLEM HERE: no element for category " + c)
            continue
        c_scores = scores[score_indices]

        scores_df = analyze_scores(c_scores, c_population_names, score_factor_names, output_folder, category_subfolder)
        scores_df["category"] = c
        full_scores_df = full_scores_df.append(scores_df, ignore_index=True)

        print("Plotting scores (" + method + " and rotation " + rotation + ")...\n")
        plot_factor_scores(factor_names, scores_df, output_folder, category_subfolder, False)
        plot_factor_scores(factor_names, scores_df, output_folder, category_subfolder, True)

    # NOTE(review): loadings and uniquenesses are model-wide, yet the helpers
    # below have always been given the sub-folder of the LAST category because
    # the loop variable was reused after the loop. That location is kept so
    # existing outputs do not move; the run folder is only a fallback for an
    # empty CATEGORIES, which used to raise NameError. Confirm whether the run
    # folder should be used in every case.
    if last_category is None:
        shared_subfolder = run_tag + "/"
    else:
        shared_subfolder = run_tag + "/" + last_category + "/"
    if not os.path.exists(run_folder):
        os.makedirs(run_folder)

    # Analyzing loadings
    print("Saving loadings to file (" + method + " and rotation " + rotation + ")...\n")
    loadings_df = analyze_loadings(loadings, factor_names, cell_ids, output_folder, shared_subfolder)

    # Analyzing uniqueness
    print("\nPrinting the uniqueness matrix with method " + method + " and " + rotation + " rotation...\n")
    ro.r('print(fit$uniquenesses)')
    print("\n")

    uniquenesses = np.asarray(ro.r('fit$uniquenesses'))
    uniquenesses_names = np.asarray(ro.r('names(fit$uniquenesses)'))

    print("Saving uniquenesses to file (" + method + " and rotation " + rotation + ")...\n")
    uniquenesses_df = analyze_uniquenesses(uniquenesses, uniquenesses_names, output_folder, shared_subfolder)

    # Run-level exports: <run_folder>/all_<kind>[_<label>].csv
    file_suffix = ("_" + label if label is not None else "") + ".csv"
    exports = (
        ("all_loadings", loadings_df),
        ("all_scores", full_scores_df),
        ("all_uniquenesses", uniquenesses_df),
    )
    for stem, frame in exports:
        output_filename = run_folder + "/" + stem + file_suffix
        frame.to_csv(output_filename, header=True, sep=",", index=False)
    return loadings_df, full_scores_df, uniquenesses_df