def R_Factor_Analysis( comm_str, csv_data, csv_colvars, csv_coltypes, fpref, test_arr, # -> can be NULL for interior calc Nfac, # -> can be 0 for interior calc Ntopload, # -> can be 0 for interior calc flab, DO_GRAPH, N_cent = 99, # 'centile' N_iter = 5000, # 'iterations' ftype = 'jpeg'): '''Perform factor analysis using R function factanal(). User can specify the number of latent factors using the paran() function, which implements Horn's test. Returns: Factor scores and loadings''' # R libraries used here. paran = importr('paran') # some stuff about format types if PARN_OUT_types.__contains__(ftype): ii = PARN_OUT_types.index(ftype) OUT_dev = PARN_OUT_devs[ii] OUT_paran = fpref+'.'+ftype else: print "** Error! ", print "Output file type '%s' is not valid. Select from:" % (ftype) print "\t", for x in PARN_OUT_types: print " '"+x+"' ", print "\n" sys.exit(32) fff = open(fpref+'.log','w') if comm_str: fff.write('# '+comm_str+"\n") # SETUP THE VARIABLE VALUES Lx,Ly = np.shape(csv_data) # if user hasn't entered a selection, then use 'em all. 
if not(test_arr): test_arr = list(csv_colvars) # Get rid of variable columns with 'NA' test_arr = Cut_ColVars_with_NAs(csv_data, csv_colvars, test_arr) # check for duplicate columns, which lead to bad singularities test_arr = CheckForDuplicates( test_arr ) # if user hasn't entered a label, then use: if not(flab): flab = 'FACTOR' # only select variables that are represented in the csv_data headings, # as well as being either int or float VARS_inds = [] VARS_names = [] for x in test_arr: if csv_colvars.__contains__(x): ii = csv_colvars.index(x) if [int, float].__contains__(csv_coltypes[ii]): VARS_inds.append(ii) VARS_names.append(x) Nvars = len(VARS_names) Y = np.zeros((Lx,Nvars), dtype=float) print "++ Factor analysis contains %s variables:" % (Nvars) fff.write("\n++ Factor analysis contains %s variables:\n" % (Nvars)) for j in range(Nvars): jj = VARS_inds[j] print "\t %s" % (VARS_names[j]) fff.write("\t %s\n" % (VARS_names[j])) for i in range(Lx): Y[i,j] = csv_data[i][jj] i = CorMatCheck(Y, VARS_names) # SETUP THE NUMBER OF FACTORS # use eval info to pick number of vars, if user hasn't if not(Nfac): print "++ Graphing of parallel analysis (PA) Horn's test is:", if DO_GRAPH: print "ON." else: print "OFF." print "++ PA percentile in Horn's test is: ", N_cent print "++ Number of PA Monte Carlo iterations: ", N_iter # mostly default values, some user control PARN = r.paran( Y, iterations=N_iter, centile=N_cent, quietly=False, status=True, all=True, cfa=True, graph=DO_GRAPH, color=True, col=r.c("black","red","blue"), lty=r.c(1,2,3), lwd=1, legend=True, file=OUT_paran, width=640, height=640, grdevice=OUT_dev, seed=0) if DO_GRAPH: grDevices.dev_off() print "++ Don't worry about the briefly passing image." print "\tIt has been saved as: %s\n\n" % ( OUT_paran ) N_PARN_arr = np.array(PARN.rx2('Retained')) Nfac = int(N_PARN_arr[0]) else: if Nfac > Nvars: print "*+ Warning! The user has selected a number of factors larger" print "\tthan the number of variables (%d > %d)!" 
% (Nfac, Nvars) print "\t-> Therefore, we're setting it to be %d," % (Nvars) print "\t but you might still want to check if anything went awry?" else: print "++ The user has selected the number of factors" print "\tto be %d out of %d." % (Nfac, Nvars) # RUN THE FACTOR ANALYSIS IN R FA_out = r.factanal(Y, factors=Nfac, scores='regression', rotation="varimax") FA_scores =np.array(FA_out.rx2('scores')) FA_loadings =np.array(FA_out.rx2('loadings')) # match up highest loadings with the variable names, so we have an # idea of what's going into the sausage # how many loadings to output. # Can be: ALL, 5, or user-entered other if not(Ntopload): Ntopload = min(Nvars, 5) elif Ntopload<0 : Ntopload = Nvars else: Ntopload = min(Nvars, Ntopload) if Ntopload==Nvars: strNtopload = "ALL "+str(Nvars) else: strNtopload = 'top '+str(Ntopload)+'/'+str(Nvars) # ordering process FA_titles = [] print "\n++ Factor loading contributions (%s):" % (strNtopload) fff.write("\n++ Factor loading contributions (%s):\n" % (strNtopload)) for i in range(Nfac): P = list(FA_loadings[:,i]) Q = list(VARS_names) PQ = sorted(zip(P,Q),reverse=1) str_title = "%s_%02d" % (flab, i+1) FA_titles.append(str_title) print "\n\t"+str_title fff.write("\n\t"+str_title+"\n") for j in range(Ntopload): print "\t%20s %12.5f" % (PQ[j][1],PQ[j][0]) fff.write("\t%20s %12.5f\n" % (PQ[j][1],PQ[j][0])) fff.close() return FA_scores, FA_titles, VARS_names
############################# # Let's get going! filthresh=0.6 filtper=94 nComponents=15 numSkills=6 # Number of Skills to display permitted=allDat.columns for count1 in range(10): ad2=allDat[permitted] ad3=np.array(ad2) #added by Ioannis # Factor Analysis #lff=pd.DataFrame(np.array(r.factanal(ad2,nComponents)[1])).T lff=pd.DataFrame(np.array(r.factanal(ad3,nComponents,scores='regression', rotation = "varimax")[1])).T fit = r.factanal(ad3,nComponents, scores='regression', rotation = "varimax") corr = fit[3] scores= np.array(fit.rx2('scores')) if True: # If want compatibility with ICIS paper set to False filthresh=np.percentile(lff,filtper) # Compiling list of "permitted" elements permitted=[] for count2 in range(len(lff[0])): foo=ad2.columns[lff.iloc[count2,:].apply(abs)>filthresh] if len(foo)>2: permitted.extend(foo) permitted=np.unique(permitted) print "#Permitted: "+str(len(permitted))
def fa(source=False, use_filter="default", data_file="latest", participant_subset="", drop_metadata=True, drop=[], clean=7, factors=5, facecolor="#ffffff"):
    '''Load survey data per the config file, run an R factanal() with
    promax rotation, and draw the question-loadings matrix as a heatmap
    with an attached colorbar.

    Parameters (as used in the visible body):
      source             -- config key for the data address; False means
                            read it from the config's 'Source' section.
      use_filter         -- key into the config's "Filters" section.
      data_file          -- basename of the .csv data file to load.
      participant_subset -- ''|'odd'|'even'|'male'|'female' row filter.
      drop_metadata      -- if True, drop columns listed in the filter
                            file's "metadata" column.
      drop               -- filter-file column names whose entries list
                            further data columns to drop.
                            NOTE(review): mutable default argument; safe
                            only while callers never mutate it -- consider
                            drop=None with an internal default.
      clean              -- respondents with <= this many distinct answer
                            values are removed (extreme-answer cleaning).
      factors            -- number of latent factors for factanal().
      facecolor          -- matplotlib figure background color.
    '''
    #gets config file:
    config = get_config_file(localpath=path.dirname(path.realpath(__file__))+'/')

    #IMPORT VARIABLES
    if not source:
        source = config.get('Source', 'source')
    data_path = config.get('Addresses', source)
    filter_dir = config.get('Paths', "filter_dir")
    filter_name = config.get("Filters", use_filter)
    #END IMPORT VARIABLES

    filter_path = path.dirname(path.realpath(__file__)) + '/' + filter_dir + filter_name + '.csv'
    # NOTE(review): DataFrame.from_csv and .ix (below) are deprecated
    # pandas APIs; fine for the pandas version this was written against.
    filters = DataFrame.from_csv(filter_path, header=None).transpose() # transpose filters because of .csv file formatting
    all_data = DataFrame.from_csv(data_path + data_file + ".csv")
    all_data = all_data.reset_index(level=0)

    #~ print filters["metadata"]

    #clean data of respondents who only ckeck extreme answers:
    # (Python 2: map() returns a list, used here as a boolean row mask)
    all_data = all_data[map(lambda y: len(set(y)) > clean,np.array(all_data))]

    if drop_metadata == True:
        # drops metadata
        all_data = all_data.drop(filters["metadata"][Series.notnull(filters["metadata"])], axis=1)

    drop_list = []
    for drop_item in drop:
        # compile list of column names to be dropped:
        drop_list += list(filters[drop_item][Series.notnull(filters[drop_item])])
    #get unique column names (the list may contain duplicates if overlaying multiple filters):
    drop_list = list(set(drop_list))
    all_data = all_data.drop(drop_list, axis=1)

    if participant_subset == "odd":
        # selects only odd indexes (keep the other dataset half for validation)
        keep_rows = all_data.index.values[1::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "even":
        # selects only even indexes (keep the other dataset half for validation)
        keep_rows = all_data.index.values[0::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "male":
        # selects only male participants
        filtered_data = all_data[all_data['My legal gender:'] == 'Male']
    elif participant_subset == "female":
        # selects only female participants
        filtered_data = all_data[all_data['My legal gender:'] == 'Female']
    else:
        filtered_data = all_data

    #convert to correct type for analysis:
    filtered_data_array = np.array(filtered_data, dtype='float64')
    # rescale to 0-1 -- presumably answers are percentages 0-100; verify
    filtered_data_array = filtered_data_array / 100

    # Factor analysis in R; loadings transposed to factors-by-questions.
    fit = r.factanal(filtered_data_array, factors, rotation='promax')
    load = r.loadings(fit)
    load = numpy2ri.ri2numpy(load)
    load = r.t(load)

    # remappedColorMap is a helper defined elsewhere -- from the args it
    # appears to recenter the diverging PiYG map around zero loading;
    # TODO(review): confirm against its definition.
    remapped_cmap = remappedColorMap(cm.PiYG, start=(np.max(load)-abs(np.min(load)))/(2*np.max(load)), midpoint=abs(np.min(load))/(np.max(load)+abs(np.min(load))), name='shrunk')
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17.5, 5), facecolor=facecolor)
    graphic = ax.imshow(load, cmap = remapped_cmap, interpolation='none')
    # one tick per question / per factor
    ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(base=1.0))
    ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(base=1.0))
    ax.set_xticklabels([0]+filtered_data.columns.tolist(),fontsize=8,rotation=90)
    ax.set_yticklabels(np.arange(factors+1))
    ax.set_ylabel('Factors')
    ax.set_title("Question Loadings on Factors")

    #Recolor plot spines:
    for spine_side in ["bottom", "top", "left", "right"]:
        ax.spines[spine_side].set_color("#777777")

    #Remove ticks:
    plt.tick_params(axis='both', which='both', left="off", right="off", bottom='off', top='off')

    divider = make_axes_locatable(ax)
    #calculate width for cbar so that it is equal to the question column width:
    cbar_width = str(100/np.shape(load)[1])+ "%"
    cax = divider.append_axes("right", size=cbar_width, pad=0.05)
    cbar = colorbar(graphic, cax=cax, drawedges=True)
    #Limit the number of ticks:
    tick_locator = ticker.MaxNLocator(nbins=6)
    cbar.locator = tick_locator
    cbar.update_ticks()
    #Align ticklabels so that negative values are not misaligned (meaning right align):
    for t in cbar.ax.get_yticklabels():
        t.set_horizontalalignment('right')
        t.set_x(0.045*(np.shape(load)[1]+6))
    #Tweak color bar borders
    cbar.outline.set_color("#666666")
    cbar.dividers.set_linewidth(0)