def run_on_network_attr(edgelist_filename, param_func_list, labels,
                        outcome_bin_filename,
                        binattr_filename=None,
                        contattr_filename=None,
                        catattr_filename=None,
                        sampler_func = basicALAAMsampler,
                        zone_filename = None,
                        directed = False,
                        max_runs = 20):
    """Run on specified network with binary and/or continuous
    and categorical attributes.

    Estimates the ALAAM parameters by repeated runs of stochastic
    approximation (each run starting from the theta of the previous one)
    until every t-ratio is below 0.1 in magnitude or max_runs runs have
    been done. If estimation converged, prints the parameter table and
    then runs a goodness-of-fit test.

    Parameters:
         edgelist_filename - filename of Pajek format edgelist
         param_func_list   - list of change statistic functions corresponding
                             to parameters to estimate
         labels            - list of strings corresponding to param_func_list
                             to label output (header line)
         outcome_bin_filename - filename of binary attribute (node per line)
                                of outcome variable for ALAAM
         binattr_filename  - filename of binary attributes (node per line)
                             Default None, in which case no binary attr.
         contattr_filename - filename of continuous attributes (node per line)
                             Default None, in which case no continuous attr.
         catattr_filename  - filename of categorical attributes (node per line)
                             Default None, in which case no categorical attr.
         sampler_func      - ALAAM sampler function with signature
                             (G, A, changestats_func_list, theta, performMove,
                              sampler_m); see basicALAAMsampler.py
                             default basicALAAMsampler
         zone_filename     - filename of snowball sampling zone file
                             (header line 'zone' then zone number for nodes,
                             one per line)
                             Default None, in which case no snowball zones.
                             If not None then the sampler_func should take
                             account of snowball sample zones i.e.
                             conditionalALAAMsampler()
         directed          - Default False.
                             True for directed network else undirected.
         max_runs          - maximum number of stochastic approximation
                             runs to try before giving up. Default 20.

    Write output to stdout.
    """
    assert len(param_func_list) == len(labels)

    if directed:
        G = Digraph(edgelist_filename, binattr_filename, contattr_filename,
                    catattr_filename, zone_filename)
    else:
        G = Graph(edgelist_filename, binattr_filename, contattr_filename,
                  catattr_filename, zone_filename)
    G.printSummary()

    # The first whitespace-separated token is skipped (presumably the
    # header line of the attribute file); the rest is one value per node.
    # Read under 'with' so the file handle is always closed.
    with open(outcome_bin_filename) as outcome_file:
        A = [int_or_na(token) for token in outcome_file.read().split()[1:]]
    assert len(A) == G.numNodes()

    assert all(x in (0, 1, NA_VALUE) for x in A)
    print('positive outcome attribute = ', (float(A.count(1))/len(A))*100.0, '%')
    if NA_VALUE in A:
        print('Warning: outcome variable has', A.count(NA_VALUE), 'NA values')

    # Calculate observed statistics by summing change stats for each 1 variable
    Zobs = computeObservedStatistics(G, A, param_func_list)
    print('Zobs = ', Zobs)

    theta = np.zeros(len(param_func_list))
    std_error = None
    t_ratio = None

    estimation_start = time.time()
    i = 0
    converged = False
    while i < max_runs and not converged:
        i += 1
        print('Running stochastic approximation (run', i,' of at most',max_runs,')...')
        start = time.time()
        (theta, std_error, t_ratio) = stochasticApproximation(
            G, A, param_func_list, theta, Zobs, sampler_func)
        print('Stochastic approximation took',time.time() - start, 's')
        if theta is None:
            # Estimation failed; there is no theta to restart from.
            print('Failed.')
            break
        print(' ',labels)
        print('theta =', theta)
        print('std_error =', std_error)
        print('t_ratio =', t_ratio)
        converged = np.all(np.abs(t_ratio) < 0.1)

    print('Total estimation time (',i,'runs) was',time.time() - estimation_start, 's')
    if not converged:
        # No usable estimate (theta may be None), so nothing to report
        # and no goodness-of-fit test can be run.
        return

    print('Converged.')
    # Flag parameters whose magnitude exceeds two standard errors.
    significant = np.abs(theta) > 2 * std_error
    sys.stdout.write(20*' ' + ' Parameter Std.Error t-ratio\n')
    for j in range(len(theta)):
        sys.stdout.write('%20.20s % 6.3f % 6.3f % 6.3f %c\n' %
                         (labels[j], theta[j], std_error[j], t_ratio[j],
                          ('*' if significant[j] else ' ')))
    print()

    # Do goodness-of-fit test

    # change stats functions to add to GoF if not already in estimation
    gof_param_func_list = list(param_func_list)
    goflabels = list(labels)
    if not directed:
        # TODO GoF statistics for directed (for directed networks only the
        # estimated parameters are used in the GoF test)
        statfuncs = [changeTwoStar, changeThreeStar,
                     changePartnerActivityTwoPath,
                     changeTriangleT1, changeContagion,
                     changeIndirectPartnerAttribute,
                     changePartnerAttributeActivity,
                     changePartnerPartnerAttribute,
                     changeTriangleT2,
                     changeTriangleT3]
        statlabels = ['Two-Star', 'Three-Star', 'Alter-2Star1A', 'T1',
                      'Contagion', 'Alter-2Star2A', 'Partner-Activity',
                      'Partner-Resource','T2', 'T3']
        # Add each extra statistic together with its own label, deciding
        # by function identity only. Filtering the labels separately by
        # label text could misalign labels and functions (or fail the
        # length check below) when the caller labelled one of these
        # statistics differently.
        for statfunc, statlabel in zip(statfuncs, statlabels):
            if statfunc not in param_func_list:
                gof_param_func_list.append(statfunc)
                goflabels.append(statlabel)
    n = len(gof_param_func_list)
    assert len(goflabels) == n
    # pad theta vector with zeros for the added parameters
    gof_theta = np.array(list(theta) + (n-len(theta))*[0])

    Ainitial = None # default: use random initialization
    if zone_filename is not None: # conditional estimation
        # For snowball conditional estimation, we must not start with
        # random initial outcome vector, but rather make sure the
        # nodes in the outermost zone have the same outcome attributes
        # as the observed vector
        Ainitial = np.copy(A) # copy of observed vector
        # make vector of 50% ones, size of number of inner nodes
        Arandom_inner = rand_bin_array(int(0.5*len(G.inner_nodes)),
                                       len(G.inner_nodes))
        # set the outcome for inner nodes to random values, leaving
        # value of outermost nodes at the original observed values
        Ainitial[G.inner_nodes] = Arandom_inner

    print('Running goodness-of-fit test...')
    start = time.time()
    gofresult = gof(G, A, gof_param_func_list, gof_theta,
                    sampler_func = sampler_func, Ainitial = Ainitial)
    print('GoF took',time.time() - start, 's')
    print(' ',goflabels)
    print('t_ratios = ',gofresult)
    sys.stdout.write(20*' ' + ' t-ratio\n')
    for j in range(n):
        sys.stdout.write('%20.20s % 6.3f\n' % (goflabels[j], gofresult[j]))
    print()
def run_on_network_attr(edgelist_filename, param_func_list, labels,
                        outcome_bin_filename,
                        binattr_filename=None,
                        contattr_filename=None,
                        catattr_filename=None,
                        EEiterations = 50000,
                        run = None,
                        learningRate = 0.01,
                        sampler_func = basicALAAMsampler,
                        zone_filename= None,
                        directed = False):
    """Run on specified network with binary and/or continuous
    and categorical attributes.

    Runs Algorithm S to get initial parameter values and then Algorithm EE
    starting from them, writing the theta and dzA values to the output
    files described below as it goes.

    Parameters:
         edgelist_filename - filename of Pajek format edgelist
         param_func_list   - list of change statistic functions corresponding
                             to parameters to estimate
         labels            - list of strings corresponding to param_func_list
                             to label output (header line)
         outcome_bin_filename - filename of binary attribute (node per line)
                                of outcome variable for ALAAM
         binattr_filename  - filename of binary attributes (node per line)
                             Default None, in which case no binary attr.
         contattr_filename - filename of continuous attributes (node per line)
                             Default None, in which case no continuous attr.
         catattr_filename  - filename of categorical attributes (node per line)
                             Default None, in which case no categorical attr.
         EEiterations      - Number of iterations of the EE algorithm.
                             Default 50000.
         run               - run number for parallel runs, used as suffix on
                             output filenames. Default None
                             in which case no suffix added to output files.
         learningRate      - learning rate (step size multiplier, a)
                             default 0.01
         sampler_func      - ALAAM sampler function with signature
                             (G, A, changestats_func_list, theta, performMove,
                              sampler_m); see basicALAAMsampler.py
                             default basicALAAMsampler
         zone_filename     - filename of snowball sampling zone file
                             (header line 'zone' then zone number for nodes,
                             one per line)
                             Default None, in which case no snowball zones.
                             If not None then the sampler_func should take
                             account of snowball sample zones i.e.
                             conditionalALAAMsampler()
         directed          - Default False.
                             True for directed network else undirected.

    Write output to ifd_theta_values_<basename>_<run>.txt and
                    ifd_dzA_values_<basename>_<run>.txt
    where <basename> is the basename of edgelist filename e.g.
    if edgelist_filename is edges.txt then ifd_theta_values_edges_0.txt
    and ifd_dzA_values_edges_0.txt etc.
    WARNING: these files are overwritten.
    """
    assert len(param_func_list) == len(labels)

    # Build output filenames: <prefix><basename>[_<run>].txt
    basename = os.path.splitext(os.path.basename(edgelist_filename))[0]
    run_suffix = '' if run is None else '_' + str(run)
    theta_outfilename = (THETA_PREFIX + basename + run_suffix
                         + os.extsep + 'txt')
    dzA_outfilename = (DZA_PREFIX + basename + run_suffix
                       + os.extsep + 'txt')

    if directed:
        G = Digraph(edgelist_filename, binattr_filename, contattr_filename,
                    catattr_filename, zone_filename)
    else:
        G = Graph(edgelist_filename, binattr_filename, contattr_filename,
                  catattr_filename, zone_filename)
    G.printSummary()

    # The first whitespace-separated token is skipped (presumably the
    # header line of the attribute file); the rest is one value per node.
    # Read under 'with' so the file handle is always closed.
    with open(outcome_bin_filename) as outcome_file:
        A = [int_or_na(token) for token in outcome_file.read().split()[1:]]
    assert len(A) == G.numNodes()

    print('positive outcome attribute = ', (float(A.count(1))/len(A))*100.0, '%')
    assert all(x in (0, 1, NA_VALUE) for x in A)
    if NA_VALUE in A:
        print('Warning: outcome variable has', A.count(NA_VALUE), 'NA values')

    A = np.array(A) # convert list to numpy vector

    # steps of Alg 1
    M1 = 100

    print('M1 = ', M1, ' EEiterations = ', EEiterations, end=' ')
    print('learningRate = ', learningRate, end=' ')

    # Context managers guarantee both output files are closed even if one
    # of the algorithms raises part-way through.
    with open(theta_outfilename, 'w', 1) as theta_outfile: # 1 means line buffering
        theta_outfile.write('t ' + ' '.join(labels) + ' ' +
                            'AcceptanceRate' + '\n')
        print('Running Algorithm S...', end=' ')
        start = time.time()
        (theta, Dmean) = algorithm_S(G, A, param_func_list, M1,
                                     theta_outfile, sampler_func)
        print(time.time() - start, 's')
        print('after Algorithm S:')
        print('theta = ', theta)
        print('Dmean = ', Dmean)

        # The dzA file is only created once Algorithm S has finished.
        with open(dzA_outfilename, 'w', 1) as dzA_outfile:
            dzA_outfile.write('t ' + ' '.join(labels) + '\n')
            print('Running Algorithm EE...', end=' ')
            start = time.time()
            theta = algorithm_EE(G, A, param_func_list, theta,
                                 EEiterations, theta_outfile, dzA_outfile,
                                 learningRate, sampler_func)
            print(time.time() - start, 's')

    print('at end theta = ', theta)