Пример #1
0
def run_on_network_attr(edgelist_filename, param_func_list, labels,
                        outcome_bin_filename,
                        binattr_filename=None,
                        contattr_filename=None,
                        catattr_filename=None,
                        sampler_func = basicALAAMsampler,
                        zone_filename = None,
                        directed = False):
    """Estimate ALAAM parameters by stochastic approximation, then run a
    goodness-of-fit test, on the specified network with binary and/or
    continuous and categorical attributes.

    Stochastic approximation is started from an all-zero theta and re-run
    (each run starting from the previous run's theta) up to 20 times, until
    every |t_ratio| is below 0.1.  Only if that convergence criterion is met
    are the parameter table and the goodness-of-fit results printed.

    Parameters:
         edgelist_filename - filename of Pajek format edgelist
         param_func_list   - list of change statistic functions corresponding
                             to parameters to estimate
         labels            - list of strings corresponding to param_func_list
                             to label output (header line)
         outcome_bin_filename - filename of binary attribute (node per line)
                                of outcome variable for ALAAM
         binattr_filename - filename of binary attributes (node per line)
                            Default None, in which case no binary attr.
         contattr_filename - filename of continuous attributes (node per line)
                            Default None, in which case no continuous attr.
         catattr_filename - filename of categorical attributes (node per line)
                            Default None, in which case no categorical attr.
         sampler_func        - ALAAM sampler function with signature
                               (G, A, changestats_func_list, theta, performMove,
                                sampler_m); see basicALAAMsampler.py
                               default basicALAAMsampler
         zone_filename   - filename of snowball sampling zone file
                           (header line 'zone' then zone number for nodes,
                           one per line)
                           Default None, in which case no snowball zones.
                           If not None then the sampler_func should take
                           account of snowball sample zones i.e.
                           conditionalALAAMsampler()
         directed        - Default False.
                           True for directed network else undirected.

    Return value:
         None. Output is written to stdout.

    Raises:
         AssertionError if labels and param_func_list differ in length, if
         the number of outcome values differs from the number of nodes, or
         if an outcome value is not 0, 1 or NA_VALUE.
    """
    assert(len(param_func_list) == len(labels))

    graph_class = Digraph if directed else Graph
    G = graph_class(edgelist_filename, binattr_filename, contattr_filename,
                    catattr_filename, zone_filename)

    G.printSummary()

    # Read the outcome file inside a context manager so the handle is always
    # closed.  The first whitespace-separated token is dropped (presumably
    # the header line of the attribute file).
    with open(outcome_bin_filename) as outcome_file:
        outcome_tokens = outcome_file.read().split()[1:]
    outcome_binvar = list(map(int_or_na, outcome_tokens))
    assert(len(outcome_binvar) == G.numNodes())
    A = outcome_binvar

    assert( all([x in [0,1,NA_VALUE] for x in A]) )
    print('positive outcome attribute = ', (float(A.count(1))/len(A))*100.0, '%')
    if NA_VALUE in A:
        print('Warning: outcome variable has', A.count(NA_VALUE), 'NA values')

    # Calculate observed statistics by summing change stats for each 1 variable
    Zobs = computeObservedStatistics(G, A, param_func_list)
    print('Zobs = ', Zobs)

    theta = np.zeros(len(param_func_list))

    estimation_start = time.time()
    max_runs = 20
    i = 0
    converged = False
    while i < max_runs and not converged:
        i += 1
        print('Running stochastic approximation (run', i,' of at most',max_runs,')...')
        start = time.time()
        (theta, std_error, t_ratio) = stochasticApproximation(G, A,
                                                              param_func_list,
                                                              theta, Zobs,
                                                              sampler_func)

        print('Stochastic approximation took',time.time() - start, 's')
        if theta is None:
            # Nothing to restart from, so give up (converged stays False).
            print('Failed.')
            break
        print('           ',labels)
        print('theta     =', theta)
        print('std_error =', std_error)
        print('t_ratio   =', t_ratio)

        converged = np.all(np.abs(t_ratio) < 0.1)

    print('Total estimation time (',i,'runs) was',time.time() - estimation_start, 's')
    if not converged:
        return

    print('Converged.')
    # A parameter is flagged with '*' when |theta| exceeds two standard errors.
    significant = np.abs(theta) > 2 * std_error
    sys.stdout.write(20*' ' + '  Parameter Std.Error t-ratio\n')
    for j in range(len(theta)):
        sys.stdout.write('%20.20s % 6.3f    % 6.3f    % 6.3f %c\n' % (labels[j], theta[j], std_error[j], t_ratio[j], ('*' if significant[j] else ' ')))
    print()

    # Do goodness-of-fit test

    # change stats functions to add to GoF if not already in estimation
    if directed:
        # TODO GoF statistics for directed
        gof_param_func_list = list(param_func_list)
        goflabels = list(labels)
    else:
        statfuncs = [changeTwoStar, changeThreeStar, changePartnerActivityTwoPath,
                     changeTriangleT1, changeContagion,
                     changeIndirectPartnerAttribute,
                     changePartnerAttributeActivity,
                     changePartnerPartnerAttribute,
                     changeTriangleT2,
                     changeTriangleT3]
        statlabels = ['Two-Star', 'Three-Star', 'Alter-2Star1A',
                      'T1', 'Contagion', 'Alter-2Star2A', 'Partner-Activity',
                      'Partner-Resource','T2', 'T3']
        # Select (function, label) pairs together, keyed on the function
        # only.  Filtering the labels separately by label text lets the two
        # lists get out of step whenever the caller's label for a statistic
        # differs from the built-in one.
        extra_stats = [(func, label)
                       for (func, label) in zip(statfuncs, statlabels)
                       if func not in param_func_list]
        gof_param_func_list = (list(param_func_list) +
                               [func for (func, _) in extra_stats])
        goflabels = list(labels) + [label for (_, label) in extra_stats]
    n = len(gof_param_func_list)
    assert len(goflabels) == n
    # pad theta vector with zeros for the added parameters
    gof_theta = np.array(list(theta) + (n-len(theta))*[0])

    Ainitial = None # default: use random initialization
    if zone_filename is not None: # conditional estimation
        # For snowball conditional estimation, we must not start with
        # random initial outcome vector, but rather make sure the
        # nodes in the outermost zone have the same outcome attributes
        # as the observed vector
        Ainitial = np.copy(A) # copy of observed vector
        # make vector of 50% ones, size of number of inner nodes
        Arandom_inner = rand_bin_array(int(0.5*len(G.inner_nodes)), len(G.inner_nodes))
        # set the outcome for inner nodes to random values, leaving
        # value of outermost nodes at the original observed values
        Ainitial[G.inner_nodes] = Arandom_inner
    print('Running goodness-of-fit test...')
    start = time.time()
    gofresult = gof(G, A, gof_param_func_list, gof_theta,
                    sampler_func = sampler_func, Ainitial = Ainitial)
    print('GoF took',time.time() - start, 's')
    print('           ',goflabels)
    print('t_ratios = ',gofresult)

    sys.stdout.write(20*' ' + '  t-ratio\n')
    for j in range(n):
        sys.stdout.write('%20.20s % 6.3f\n' % (goflabels[j], gofresult[j]))
    print()
Пример #2
0
def run_on_network_attr(edgelist_filename, param_func_list, labels,
                        outcome_bin_filename,
                        binattr_filename=None,
                        contattr_filename=None,
                        catattr_filename=None,
                        EEiterations    = 50000,
                        run = None,
                        learningRate = 0.01,
                        sampler_func = basicALAAMsampler,
                        zone_filename= None,
                        directed = False):
    """Run Algorithm S followed by Algorithm EE on the specified network
    with binary and/or continuous and categorical attributes.

    Parameters:
         edgelist_filename - filename of Pajek format edgelist
         param_func_list   - list of change statistic functions corresponding
                             to parameters to estimate
         labels            - list of strings corresponding to param_func_list
                             to label output (header line)
         outcome_bin_filename - filename of binary attribute (node per line)
                                of outcome variable for ALAAM
         binattr_filename - filename of binary attributes (node per line)
                            Default None, in which case no binary attr.
         contattr_filename - filename of continuous attributes (node per line)
                            Default None, in which case no continuous attr.
         catattr_filename - filename of categorical attributes (node per line)
                            Default None, in which case no categorical attr.
         EEiterations     - Number of iterations of the EE algorithm.
                            Default 50000.
         run              - run number for parallel runs, used as suffix on
                            output filenames. Default None
                            in which case no suffix added to output files.
         learningRate        - learning rate (step size multiplier, a)
                               default 0.01
         sampler_func        - ALAAM sampler function with signature
                               (G, A, changestats_func_list, theta, performMove,
                                sampler_m); see basicALAAMsampler.py
                               default basicALAAMsampler
         zone_filename   - filename of snowball sampling zone file
                           (header line 'zone' then zone number for nodes,
                           one per line)
                           Default None, in which case no snowball zones.
                           If not None then the sampler_func should take
                           account of snowball sample zones i.e.
                           conditionalALAAMsampler()
         directed        - Default False.
                           True for directed network else undirected.

    Return value:
         None. Progress is printed to stdout.

    Raises:
         AssertionError if labels and param_func_list differ in length, if
         the number of outcome values differs from the number of nodes, or
         if an outcome value is not 0, 1 or NA_VALUE.

    Write output to ifd_theta_values_<basename>_<run>.txt and
                    ifd_dzA_values_<basename>_<run>.txt
    where <basename> is the basename of edgelist filename e.g.
    if edgelist_filename is edges.txt then ifd_theta_values_edges_0.txt
    and ifd_dzA_values_edges_0.txt etc.
    WARNING: these files are overwritten.

    """
    assert(len(param_func_list) == len(labels))

    # Output names: <prefix><edgelist basename>[_<run>].txt
    basename = os.path.splitext(os.path.basename(edgelist_filename))[0]
    suffix = '' if run is None else '_' + str(run)
    theta_outfilename = THETA_PREFIX + basename + suffix + os.extsep + 'txt'
    dzA_outfilename = DZA_PREFIX + basename + suffix + os.extsep + 'txt'

    graph_class = Digraph if directed else Graph
    G = graph_class(edgelist_filename, binattr_filename, contattr_filename,
                    catattr_filename, zone_filename)

    G.printSummary()

    # Read the outcome file inside a context manager so the handle is always
    # closed.  The first whitespace-separated token is dropped (presumably
    # the header line of the attribute file).
    with open(outcome_bin_filename) as outcome_file:
        outcome_tokens = outcome_file.read().split()[1:]
    outcome_binvar = list(map(int_or_na, outcome_tokens))
    assert(len(outcome_binvar) == G.numNodes())
    A = outcome_binvar
    # Validate before reporting, so a summary is never printed for bad data.
    assert( all([x in [0,1,NA_VALUE] for x in A]) )
    print('positive outcome attribute = ', (float(A.count(1))/len(A))*100.0, '%')

    if NA_VALUE in A:
        print('Warning: outcome variable has', A.count(NA_VALUE), 'NA values')

    A = np.array(A) # convert list to numpy vector

    # steps of Alg 1
    M1 = 100

    print('M1 = ', M1, ' EEiterations = ', EEiterations, end=' ')
    print('learningRate = ', learningRate, end=' ')

    # Context managers guarantee both output files are closed even if one of
    # the algorithms raises.  The dzA file is still only created after
    # Algorithm S has finished, as before.
    with open(theta_outfilename, 'w', 1) as theta_outfile: # 1 means line buffering
        theta_outfile.write('t ' + ' '.join(labels) + ' ' + 'AcceptanceRate' + '\n')
        print('Running Algorithm S...', end=' ')
        start = time.time()
        (theta, Dmean) = algorithm_S(G, A, param_func_list, M1, theta_outfile,
                                     sampler_func)
        print(time.time() - start, 's')
        print('after Algorithm S:')
        print('theta = ', theta)
        print('Dmean = ', Dmean)

        with open(dzA_outfilename, 'w', 1) as dzA_outfile:
            dzA_outfile.write('t ' + ' '.join(labels) + '\n')
            print('Running Algorithm EE...', end=' ')
            start = time.time()
            theta = algorithm_EE(G, A, param_func_list, theta,
                                 EEiterations, theta_outfile, dzA_outfile,
                                 learningRate, sampler_func)
            print(time.time() - start, 's')

    print('at end theta = ', theta)