def fit(self, p, samples=100):
    """Estimate the g-formula for network data under weak interference at coverage `p`.

    Repeatedly assigns an i.i.d. Bernoulli(p) exposure to all nodes, recomputes the
    neighborhood exposure summaries, predicts outcomes with the fitted outcome model,
    and averages the per-sample means into the marginal outcome estimate.

    Parameters
    ----------
    p : float
        Probability of exposure (treatment coverage) for each node.
    samples : int, optional
        Number of Monte Carlo resamples of the exposure plan (default 100).

    Sets
    ----
    self.marginals_vector : list
        Mean predicted outcome from each resample.
    self.marginal_outcome : float
        Average of `marginals_vector`; the estimated marginal outcome under coverage `p`.
    """
    marginals = []
    for s in range(samples):
        # Selecting and applying binary treatment
        g = self.df.copy()
        g[self.exposure] = np.random.binomial(n=1, p=p, size=g.shape[0])

        # Back-calculate updated exposure mapping (neighborhood summaries of the new plan)
        v_vector = np.asarray(g[self.exposure])
        g[self.exposure + '_sum'] = fast_exp_map(self.adj_matrix, v_vector, measure='sum')
        g[self.exposure + '_mean'] = fast_exp_map(self.adj_matrix, v_vector, measure='mean')
        g[self.exposure + '_mean'] = g[self.exposure + '_mean'].fillna(0)            # isolates should have mean=0
        g[self.exposure + '_var'] = fast_exp_map(self.adj_matrix, v_vector, measure='var')
        g[self.exposure + '_var'] = g[self.exposure + '_var'].fillna(0)              # isolates should have var=0
        g[self.exposure + '_mean_dist'] = fast_exp_map(self.adj_matrix, v_vector, measure='mean_dist')
        g[self.exposure + '_mean_dist'] = g[self.exposure + '_mean_dist'].fillna(0)  # isolates have mean_dist=0
        g[self.exposure + '_var_dist'] = fast_exp_map(self.adj_matrix, v_vector, measure='var_dist')
        g[self.exposure + '_var_dist'] = g[self.exposure + '_var_dist'].fillna(0)    # isolates have var_dist=0

        # Re-derive any user-requested threshold variables from the updated summaries
        if self._thresholds_any_:
            create_threshold(data=g, variables=self._thresholds_variables_,
                             thresholds=self._thresholds_, definitions=self._thresholds_def_)

        # Generating predictions for treatment plan
        # (removed dead store `g[self.outcome] = np.nan` — it was immediately overwritten)
        g[self.outcome] = self._outcome_model.predict(g)
        marginals.append(np.mean(g[self.outcome]))

    self.marginals_vector = marginals
    self.marginal_outcome = np.mean(marginals)
def __init__(self, network, exposure, outcome, verbose=False):
    """Implementation of the g-formula estimator described in Sofrygin & van der Laan 2017"""
    # Background processing to convert network attribute data to pandas DataFrame
    df = network_to_df(network)

    # Only binary exposures are supported; outcomes may be binary or continuous
    if not df[exposure].value_counts().index.isin([0, 1]).all():
        raise ValueError("NetworkGFormula only supports binary exposures currently")
    self._continuous_ = not df[outcome].value_counts().index.isin([0, 1]).all()

    # Relabel nodes as 0..n-1 so positions line up with the adjacency matrix rows
    network = nx.convert_node_labels_to_integers(network, first_label=0, label_attribute='_original_id_')
    self.network = network
    self.adj_matrix = nx.adjacency_matrix(network, weight=None)
    self.exposure = exposure
    self.outcome = outcome

    # Creating variable mapping for all variables in the network
    skip = ('_original_id_', outcome)
    for v in [c for c in df.columns if c not in skip]:
        vec = np.asarray(df[v])
        df[v + '_sum'] = fast_exp_map(self.adj_matrix, vec, measure='sum')
        # Each of these summaries is NaN for isolated nodes; set those to 0
        for suffix in ('mean', 'var', 'mean_dist', 'var_dist'):
            col = v + '_' + suffix
            df[col] = fast_exp_map(self.adj_matrix, vec, measure=suffix)
            df[col] = df[col].fillna(0)

    # Calculating Degree
    degree_data = pd.DataFrame.from_dict(dict(network.degree), orient='index').rename(columns={0: 'degree'})
    self.df = pd.merge(df, degree_data, how='left', left_index=True, right_index=True)

    # Output attributes
    self.marginals_vector = None
    self.marginal_outcome = None

    # Storage for items I need later
    self._outcome_model = None
    self._q_model = None
    self._verbose_ = verbose
    self._thresholds_ = []
    self._thresholds_variables_ = []
    self._thresholds_def_ = []
    self._thresholds_any_ = False
def test_fast_exp_map_graph3(self):
    """fast_exp_map 'sum' on a complete graph matches the reference exp_map."""
    G = nx.complete_graph(5)
    a = [1, 1, 1, 0, 0]
    for node, value in zip(G.nodes(), a):
        G.nodes[node]['A'] = value

    observed = fast_exp_map(nx.adjacency_matrix(G, weight=None), np.array(a), measure='sum')
    expected = exp_map(G, 'A')
    npt.assert_equal(observed, expected)
def test_fast_exp_map_directed(self):
    """fast_exp_map 'sum' on a directed star matches the reference exp_map."""
    G = nx.DiGraph()
    G.add_edges_from([(0, 1), (0, 2), (0, 3), (0, 4)])
    a = [1, 0, 1, 1, 1]
    for node, value in zip(G.nodes(), a):
        G.nodes[node]['A'] = value

    observed = fast_exp_map(nx.adjacency_matrix(G, weight=None), np.array(a), measure='sum')
    expected = exp_map(G, 'A')
    npt.assert_equal(observed, expected)
def _generate_pooled_sample(self, p, samples):
    """Generate a pooled (stacked) sample of data sets under the stochastic exposure plan.

    Each resample draws i.i.d. Bernoulli(p) exposures, recomputes the neighborhood
    exposure summaries (and the nonparametric mapping when no summary measure was
    specified), and tags rows with a `_sample_id_` column before stacking.

    Parameters
    ----------
    p : float
        Probability of exposure for each node.
    samples : int
        Number of resamples to generate and stack.

    Returns
    -------
    pandas.DataFrame
        All resamples concatenated, distinguished by the `_sample_id_` column.
    """
    pooled_sample = []
    for s in range(samples):
        g = self.df.copy()
        g[self.exposure] = np.random.binomial(n=1, p=p, size=g.shape[0])
        a = np.array(g[self.exposure])

        g[self.exposure + '_sum'] = fast_exp_map(self.adj_matrix, a, measure='sum')
        # BUG FIX: the original recomputed `_mean_dist`'s fillna in place of `_var_dist`,
        # leaving `_var_dist` as NaN for isolated nodes. Each summary is NaN for
        # isolates, so every one gets filled with 0.
        for suffix in ('mean', 'var', 'mean_dist', 'var_dist'):
            col = self.exposure + '_' + suffix
            g[col] = fast_exp_map(self.adj_matrix, a, measure=suffix)
            g[col] = g[col].fillna(0)  # isolates should have summary measure = 0

        if self._gs_measure_ is None:
            network = self.network.copy()
            for n in network.nodes():
                # BUG FIX: `Graph.node` was removed in networkx 2.4; use `Graph.nodes`
                network.nodes[n][self.exposure] = a[n]
            df = exp_map_individual(network, measure=self.exposure,
                                    max_degree=self._max_degree_).fillna(0)
            for c in self._nonparam_cols_:
                g[c] = df[c]

        # Re-derive any user-requested threshold variables from the updated summaries
        if self._thresholds_any_:
            create_threshold(data=g, variables=self._thresholds_variables_,
                             thresholds=self._thresholds_, definitions=self._thresholds_def_)

        g['_sample_id_'] = s
        pooled_sample.append(g)

    return pd.concat(pooled_sample, axis=0, ignore_index=True)
raise ValueError("Invalid set-up specification for " + network + " network") else: raise ValueError("Invalid network name in .sh script") # Determining if shift or absolute shift = bool(int(shift)) if shift: prop_treated = [-2.5, -2.0, -1.5, -1.0, -0.5, 0.5, 1.0, 1.5, 2.0, 2.5] # Generating probabilities (true) to assign data = network_to_df(G) adj_matrix = nx.adjacency_matrix(G, weight=None) data['O_mean'] = fast_exp_map(adj_matrix, np.array(data['O']), measure='mean') data['G_mean'] = fast_exp_map(adj_matrix, np.array(data['G']), measure='mean') prob = logistic.cdf(-1.3 - 1.5 * data['P'] + 1.5 * data['P'] * data['G'] + 0.95 * data['O_mean'] + 0.95 * data['G_mean']) log_odds = np.log(probability_to_odds(prob)) else: prop_treated = [ 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95 ] truth = truth_values(network=network,
raise ValueError("Invalid set-up specification for " + network + " network") else: raise ValueError("Invalid network name in .sh script") # Determining if shift or absolute shift = bool(int(shift)) if shift: prop_treated = [-2.5, -2.0, -1.5, -1.0, -0.5, 0.5, 1.0, 1.5, 2.0, 2.5] # Generating probabilities (true) to assign data = network_to_df(G) adj_matrix = nx.adjacency_matrix(G, weight=None) data['E_mean'] = fast_exp_map(adj_matrix, np.array(data['E']), measure='mean') prob = logistic.cdf(-0.5 + 0.05 * (data['B'] - 30) + 0.25 * data['G'] * data['E'] + 0.05 * data['E_mean']) log_odds = np.log(probability_to_odds(prob)) else: prop_treated = [ 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95 ] truth = truth_values(network=network, dgm=exposure, restricted_degree=restrict, shift=shift)