# These snippets assume the usual third-party imports (pandas as pd,
# numpy as np, matplotlib.pyplot as plt, json, csv, pickle, datetime, tqdm)
# and project helpers such as get_input_file_path, get_output_file_path,
# Database, DataAccess and the module-level `columns` list.


def generate():
    _db = Database("app.db")
    data_access = DataAccess(db=_db)
    data_access.generate_dataset("all_friends.csv")
    friends_list = pd.read_csv(get_output_file_path("all_friends.csv"))
    users = pd.read_csv(get_input_file_path("all_infected.csv"))
    user_ids = set(users['id'])
    # Keep only the friends of each user that are themselves infected.
    friends_list['friends'] = friends_list.apply(
        lambda x: list(set(json.loads(x['friends'])) & user_ids), axis=1)
    friends_list.to_csv(get_output_file_path("filtered_friends.csv"),
                        index=False, header=False, sep="\t")
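# A minimal sketch of the filtering step above on toy data; the frames and
# ids here are invented for illustration only.
# _demo = pd.DataFrame({'id': [1, 2], 'friends': ['[2, 3, 4]', '[1, 5]']})
# _infected = {1, 2, 3}
# _demo['friends'] = _demo.apply(
#     lambda x: list(set(json.loads(x['friends'])) & _infected), axis=1)
# # row 0 keeps {2, 3}; row 1 keeps {1} (list order is arbitrary)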
def merge(ext_followers, source_candidates, out_file):
    print(f"Loading dataframe {ext_followers}")
    ext_followers_df = pd.read_csv(get_output_file_path(ext_followers), sep='\t')
    print(f"Loading dataframe {source_candidates}")
    source_candidates_df = pd.read_csv(get_output_file_path(source_candidates),
                                       sep='\t', names=["id", "source_candidates"])
    # Left-join the source candidates onto the external followers by user id.
    df = ext_followers_df.join(source_candidates_df.set_index('id'), on='id')
    df["followers_list"] = None
    print(f"saving dataframe as {out_file}")
    df.to_csv(get_output_file_path(out_file), index=False)
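# Illustration of the join above on invented frames: source candidates are
# matched by 'id', and ids without a match get NaN.
# _left = pd.DataFrame({'id': [1, 2, 3]})
# _right = pd.DataFrame({'id': [1, 3], 'source_candidates': ['[5]', '[7, 8]']})
# _left.join(_right.set_index('id'), on='id')
# #    id source_candidates
# # 0   1               [5]
# # 1   2               NaN
# # 2   3            [7, 8]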
def _merge(user_file, avg_file, out_file):
    user_df = pd.read_csv(get_input_file_path(user_file))
    nd_df = pd.read_csv(get_output_file_path(avg_file), sep='\t',
                        names=["user_id", "avg_neighbour_degree"])
    df = user_df.join(nd_df.set_index('user_id'), on='user_id')
    print(f"saving dataframe as {out_file}")
    df.to_csv(get_output_file_path(out_file), index=False)
    # Also write a deduplicated copy ranked by average neighbour degree.
    df_filter = df.drop_duplicates(subset='avg_neighbour_degree', keep="last")
    df_filter = df_filter.sort_values(by=['avg_neighbour_degree'], ascending=False)
    df_filter.to_csv(get_output_file_path(f"filtered_{out_file}"), index=False)
# Method of SimGraphDataGenerator (see the commented-out invocation below).
def generate(self, in_file, out_file, start_time):
    sim_data = pd.read_csv(get_input_file_path(in_file))
    # Resolve each node's infection source from its candidate list.
    sim_data['infection_source'] = sim_data['source_candidates'].map(
        lambda x: self.__find(sim_data, x))
    out_columns = ['id', 'time_lapsed', 'infection_source']
    temp_df = sim_data[out_columns]
    sorted_df = temp_df.sort_values(by=['time_lapsed'])
    initial_nodes = set()
    initial_links = list()
    dynamic_nodes = set()
    dynamic_links = list()
    # Split the cascade: everything infected by start_time forms the static
    # seed graph, later infections become timestamped dynamic updates.
    for _, row in sorted_df.iterrows():
        if row['time_lapsed'] <= start_time:
            initial_nodes.add(row['id'])
            if not np.isnan(row['infection_source']):
                initial_nodes.add(row['infection_source'])
                if not np.isnan(row['time_lapsed']):
                    initial_links.append({
                        "source": row['infection_source'],
                        "target": row['id']
                    })
        else:
            dynamic_nodes.add(row['id'])
            if not np.isnan(row['infection_source']):
                dynamic_nodes.add(row['infection_source'])
                if not np.isnan(row['time_lapsed']):
                    dynamic_links.append({
                        "source": row['infection_source'],
                        "target": row['id'],
                        "timeLapsed": row['time_lapsed']
                    })
    data = {
        "initialData": {
            "nodes": [{"id": x, "group": 1} for x in initial_nodes],
            "links": initial_links
        },
        "dynamicData": {
            "nodes": [{"id": x} for x in dynamic_nodes],
            "links": dynamic_links
        }
    }
    with open(get_output_file_path(out_file), 'w') as fp:
        json.dump(data, fp)


# generator = SimGraphDataGenerator()
# generator.generate("givenchy_simulation_result_6hrs_6_hrs_model_retrained.csv",
#                    "6hrs_sim_graph_retrained.json", 360.0)
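# Shape of the JSON written above, with invented values; the nodes/links
# layout looks like input for a force-directed graph view, though that is
# an assumption about the consumer:
# {
#   "initialData": {"nodes": [{"id": 42, "group": 1}],
#                   "links": [{"source": 7, "target": 42}]},
#   "dynamicData": {"nodes": [{"id": 99}],
#                   "links": [{"source": 42, "target": 99, "timeLapsed": 370.5}]}
# }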
def plot(fname, size, save=False):
    plt.title(fname)
    plt.tight_layout()
    fig = plt.gcf()
    fig.set_size_inches(size[0], size[1], forward=True)
    plt.rcParams['figure.figsize'] = size
    if save:
        plt.savefig("{}.pdf".format(get_output_file_path(fname)), dpi=600)
    plt.show()
    plt.close()
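# Example call, assuming a figure has already been drawn on the current
# axes; the data and filename are hypothetical.
# plt.plot([0, 60, 120], [10, 40, 90])
# plot("infection_curve", (8, 6), save=True)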
def create_filter_set(filter_file):
    filter_df = pd.read_csv(get_output_file_path(filter_file), sep='\t',
                            names=["id", "followers"])
    id_set = set()
    # Union all follower ids across rows into one lookup set.
    for _, row in filter_df.iterrows():
        followers = set(map(int, json.loads(row["followers"])))
        id_set.update(followers)
    return id_set
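# The filter file is expected to be a headerless TSV whose second column is
# a JSON array of follower ids, one row per user, e.g. 12<TAB>"[34, 56]".
# A hypothetical call:
# infected_follower_ids = create_filter_set("filtered_friends.csv")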
def json_to_csv(in_file, filter_file, out_file, event_day):
    filter_ids = create_filter_set(filter_file)
    with open(in_file, "r") as json_in:
        with open(get_output_file_path(out_file), "w") as csv_out:
            header = "\t".join(columns)
            csv_out.write(f"{header}\n")
            # total is the known line count of the input dump, for tqdm only.
            for line in tqdm(json_in, total=4417245):
                row = {}
                user = json.loads(line)
                if int(user["id"]) in filter_ids:
                    created_at = datetime.strptime(
                        user["created_at"], "%a %b %d %H:%M:%S %z %Y")
                    # Account age in days at event time; clamp to at least
                    # one day to avoid division by zero below.
                    ucd = event_day - created_at
                    user_created_days = ucd.days if ucd.days > 0 else 1
                    row["id"] = user["id"]
                    row["created_at"] = created_at.strftime("%Y-%m-%d %H:%M:%S")
                    row["favourites_count"] = user["favourites_count"]
                    row["followers_count"] = user["followers_count"]
                    row["friends_count"] = user["friends_count"]
                    row["listed_count"] = user["listed_count"]
                    row["statuses_count"] = user["statuses_count"]
                    row["user_created_days"] = user_created_days
                    # Normalise the raw counts by account age.
                    for count in ("statuses_count", "followers_count",
                                  "favourites_count", "listed_count",
                                  "friends_count"):
                        row[f"normalized_{count}"] = (
                            row[count] / row["user_created_days"])
                    out_line = "\t".join(str(row[col]) for col in columns)
                    csv_out.write(f"{out_line}\n")
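# Because created_at is parsed with %z, event_day must be timezone-aware or
# the subtraction above raises TypeError. A sketch with made-up file names:
# from datetime import timezone
# json_to_csv("users.json", "filtered_friends.csv", "user_features.csv",
#             datetime(2018, 6, 9, tzinfo=timezone.utc))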
def _compute_adv_degree(self):
    print("starting computation for avg neighbours degree")
    avg_file = f"avg_neighbour_degree_{self.adj_list_file}"
    with open(get_output_file_path(avg_file), 'w') as out:
        with open(get_input_file_path(self.adj_list_file), 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            for row in reader:
                source = row[0]
                targets = json.loads(row[1])
                neighbour_degree = 0
                neighbour_count = 0
                # Only neighbours present in the adjacency list contribute.
                for t in targets:
                    if t in self.adj_list:
                        neighbour_degree += len(self.adj_list[t])
                        neighbour_count += 1
                # neighbour_degree = neighbour_degree / len(targets)
                # Clamp the divisor so nodes with no known neighbours
                # average to zero instead of raising ZeroDivisionError.
                if neighbour_count == 0:
                    neighbour_count = 1
                avg_neighbour_degree = neighbour_degree / neighbour_count
                out.write(f"{source}\t{avg_neighbour_degree}\n")
    return avg_file
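# Worked example with a made-up adjacency list: given
#   self.adj_list = {"b": ["a", "c"], "c": ["a"]}
# a row ("a", '["b", "c", "d"]') yields degree 2 for "b", degree 1 for "c",
# skips the unknown "d", and writes avg (2 + 1) / 2 = 1.5 for "a".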
def generate_dataset(self, out_file):
    df = self.db.select_to_dataframe(self.SQL_ALL_USER)
    ensure_output_path()
    df.to_csv(get_output_file_path(out_file), index=False)
def load_pickle_file(path):
    # NOTE: the opening lines of this function were missing; reconstructed on
    # the assumption that `infile` is the pickle opened in binary read mode.
    infile = open(path, 'rb')
    unpickled_file = pickle.load(infile)
    print(f'Loaded {len(unpickled_file)} entries')
    infile.close()
    return unpickled_file


def save_pickle_file(path, data):
    print('Dumping data to path {}'.format(path))
    with open(path, 'wb') as file:
        pickle.dump(data, file)
    print('Finished dumping data to path {}'.format(path))


current_time = 420
users = load_pickle_file(get_output_file_path("nyc_users_6_9_infected.dat"))
ext_followers = pd.read_csv(get_output_file_path("nyc_6_9_ext_followers.csv"))


def id_to_index_list(idx_lookup, src_candidate_ids):
    # Map raw user ids to dataframe row indices, dropping ids with no mapping.
    return [idx for idx in (idx_lookup.get(x) for x in src_candidate_ids)
            if idx is not None]


network_simulation = pd.DataFrame(columns=[
    'id', 'time_lapsed', 'favourites_count', 'followers_count',
    'friends_count', 'listed_count', 'statuses_count', 'source_candidates',
    'source_index', 'seed_index', 'generation', 'time_since_seed',
    'user_created_days', 'normalized_statuses_count',
    'normalized_followers_count', 'normalized_favourites_count',
    'normalized_listed_count', 'normalized_friends_count'])
# Series.append was removed in pandas 2.0; pd.concat is the equivalent.
network_simulation['id'] = pd.concat([users['id'], ext_followers['id']],
                                     ignore_index=True)
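# Hypothetical use of id_to_index_list: build a lookup from user id to row
# index in network_simulation, then translate candidate ids to indices.
# idx_lookup = {uid: i for i, uid in network_simulation['id'].items()}
# id_to_index_list(idx_lookup, [123, 456])  # indices of the ids that exist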