def main():
    """Exercise the user-profile lookup from the command line.

    Configures file logging, loads ``data/user-info.csv`` through
    ``UserProfileDetails`` and prints the lookup result for two hard-coded
    user ids (``681473`` and ``-1``).
    """
    logging.basicConfig(
        filename='logs/user_profile_details.log',
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s: %(message)s',
    )
    logger.info(
        "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    logger.info("User Profile Details::")

    source_file = fu.FileOpen("data", "user-info.csv")
    logger.info(source_file)

    profiles = UserProfileDetails(source_file)
    profiles.process()
    logger.info(profiles)

    # Same lookup-and-print sequence for each sample id, in the original order.
    for user_id in (681473, -1):
        username, profile_text = profiles.find_user_profile(user_id)
        fields = (user_id, username, profile_text)
        print("UserID: " + " : ".join(str(field) for field in fields))
def load_sibling_data(self, filename):
    """Load one sibling-data JSON file and return its parsed content.

    Args:
        filename: Name of the JSON file, resolved against
            ``self.sibling_data_folder`` via ``fu.FileOpen``.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        OSError: If the resolved path cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    logger.info("Load Sibling Data: " + filename)
    file_open = fu.FileOpen(self.sibling_data_folder, filename)
    logger.info(file_open)
    with open(file_open.absolute, 'r') as sibling_file:
        sibling_json = json.load(sibling_file)
    # Log the data that was actually read (an unrelated, always-empty dict
    # used to be logged here, which made the debug line useless).
    logger.debug(sibling_json)
    return sibling_json
def main():
    """Run the tweet-data pre-processing step as a script.

    Sets up file logging, reads ``tweet_spatial_analysis.ini``, opens
    ``data/tweet-data.csv`` and hands both to ``TweetDataPreProcessing``,
    whose ``process()`` method does the actual work.
    """
    log_settings = {
        'filename': 'logs/data_preprocessing.log',
        'level': logging.DEBUG,
        'format': '%(asctime)s %(levelname)s: %(message)s',
    }
    logging.basicConfig(**log_settings)

    banner_lines = (
        "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -",
        "Tweet Data: Pre-Processing:",
    )
    for banner_line in banner_lines:
        logger.info(banner_line)

    analysis_config = cu.TweetSpatialAnalysisConfig(
        "tweet_spatial_analysis.ini")
    logger.info(analysis_config)

    tweet_file = fu.FileOpen("data", "tweet-data.csv")
    logger.info(tweet_file)

    processor = TweetDataPreProcessing(tweet_file, analysis_config)
    logger.info(processor)
    processor.process()
def __init__(self, config_filename):
    """Read the analysis settings from an INI file into attributes.

    Every key of the ``[RANGES]`` section listed below is parsed with
    ``parse_range_str`` and stored under its own name. The two
    ``[HISTOGRAMS]`` bin definitions are parsed the same way and
    additionally rendered with ``create_bins_text`` into ``<name>_text``.
    ``[SIBLING_DATA] folder`` is resolved through ``fu.FileOpen``.

    Args:
        config_filename: Path of the INI file to read.

    Raises:
        KeyError: If a required section or key is missing.
    """
    parser = configparser.ConfigParser()
    parser.read(config_filename)
    self.filename = config_filename

    range_names = ('latitude', 'longitude', 'count', 'area',
                   'distance', 'ratio', 'dissolve')
    for range_name in range_names:
        raw_range = parser['RANGES'][range_name]
        setattr(self, range_name, self.parse_range_str(raw_range))

    for bins_name in ('bins_count', 'bins_ratio'):
        raw_bins = parser['HISTOGRAMS'][bins_name]
        setattr(self, bins_name, self.parse_range_str(raw_bins))
        # The text form is derived from the attribute that was just stored.
        bins_text = self.create_bins_text(getattr(self, bins_name))
        setattr(self, bins_name + '_text', bins_text)

    sibling_folder = fu.FileOpen(parser['SIBLING_DATA']['folder'])
    self.sibling_data_folder = sibling_folder.folder
from utils import user_profile_utilities as upu from utils import widget_utilities as wu logger = logging.getLogger() tweet_spatial_analysis_config = cu.TweetSpatialAnalysisConfig( "Tweet-Spatial-Analysis/conf/tweet_spatial_analysis.ini") logger.info(tweet_spatial_analysis_config) pre_processor = TweetDataPreProcessing(None) pre_processor.read_from_json( "Tweet-Spatial-Analysis/data/tweet_mean_all.json", "Tweet-Spatial-Analysis/data/tweets_median_working.json", "Tweet-Spatial-Analysis/data/tweets_median_non_working.json") file_open = fu.FileOpen("Tweet-Spatial-Analysis/data", "user-info.csv") user_info = upu.UserProfileDetails(file_open) user_info.process() tweet_data_controller = tdu.TweetDataController(pre_processor, tweet_spatial_analysis_config, user_info) map_widgets = wu.MapWidgets(tweet_data_controller, tweet_spatial_analysis_config) tweet_data_controller.selection_details = map_widgets.text_selection_details east_min, north_min = au.lon_lat_to_east_north( tweet_spatial_analysis_config.longitude[0], tweet_spatial_analysis_config.latitude[0]) east_max, north_max = au.lon_lat_to_east_north( tweet_spatial_analysis_config.longitude[1],
def build_overlap_matrix(rows, filename='./overlap_matrix.csv'):
    """Build the ellipse-overlap matrix and its distance matrix serially.

    Only MPI rank 0 does any work; every other rank returns immediately
    without loading the data set. Rank 0:

    1. Loads the working-tweet table from the pre-computed JSON files.
    2. Fills a symmetric ``rows x rows`` matrix whose entry is the result
       of ``au.are_two_ellipses_overlapping`` for the two rows, ``-1`` if
       either ellipse has a zero ``a`` or ``b`` value, and ``0`` on the
       diagonal.
    3. Passes the matrix to ``find_components`` and runs ``APD_recursive``
       on each returned component block.
    4. Writes the combined result (``-1`` where no component block covers
       a cell) as ``int8`` to ``filename`` with ``.csv`` replaced by
       ``_distance.csv``.

    Args:
        rows: Number of leading rows of the table to use; ``0`` means all.
        filename: Base CSV path, forwarded to ``find_components`` and used
            to derive the distance-matrix file name.

    Returns:
        None.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank != 0:
        # Non-root ranks have nothing to compute in the serial variant, so
        # they skip the (expensive) data loading altogether.
        return

    file_open = fu.FileOpen("data", "tweet-data.csv")
    tweet_spatial_analysis_config = cu.TweetSpatialAnalysisConfig("conf/tweet_spatial_analysis.ini")
    tdp = TweetDataPreProcessing(file_open, tweet_spatial_analysis_config)
    tdp.read_from_json("data/tweet_mean_all.json",
                       "data/tweets_median_working.json",
                       "data/tweets_median_non_working.json")
    df = tdp.tweet_data_working.df

    start_time = timeit.default_timer()
    if rows == 0:
        rows, _ = df.shape

    # Look the columns up once instead of once per matrix cell.
    col_x, col_y, col_a, col_b, col_angle = (
        df[name] for name in ('x', 'y', 'a', 'b', 'angle'))

    tweet_data_working_overlap = np.zeros((rows, rows))
    for i in range(rows):
        # Only the strict lower triangle is computed; the diagonal stays 0
        # and the upper triangle is mirrored afterwards.
        for j in range(i):
            if col_a[i] == 0 or col_b[i] == 0 or col_a[j] == 0 or col_b[j] == 0:
                tweet_data_working_overlap[i, j] = -1
            else:
                tweet_data_working_overlap[i, j] = au.are_two_ellipses_overlapping(
                    col_x[i], col_y[i], col_a[i], col_b[i], col_angle[i],
                    col_x[j], col_y[j], col_a[j], col_b[j], col_angle[j],
                )
        if (i + 1) % 10 == 0:
            print('CPU {:04d}: {:6.2f}% accomplished'.format(rank, (i + 1) / rows * 100))

    for i in range(rows):
        tweet_data_working_overlap[0:i, i] = tweet_data_working_overlap[i, 0:i]
    print('CPU {:04d}: Assigned transposed tril to triu (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))

    tweet_data_working_overlap = pd.DataFrame(data=tweet_data_working_overlap,
                                              columns=df['id'][0:rows],
                                              index=df['id'][0:rows])
    print('CPU {:04d}: Converted numpy array to pandas DataFrame (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))

    data = find_components(tweet_data_working_overlap, filename, start_time)
    df, components, idx, col, idx_list, col_list = data
    number_of_components = len(components)

    df_dist_list = []
    for i in range(number_of_components):
        D = APD_recursive(np.array(df.loc[idx_list[i], col_list[i]]))
        df_dist_list.append(pd.DataFrame(data=D, columns=col_list[i], index=idx_list[i]))

    print('CPU {:04d}: Constructing distance pandas DataFrame (Time: {:.2f} seconds)'.format(0, timeit.default_timer() - start_time))
    # Start from "-1 everywhere" and overwrite the cells of each component.
    df_dist = np.ones(df.shape) * -1
    df_dist = pd.DataFrame(data=df_dist, columns=col, index=idx)
    for i in range(len(idx_list)):
        idx_local = df_dist_list[i].index
        col_local = df_dist_list[i].columns
        df_dist.loc[idx_local, col_local] = df_dist_list[i]

    filename_dis = filename.replace('.csv', '_distance.csv')
    print('CPU {:04d}: Now saving pandas DataFrame to {:s} (Time: {:.2f} seconds)'.format(0, filename_dis, timeit.default_timer() - start_time))
    # NOTE(review): int8 holds at most 127 - confirm distances never exceed it.
    df_dist = df_dist.astype(np.int8)
    df_dist.to_csv(filename_dis, sep=',', header=True, index=True)

    elapsed_time = timeit.default_timer() - start_time
    hour = math.floor(elapsed_time / 3600)
    minute = math.floor((elapsed_time - hour * 3600) / 60)
    second = elapsed_time - 3600 * hour - 60 * minute
    print('Time elapsed: {:d} hours {:d} minutes {:.2f} seconds'.format(hour, minute, second))
def build_overlap_matrix_parallel(rows, filename='./overlap_matrix_parallel_block.csv'):
    """Build the ellipse-overlap and distance matrices with an MPI task farm.

    Rank 0 is the master, every other rank is a worker. The run has two
    phases, each started by a broadcast of the number of tasks:

    1. *Overlap phase* - one task per pair ``(i, j)`` with ``j < i``. A
       worker returns ``-1`` if either ellipse has a zero ``a`` or ``b``,
       otherwise the result of ``au.are_two_ellipses_overlapping``. The
       master stores it at ``[i, j]`` and finally adds the transpose to
       obtain the symmetric matrix.
    2. *Distance phase* - ``find_components`` splits the overlap matrix
       into component blocks; one task per block, on which the worker runs
       ``APD_recursive``.

    The master then merges the blocks into one frame (``-1`` where no
    block covers a cell) and writes it as ``int8`` to ``filename`` with
    ``.csv`` replaced by ``_distance.csv``.

    Message protocol: every task and every result carries its own task id
    in the payload, and only two fixed tags are used. Using the task id as
    the tag (as was done before) collides with any fixed stop tag once the
    number of tasks reaches that value, and may exceed the MPI tag limit.

    Args:
        rows: Number of leading rows of the table to use; ``0`` means all.
        filename: Base CSV path, forwarded to ``find_components`` and used
            to derive the distance-matrix file name.

    Returns:
        None.

    Raises:
        RuntimeError: If started with fewer than two MPI processes (the
            master would otherwise wait forever for worker results).
    """
    start_time = timeit.default_timer()
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    status = MPI.Status()

    if size < 2:
        raise RuntimeError(
            'build_overlap_matrix_parallel needs at least 2 MPI processes '
            '(1 master + 1 worker); got {:d}'.format(size))

    tag_work = 1  # message carries a task (master -> worker) or a result
    tag_stop = 2  # master -> worker: no more tasks in the current phase

    n_indices = 0
    number_of_components = 0

    if rank == 0:
        file_open = fu.FileOpen("data", "tweet-data.csv")
        tweet_spatial_analysis_config = cu.TweetSpatialAnalysisConfig("conf/tweet_spatial_analysis.ini")
        tdp = TweetDataPreProcessing(file_open, tweet_spatial_analysis_config)
        tdp.read_from_json("data/tweet_mean_all.json",
                           "data/tweets_median_working.json",
                           "data/tweets_median_non_working.json")
        df = tdp.tweet_data_working.df
        if rows == 0:
            rows, _ = df.shape
        mpi4py.get_config()
        np.__config__.show()

        # ---- Phase 1: pairwise overlap ---------------------------------
        tweet_data_working_overlap = np.zeros((rows, rows))
        indices = [(i, j) for i in range(rows) for j in range(i)]
        n_indices = len(indices)
        n_indices = comm.bcast(n_indices, root=0)

        # Look the columns up once instead of once per task.
        col_x, col_y, col_a, col_b, col_angle = (
            df[name] for name in ('x', 'y', 'a', 'b', 'angle'))

        def pair_task(task_id):
            # Payload for one pair: the task id followed by both ellipses.
            i, j = indices[task_id]
            return (task_id,
                    col_x[i], col_y[i], col_a[i], col_b[i], col_angle[i],
                    col_x[j], col_y[j], col_a[j], col_b[j], col_angle[j])

        # Prime workers 1..min(size-1, n_indices) with one task each.
        n_sent = 0
        for _ in range(min(size - 1, n_indices)):
            comm.send(pair_task(n_sent), dest=n_sent + 1, tag=tag_work)
            n_sent += 1

        # Each result is answered with either the next task or a stop, so
        # every primed worker receives exactly one stop message.
        for k in range(n_indices):
            task_id, overlap = comm.recv(source=MPI.ANY_SOURCE, tag=tag_work, status=status)
            src = status.Get_source()
            i, j = indices[task_id]
            tweet_data_working_overlap[i, j] = overlap
            if n_sent < n_indices:
                comm.send(pair_task(n_sent), dest=src, tag=tag_work)
                n_sent += 1
            else:
                comm.send(None, dest=src, tag=tag_stop)
            if (k + 1) % 1000000 == 0:
                print('CPU {:04d}: {:6.2f}% accomplished (Time: {:.2f} seconds)'.format(rank, (k + 1) / n_indices * 100, timeit.default_timer() - start_time))

        # Only the strict lower triangle was filled; mirror it.
        tweet_data_working_overlap = tweet_data_working_overlap + tweet_data_working_overlap.T
        print('CPU {:04d}: Merged received data to global numpy array (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))
        tweet_data_working_overlap = pd.DataFrame(data=tweet_data_working_overlap,
                                                  columns=df['id'][0:rows],
                                                  index=df['id'][0:rows])
        print('CPU {:04d}: Converted numpy array to pandas DataFrame (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))

        # ---- Phase 2: per-component distances --------------------------
        data = find_components(tweet_data_working_overlap, filename, start_time)
        df, components, idx, col, idx_list, col_list = data
        number_of_components = len(components)
        number_of_components = comm.bcast(number_of_components, root=0)
        print('CPU {:04d}: Found {:d} components (Time: {:.2f} seconds)'.format(rank, number_of_components, timeit.default_timer() - start_time))

        nsent = 0
        for _ in range(min(size - 1, number_of_components)):
            comm.send((nsent, df.loc[idx_list[nsent], col_list[nsent]]), dest=nsent + 1, tag=tag_work)
            nsent += 1

        df_dist_list = []
        for _ in range(number_of_components):
            component_id, D = comm.recv(source=MPI.ANY_SOURCE, tag=tag_work, status=status)
            src = status.Get_source()
            df_dist_list.append(pd.DataFrame(data=D,
                                             columns=col_list[component_id],
                                             index=idx_list[component_id]))
            if nsent < number_of_components:
                comm.send((nsent, df.loc[idx_list[nsent], col_list[nsent]]), dest=src, tag=tag_work)
                nsent += 1
            else:
                comm.send(None, dest=src, tag=tag_stop)

        print('CPU {:04d}: Constructing distance pandas DataFrame (Time: {:.2f} seconds)'.format(0, timeit.default_timer() - start_time))
        # Start from "-1 everywhere" and overwrite the cells of each block.
        # Blocks arrive in completion order, but each carries its own labels.
        df_dist = np.ones(df.shape) * -1
        df_dist = pd.DataFrame(data=df_dist, columns=col, index=idx)
        for block in df_dist_list:
            df_dist.loc[block.index, block.columns] = block

        filename_dis = filename.replace('.csv', '_distance.csv')
        print('CPU {:04d}: Now saving pandas DataFrame to {:s} (Time: {:.2f} seconds)'.format(0, filename_dis, timeit.default_timer() - start_time))
        # NOTE(review): int8 holds at most 127 - confirm distances never exceed it.
        df_dist = df_dist.astype(np.int8)
        df_dist.to_csv(filename_dis, sep=',', header=True, index=True)

        elapsed_time = timeit.default_timer() - start_time
        hour = math.floor(elapsed_time / 3600)
        minute = math.floor((elapsed_time - hour * 3600) / 60)
        second = elapsed_time - 3600 * hour - 60 * minute
        print('Time elapsed: {:d} hours {:d} minutes {:.2f} seconds'.format(hour, minute, second))
    else:
        # ---- Worker, phase 1 -------------------------------------------
        n_indices = comm.bcast(n_indices, root=0)
        # The master primes ranks 1..min(size-1, n_indices), so the bound is
        # inclusive: rank == n_indices has a task waiting for it.
        if rank <= n_indices:
            while True:
                data = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
                if status.Get_tag() == tag_stop:
                    break
                task_id, x1, y1, a1, b1, t1, x2, y2, a2, b2, t2 = data
                if a1 == 0 or b1 == 0 or a2 == 0 or b2 == 0:
                    overlap = -1
                else:
                    overlap = au.are_two_ellipses_overlapping(x1, y1, a1, b1, t1, x2, y2, a2, b2, t2)
                comm.send((task_id, overlap), dest=0, tag=tag_work)

        # ---- Worker, phase 2 -------------------------------------------
        number_of_components = comm.bcast(number_of_components, root=0)
        # Inclusive for the same reason as above; with a single component
        # it is rank 1 that must pick the task up.
        if rank <= number_of_components:
            while True:
                message = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
                if status.Get_tag() == tag_stop:
                    break
                component_id, A = message
                m, n = A.shape
                if m > 1:
                    print('CPU {:04d}: Starting APD for {:d}-by-{:d} matrix (Time: {:.2f} seconds)'.format(rank, m, n, timeit.default_timer() - start_time))
                D = APD_recursive(np.array(A))
                comm.send((component_id, D), dest=0, tag=tag_work)