Example #1
import logging

# fu (the project's file utilities) and UserProfileDetails are assumed to be
# imported or defined elsewhere in this module.
logger = logging.getLogger()


def main():
    logging.basicConfig(filename='logs/user_profile_details.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s: %(message)s')

    logger.info(
        "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    logger.info("User Profile Details::")

    file_open = fu.FileOpen("data", "user-info.csv")
    logger.info(file_open)

    user_profile_details = UserProfileDetails(file_open)
    user_profile_details.process()
    logger.info(user_profile_details)

    user_id = 681473
    username, profile_text = user_profile_details.find_user_profile(user_id)
    print("UserID: " + str(user_id) + " : " + str(username) + " : " +
          str(profile_text))

    # An ID that does not exist, exercising the not-found path.
    user_id = -1
    username, profile_text = user_profile_details.find_user_profile(user_id)
    print("UserID: " + str(user_id) + " : " + str(username) + " : " +
          str(profile_text))
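
The fu.FileOpen helper used throughout these examples is not part of this
listing. From its observed usage (a folder/filename pair or a single folder
argument, an .absolute path, a .folder attribute, and a printable form passed
to logger.info) a minimal sketch might look like the following; the project's
real implementation may differ:

import os


class FileOpen:
    # Hypothetical reconstruction from usage only.
    def __init__(self, folder, filename=None):
        self.folder = folder
        self.filename = filename
        self.absolute = (os.path.abspath(os.path.join(folder, filename))
                         if filename else os.path.abspath(folder))

    def __str__(self):
        return "FileOpen: " + self.absolute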
Example #2
    def load_sibling_data(self, filename):
        logger.info("Load Sibling Data: " + filename)
        file_open = fu.FileOpen(self.sibling_data_folder, filename)
        logger.info(file_open)

        with open(file_open.absolute, 'r') as sibling_file:
            sibling_json = json.load(sibling_file)

        logger.debug(sibling_json)
        return sibling_json
Example #3
import logging

# cu (config utilities), fu (file utilities) and TweetDataPreProcessing are
# assumed to be imported elsewhere in this module.
logger = logging.getLogger()


def main():
    logging.basicConfig(filename='logs/data_preprocessing.log',
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s: %(message)s')

    logger.info(
        "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
    logger.info("Tweet Data: Pre-Processing:")

    tweet_spatial_analysis_config = cu.TweetSpatialAnalysisConfig(
        "tweet_spatial_analysis.ini")
    logger.info(tweet_spatial_analysis_config)

    file_open = fu.FileOpen("data", "tweet-data.csv")
    logger.info(file_open)

    pre_processor = TweetDataPreProcessing(file_open,
                                           tweet_spatial_analysis_config)
    logger.info(pre_processor)
    pre_processor.process()
Example #4
    def __init__(self, config_filename):
        # Read the INI file once and expose each configured range, histogram
        # binning and the sibling-data folder as attributes.
        config = configparser.ConfigParser()
        config.read(config_filename)

        self.filename = config_filename
        self.latitude = self.parse_range_str(config['RANGES']['latitude'])
        self.longitude = self.parse_range_str(config['RANGES']['longitude'])
        self.count = self.parse_range_str(config['RANGES']['count'])
        self.area = self.parse_range_str(config['RANGES']['area'])
        self.distance = self.parse_range_str(config['RANGES']['distance'])
        self.ratio = self.parse_range_str(config['RANGES']['ratio'])
        self.dissolve = self.parse_range_str(config['RANGES']['dissolve'])

        self.bins_count = self.parse_range_str(
            config['HISTOGRAMS']['bins_count'])
        self.bins_count_text = self.create_bins_text(self.bins_count)

        self.bins_ratio = self.parse_range_str(
            config['HISTOGRAMS']['bins_ratio'])
        self.bins_ratio_text = self.create_bins_text(self.bins_ratio)

        absolute_folder = config['SIBLING_DATA']['folder']
        folder_details = fu.FileOpen(absolute_folder)
        self.sibling_data_folder = folder_details.folder
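
The INI file itself is not included in this listing. Each parsed range is
indexed as range[0]/range[1] in Example #5, so parse_range_str plausibly
splits a comma-separated list of numbers. A hypothetical
tweet_spatial_analysis.ini with made-up values, plus a matching parser sketch:

[RANGES]
latitude = 50.0, 59.0
longitude = -8.0, 2.0
count = 0, 1000
area = 0, 500
distance = 0, 100
ratio = 0, 10
dissolve = 0, 1

[HISTOGRAMS]
bins_count = 0, 10, 100, 1000
bins_ratio = 0, 1, 2, 5, 10

[SIBLING_DATA]
folder = data/siblings

    def parse_range_str(self, range_str):
        # Hypothetical: "a, b" or "a, b, c, ..." -> list of floats.
        return [float(value) for value in range_str.split(',')]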
Example #5
# Earlier imports in this excerpt (logging and the aliases cu, fu, tdu, au,
# plus TweetDataPreProcessing) are truncated from the listing.
from utils import user_profile_utilities as upu
from utils import widget_utilities as wu

logger = logging.getLogger()

tweet_spatial_analysis_config = cu.TweetSpatialAnalysisConfig(
    "Tweet-Spatial-Analysis/conf/tweet_spatial_analysis.ini")
logger.info(tweet_spatial_analysis_config)

pre_processor = TweetDataPreProcessing(None)
pre_processor.read_from_json(
    "Tweet-Spatial-Analysis/data/tweet_mean_all.json",
    "Tweet-Spatial-Analysis/data/tweets_median_working.json",
    "Tweet-Spatial-Analysis/data/tweets_median_non_working.json")

file_open = fu.FileOpen("Tweet-Spatial-Analysis/data", "user-info.csv")
user_info = upu.UserProfileDetails(file_open)
user_info.process()

tweet_data_controller = tdu.TweetDataController(pre_processor,
                                                tweet_spatial_analysis_config,
                                                user_info)
map_widgets = wu.MapWidgets(tweet_data_controller,
                            tweet_spatial_analysis_config)
tweet_data_controller.selection_details = map_widgets.text_selection_details

east_min, north_min = au.lon_lat_to_east_north(
    tweet_spatial_analysis_config.longitude[0],
    tweet_spatial_analysis_config.latitude[0])
east_max, north_max = au.lon_lat_to_east_north(
    tweet_spatial_analysis_config.longitude[1],
    tweet_spatial_analysis_config.latitude[1])
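
The au.lon_lat_to_east_north conversion used for the map bounds is not shown
here. A sketch using pyproj, assuming Web Mercator (EPSG:3857) as the target
projection, which is common for web map widgets but is only a guess at the
project's actual choice:

from pyproj import Transformer

_TO_EAST_NORTH = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)


def lon_lat_to_east_north(lon, lat):
    # always_xy=True keeps the (lon, lat) in / (east, north) out axis order.
    east, north = _TO_EAST_NORTH.transform(lon, lat)
    return east, north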
Example #6
def build_overlap_matrix(rows, filename='./overlap_matrix.csv'):
    # Build the pairwise ellipse-overlap matrix for the first `rows` tweets
    # (all rows when rows == 0), then per-component shortest-path distance
    # matrices. Module-level imports (numpy as np, pandas as pd, mpi4py.MPI,
    # timeit, math, and the project helpers fu, cu, au) are assumed.
    file_open = fu.FileOpen("data", "tweet-data.csv")
    tweet_spatial_analysis_config = cu.TweetSpatialAnalysisConfig("conf/tweet_spatial_analysis.ini")
    tdp = TweetDataPreProcessing(file_open, tweet_spatial_analysis_config)
    tdp.read_from_json("data/tweet_mean_all.json",
                       "data/tweets_median_working.json",
                       "data/tweets_median_non_working.json")
    df = tdp.tweet_data_working.df
    start_time = timeit.default_timer()
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    if rows == 0:
        rows, _ = df.shape
    if rank == 0:
        tweet_data_working_overlap = np.zeros((rows, rows))
        for i in range(rows):
            for j in range(rows):
                if i == j:
                    tweet_data_working_overlap[i, j] = 0
                elif j < i:
                    # A zero semi-axis marks a degenerate ellipse; flag the
                    # pair with -1 instead of testing it for overlap.
                    if df['a'][i] == 0 or df['b'][i] == 0 or df['a'][j] == 0 or df['b'][j] == 0:
                        tweet_data_working_overlap[i, j] = -1
                    else:
                        tweet_data_working_overlap[i, j] = au.are_two_ellipses_overlapping(
                            df['x'][i], df['y'][i], df['a'][i], df['b'][i], df['angle'][i],
                            df['x'][j], df['y'][j], df['a'][j], df['b'][j], df['angle'][j],
                        )
            if (i + 1) % 10 == 0:
                print('CPU {:04d}: {:6.2f}% accomplished'.format(rank,  (i + 1) / rows * 100))

        for i in range(rows):
            tweet_data_working_overlap[0:i, i] = tweet_data_working_overlap[i, 0:i]
        print('CPU {:04d}: Assigned transposed tril to triu (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))

        tweet_data_working_overlap = pd.DataFrame(data=tweet_data_working_overlap, columns=df['id'][0:rows], index=df['id'][0:rows])
        print('CPU {:04d}: Converted numpy array to pandas DataFrame (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))

        data = find_components(tweet_data_working_overlap, filename, start_time)
        df, components, idx, col, idx_list, col_list = data
        number_of_components = len(components)

        df_dist_list = []
        for i in range(number_of_components):
            D = APD_recursive(np.array(df.loc[idx_list[i],col_list[i]]))
            df_dist_list.append(pd.DataFrame(data=D, columns=col_list[i], index=idx_list[i]))


        print('CPU {:04d}: Constructing distance pandas DataFrame (Time: {:.2f} seconds)'.format(0, timeit.default_timer() - start_time))
        df_dist = np.ones(df.shape) * -1
        df_dist = pd.DataFrame(data=df_dist, columns=col, index=idx)
        # print(len(idx_list), len(df_dist_list))

        for i in range(len(idx_list)):
            idx_local = df_dist_list[i].index
            col_local = df_dist_list[i].columns
            df_dist.loc[idx_local, col_local] = df_dist_list[i]

        filename_dis = filename.replace('.csv', '_distance.csv')

        print('CPU {:04d}: Now saving pandas DataFrame to {:s} (Time: {:.2f} seconds)'.format(0, filename_dis, timeit.default_timer() - start_time))
        df_dist = df_dist.astype(np.int8)
        df_dist.to_csv(filename_dis, sep=',', header=True, index=True)

        elapsed_time = timeit.default_timer() - start_time
        hour = math.floor(elapsed_time / 3600)
        minute = math.floor((elapsed_time - hour * 3600) / 60)
        second = elapsed_time - 3600 * hour - 60 * minute
        print('Time elapsed: {:d} hours {:d} minutes {:.2f} seconds'.format(hour, minute, second))
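
APD_recursive is not shown in this listing. Its name and the fact that it is
applied to one connected component of the overlap matrix at a time suggest
Seidel's all-pairs-distances (APD) algorithm for unweighted, undirected
graphs. A minimal sketch, assuming a connected 0/1 adjacency matrix:

import numpy as np


def apd_seidel(A):
    # Seidel's APD: returns the matrix of shortest-path distances for a
    # connected, unweighted, undirected graph with 0/1 adjacency matrix A.
    A = np.asarray(A, dtype=np.int64)
    n = A.shape[0]
    Z = A @ A
    # B marks pairs of distinct vertices at distance 1 or 2.
    B = (((A == 1) | (Z > 0)) & ~np.eye(n, dtype=bool)).astype(np.int64)
    if B.sum() == n * (n - 1):      # B is complete: every distance is 1 or 2
        return 2 * B - A
    T = apd_seidel(B)               # distances in the "squared" graph
    X = T @ A
    deg = A.sum(axis=0)
    # d_ij = 2*t_ij, minus 1 where the parity test x_ij < t_ij * deg(j) holds.
    return 2 * T - (X < T * deg)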
Example #7
def build_overlap_matrix_parallel(rows, filename='./overlap_matrix_parallel_block.csv'):
    # Master-worker MPI variant: rank 0 farms out ellipse pairs (phase 1) and
    # per-component distance computations (phase 2) to the other ranks; the
    # message tag encodes which work item a result belongs to.
    start_time = timeit.default_timer()
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    status = MPI.Status()
    data = []
    n_indices = 0

    if rank == 0:
        file_open = fu.FileOpen("data", "tweet-data.csv")
        tweet_spatial_analysis_config = cu.TweetSpatialAnalysisConfig("conf/tweet_spatial_analysis.ini")
        tdp = TweetDataPreProcessing(file_open, tweet_spatial_analysis_config)
        tdp.read_from_json("data/tweet_mean_all.json",
                           "data/tweets_median_working.json",
                           "data/tweets_median_non_working.json")
        df = tdp.tweet_data_working.df
        if rows == 0:
            rows, _ = df.shape
        mpi4py.get_config()
        np.__config__.show()
        tweet_data_working_overlap = np.zeros((rows, rows))
        indices = [(i, j) for i in range(rows) for j in range(rows) if j < i]
        n_indices = len(indices)
        n_indices = comm.bcast(n_indices, root=0)
        n_sent = 0
        for k in range(min(size - 1, n_indices)):
            i, j = indices[n_sent]
            data =  i, j, \
                    df['x'][i], df['y'][i], df['a'][i], df['b'][i], df['angle'][i], \
                    df['x'][j], df['y'][j], df['a'][j], df['b'][j], df['angle'][j]
            comm.send(data, dest=n_sent + 1, tag=n_sent)
            n_sent += 1

        for k in range(n_indices):
            overlap = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
            src = status.Get_source()
            tag = status.Get_tag()

            i, j = indices[tag]
            tweet_data_working_overlap[i, j] = overlap
            if n_sent < n_indices:

                i, j = indices[n_sent]
                data =  i, j, \
                        df['x'][i], df['y'][i], df['a'][i], df['b'][i], df['angle'][i], \
                        df['x'][j], df['y'][j], df['a'][j], df['b'][j], df['angle'][j]
                comm.send(data, dest=src, tag=n_sent)
                n_sent += 1
            else:
                # No work left: reply with a sentinel tag that tells the
                # worker to stop (the payload itself is ignored).
                comm.send(indices[0], dest=src, tag=60000)
            # print('CPU {:04d}: Received data from CPU {:04d} (Time: {:.2f} seconds)'.format(rank,  src, timeit.default_timer() - start_time))

            # if ((k + 1) / n_indices * 100) % 10 < 0.01:
            if (k + 1) % 1000000 == 0:
                print('CPU {:04d}: {:6.2f}% accomplished (Time: {:.2f} seconds)'.format(rank,  (k + 1) / n_indices * 100, timeit.default_timer() - start_time))

        tweet_data_working_overlap = tweet_data_working_overlap + tweet_data_working_overlap.T
        print('CPU {:04d}: Merged received data to global numpy array (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))
        tweet_data_working_overlap = pd.DataFrame(data=tweet_data_working_overlap, columns=df['id'][0:rows], index=df['id'][0:rows])
        print('CPU {:04d}: Converted numpy array to pandas DataFrame (Time: {:.2f} seconds)'.format(rank, timeit.default_timer() - start_time))

        data = find_components(tweet_data_working_overlap, filename, start_time)
        df, components, idx, col, idx_list, col_list = data
        number_of_components = len(components)
        number_of_components = comm.bcast(number_of_components, root=0)
        print('CPU {:04d}: Found {:d} components (Time: {:.2f} seconds)'.format(rank, number_of_components, timeit.default_timer() - start_time))

        nsent = 0
        for i in range(min(size - 1, number_of_components)):
            comm.send(df.loc[idx_list[i],col_list[i]], dest=i + 1, tag=i)
            nsent += 1

        df_dist_list = []
        for i in range(number_of_components):
            D = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
            src = status.Get_source()
            tag = status.Get_tag()

            # print('CPU {:04d}: Received data from CPU {:04d} (Time: {:.2f} seconds)'.format(rank, src, timeit.default_timer() - start_time))
            df_dist_list.append(pd.DataFrame(data=D, columns=col_list[tag], index=idx_list[tag]))

            if nsent < number_of_components:
                comm.send(df.loc[idx_list[nsent],col_list[nsent]], dest=src, tag=nsent)
            else:
                comm.send([], dest=src, tag=number_of_components + 10)

            nsent += 1

        print('CPU {:04d}: Constructing distance pandas DataFrame (Time: {:.2f} seconds)'.format(0, timeit.default_timer() - start_time))
        df_dist = np.ones(df.shape) * -1
        df_dist = pd.DataFrame(data=df_dist, columns=col, index=idx)
        # print(len(idx_list), len(df_dist_list))

        for i in range(len(idx_list)):
            idx_local = df_dist_list[i].index
            col_local = df_dist_list[i].columns
            df_dist.loc[idx_local, col_local] = df_dist_list[i]

        filename_dis = filename.replace('.csv', '_distance.csv')

        print('CPU {:04d}: Now saving pandas DataFrame to {:s} (Time: {:.2f} seconds)'.format(0, filename_dis, timeit.default_timer() - start_time))
        df_dist = df_dist.astype(np.int8)
        df_dist.to_csv(filename_dis, sep=',', header=True, index=True)

        elapsed_time = timeit.default_timer() - start_time
        hour = math.floor(elapsed_time / 3600)
        minute = math.floor((elapsed_time - hour * 3600) / 60)
        second = elapsed_time - 3600 * hour - 60 * minute
        print('Time elapsed: {:d} hours {:d} minutes {:.2f} seconds'.format(hour, minute, second))

    if rank > 0:
        n_indices = comm.bcast(n_indices, root=0)
        if rank <= n_indices:  # ranks 1..min(size - 1, n_indices) were assigned work
            while True:
                data = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
                tag = status.Get_tag()
                if tag == 60000:
                    break
                i, j, x1, y1, a1, b1, t1, x2, y2, a2, b2, t2 = data
                if i == j:
                    overlap = 0
                elif j < i:
                    if a1 == 0 or b1 == 0 or a2 == 0 or b2 == 0:
                        overlap = -1
                    else:
                        overlap = au.are_two_ellipses_overlapping(x1, y1, a1, b1, t1, x2, y2, a2, b2, t2)
                comm.send(overlap, dest=0, tag=tag)
                # print('CPU {:04d}: Sent data to CPU {:04d}'.format(rank,  0))

        number_of_components = 0
        number_of_components = comm.bcast(number_of_components, root=0)
        if rank <= number_of_components:  # only ranks that were sent a component participate
            while True:
                A = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
                tag = status.Get_tag()
                if tag == number_of_components + 10:
                    break
                m, n = A.shape
                if m > 1:
                    print('CPU {:04d}: Starting APD for {:d}-by-{:d} matrix (Time: {:.2f} seconds)'.format(rank, m, n, timeit.default_timer() - start_time))
                D = APD_recursive(np.array(A))
                comm.send(D, dest=0, tag=tag)
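
Both builders assume an MPI launch, for example (script name hypothetical):

mpiexec -n 16 python build_overlap.py

With a single rank there are no workers, so rank 0 would block in its first
receive loop; the serial build_overlap_matrix is the single-process fallback.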