Example #1
def process_query_results(query_results, base_map):

    in_shape_ids = []
    incoming_flow = {}
    out_shape_ids = []
    outgoing_flow = {}

    for itinerary in query_results:
        origin_id = Utils.convert_id(itinerary[0])
        destination_id = Utils.convert_id(itinerary[1])
        weight = itinerary[2]
        origin_ids = []
        destination_ids = []
        # we keep only itineraries whose endpoints both belong to rendered shapes
        if origin_id in base_map.shape_dict and destination_id in base_map.shape_dict:
            shape_origin = base_map.shape_dict[origin_id]
            origin_ids.append(origin_id)
            shape_destination = base_map.shape_dict[destination_id]
            destination_ids.append(destination_id)

        # We build a dictionary of outgoing traffic
        for origin_id in origin_ids:
            if origin_id not in out_shape_ids:
                out_shape_ids.append(origin_id)
                outgoing_flow[shape_origin] = []
            outgoing_flow[shape_origin].append((shape_destination, weight))
        # We build a dictionary of incoming traffic
        for destination_id in destination_ids:
            if destination_id not in in_shape_ids:
                in_shape_ids.append(destination_id)
                incoming_flow[shape_destination] = []
            incoming_flow[shape_destination].append((shape_origin, weight))

    return outgoing_flow, incoming_flow
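
The accumulation pattern above can be written more compactly with collections.defaultdict; the following stand-alone sketch (toy ids, not the source data) shows the same flow-dictionary construction:

from collections import defaultdict

outgoing_flow = defaultdict(list)
incoming_flow = defaultdict(list)
for origin, destination, weight in [('A', 'B', 3), ('A', 'C', 5), ('B', 'C', 2)]:
    outgoing_flow[origin].append((destination, weight))
    incoming_flow[destination].append((origin, weight))
print(dict(outgoing_flow))  # {'A': [('B', 3), ('C', 5)], 'B': [('C', 2)]}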
Example #2
    def get_shape_coords(self):

        shape_zone = self.shapefile.shape(self.shape_id)
        points = [(i[0], i[1]) for i in shape_zone.points]
        x_center, y_center = Utils.calculate_centroid(points)
        center = (x_center, y_center)
        max_bound, min_bound = Utils.calculate_boundaries(points)

        return (points, center, max_bound, min_bound)
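
The example relies on two Utils helpers whose implementations are not shown; a minimal sketch of their assumed behavior (centroid as the mean point, boundaries as the per-axis max/min, matching how Example #3 consumes them) is:

def calculate_centroid(points):
    # assumed: the centroid is the arithmetic mean of the point coordinates
    xs, ys = zip(*points)
    return sum(xs) / len(xs), sum(ys) / len(ys)

def calculate_boundaries(points):
    # assumed: returns (max_bound, min_bound) as per-axis extremes
    xs, ys = zip(*points)
    return (max(xs), max(ys)), (min(xs), min(ys))

print(calculate_centroid([(0, 0), (2, 0), (2, 2), (0, 2)]))  # (1.0, 1.0)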
Example #3
    def find_max_coords(self):

        all_max_bound = []
        all_min_bound = []
        shape_dict = self.shape_dict
        for zone_id in shape_dict:
            zone_shape = shape_dict[zone_id]
            max_bound_zone = zone_shape.max_bound
            min_bound_zone = zone_shape.min_bound
            all_max_bound.append(max_bound_zone)
            all_min_bound.append(min_bound_zone)

        map_max_bound, unused_max = Utils.calculate_boundaries(all_max_bound)
        unused_min, map_min_bound = Utils.calculate_boundaries(all_min_bound)

        return (map_max_bound, map_min_bound)
Example #4
    def get_user_articles(self, user_id):
        '''
        INPUT:
        user_id - (int) a user id
        self.user_item - (pandas dataframe) matrix of users by articles:
                    1's when a user has interacted with an article, 0 otherwise

        OUTPUT:
        article_ids - (list) a list of the article ids seen by the user
        article_names - (list) a list of article names associated with the list of article ids
                        (this is identified by the doc_full_name column in df_content)

        Description:
        Provides a list of the article_ids and article titles that have been seen by a user
        '''
        user_row = np.where(self.user_item.index == user_id)[0][0]
        user_articles = np.where(self.user_item.iloc[user_row] == 1)[0]
        article_ids = []

        for article in user_articles:
            article_id = self.user_item.iloc[:, article].name
            article_ids.append(str(article_id))  # to match the expected str type as output

        article_names = Utils.get_article_names(article_ids, self.interactions_df, 'title')

        return article_ids, article_names  # return the ids and names
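
The row and column lookups in isolation: np.where on the index locates the user's row, and .iloc[:, j].name recovers the article id from the column position (toy matrix, illustrative ids):

import numpy as np
import pandas as pd

user_item = pd.DataFrame([[1, 0, 1], [0, 1, 0]], index=[10, 20], columns=[1.0, 2.0, 3.0])
user_row = np.where(user_item.index == 10)[0][0]
user_articles = np.where(user_item.iloc[user_row] == 1)[0]
print([str(user_item.iloc[:, a].name) for a in user_articles])  # ['1.0', '3.0']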
Example #5
    def project_shape_coords(self, projection):

        shape_zone = self.shapefile.shape(self.shape_id)
        points = [
            projection.apply_projection([i[0], i[1]])
            for i in shape_zone.points
        ]
        points = [projection.apply_translation([i[0], i[1]]) for i in points]
        self.points = points

        x_center, y_center = Utils.calculate_centroid(points)
        self.center = (x_center, y_center)

        max_bound, min_bound = Utils.calculate_boundaries(points)
        self.max_bound = max_bound
        self.min_bound = min_bound
Example #6
    def make_user_user_recommendations(self, user_id, num_recommendations=10):
        '''
        INPUT:
        user_id - (int) a user id
        num_recommendations - (int) the number of recommendations you want for the user

        OUTPUT:
        recs - (list) a list of recommendations for the user by article id
        rec_names - (list) a list of recommendations for the user by article title

        Description:
        Loops through the users based on closeness to the input user_id
        For each user - finds articles the user hasn't seen before and provides them as recs
        Does this until num_recommendations recommendations are found

        Notes:
        * Choose the users that have the most total article interactions
        before choosing those with fewer article interactions.

        * Choose the articles with the most total interactions
        before choosing those with fewer total interactions.

        '''

        recs = []

        neighbors_df = self.get_top_sorted_users(user_id)
        user_articles_id, user_articles_names = self.get_user_articles(user_id)

        for neighbor in neighbors_df.index:
            neighbor_articles_id, neighbor_articles_names = self.get_user_articles(neighbor)
            sorted_neighbor_article_ids = Utils.get_top_articles_df(neighbor_articles_id, self.interactions_df)
            sorted_neighbor_article_ids = sorted_neighbor_article_ids.index.values
            unread = set(np.setdiff1d(sorted_neighbor_article_ids, user_articles_id, assume_unique=True))
            # walk the neighbor's articles in popularity order; np.unique would
            # re-sort recs and lose the most-interacted-first ordering
            for article_id in sorted_neighbor_article_ids:
                if article_id in unread and str(article_id) not in recs:
                    recs.append(str(article_id))

            if len(recs) >= num_recommendations:
                break

        recs = recs[:num_recommendations]

        recommended_articles = Utils.get_article_names(recs, self.interactions_df, 'title')

        return recommended_articles
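
The unseen-article step in isolation: np.setdiff1d keeps the neighbor's articles that the target user has not read (toy ids, illustrative only). Note that its result comes back sorted, which is why the loop above walks the neighbor's list in popularity order instead of using the raw difference:

import numpy as np

neighbor_articles = np.array(['12.0', '3.0', '40.0'])  # neighbor's reads, popularity order
user_articles = np.array(['3.0'])                      # already read by the user
unread = np.setdiff1d(neighbor_articles, user_articles, assume_unique=True)
print(unread)  # ['12.0' '40.0'] -- sorted lexically, not by popularity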
Example #7
    def make_SVD_recommendations(self, user_id, num_recommendations=10):
        # reconstruct the interaction matrix from the SVD factors, then rank
        # articles by the predicted values in the requested user's row
        preds = np.around(np.dot(np.dot(self.u_matrix, self.s_matrix), self.vt_matrix))
        user_row = np.where(self.user_item.index == user_id)[0][0]
        articles_idx = preds[user_row].argsort()[-num_recommendations:][::-1]

        rec_ids = self.user_item.columns[articles_idx]
        recommended_articles = Utils.get_article_names(rec_ids, self.interactions_df, 'title')
        recommended_articles = recommended_articles[:num_recommendations]

        return recommended_articles
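
A self-contained toy run of the same idea, assuming u_matrix, s_matrix and vt_matrix come from a standard SVD of the user-item matrix (np.linalg.svd returns the singular values as a vector, hence np.diag):

import numpy as np

user_item = np.array([[1., 0., 1.], [0., 1., 1.]])
u, s, vt = np.linalg.svd(user_item, full_matrices=False)
preds = np.around(np.dot(np.dot(u, np.diag(s)), vt))  # reconstructed interactions
top_articles = preds[0].argsort()[-2:][::-1]          # top 2 column indices for user 0
print(top_articles)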
Example #8
def process_query_results(query_results_dict, map_item):

    processed_query_results_dict = {}
    # we find the min and max passengers for the whole year
    min_passenger = float('inf')
    max_passenger = 0
    for query_date in query_results_dict:
        temp_min, temp_max = Utils.compute_min_max_passengers(query_results_dict[query_date], 2)
        if temp_min < min_passenger:
            min_passenger = temp_min
        if temp_max > max_passenger:
            max_passenger = temp_max

    # we transform the query_results_dict to use instances of the PointOnMap class
    for query_date in query_results_dict:
        query_result = query_results_dict[query_date]
        processed_query_results_dict[query_date] = []
        for itinerary in query_result:
            processed_itinerary = []
            zone_id_origin = Utils.convert_id(itinerary[0])
            zone_id_destination = Utils.convert_id(itinerary[1])
            if zone_id_origin == zone_id_destination:
                color = (141, 91, 67)
            else:
                color = (135, 162, 34)

            weight = compute_weight(map_item[0], itinerary[2], max_passenger)

            shape_origin = map_item[1].shape_dict[zone_id_origin]
            coords = shape_origin.center
            point_to_render = classfile.PointOnMap(coords, weight, color)
            processed_itinerary.append(point_to_render)

            shape_dest = map_item[1].shape_dict[zone_id_destination]
            target_coords = shape_dest.center
            processed_itinerary.append(target_coords)

            processed_itinerary.append(weight)
            processed_query_results_dict[query_date].append(processed_itinerary)

    return processed_query_results_dict, min_passenger, max_passenger
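
The first pass in isolation, assuming Utils.compute_min_max_passengers returns the (min, max) of the given column over a list of itineraries (toy data; float('inf') plays the role of the sentinel):

results_by_date = {
    '2018-01-01': [('001', '002', 40), ('002', '003', 75)],
    '2018-01-02': [('001', '003', 90)],
}
min_passenger, max_passenger = float('inf'), 0
for query_date in results_by_date:
    rows = results_by_date[query_date]
    temp_min = min(row[2] for row in rows)
    temp_max = max(row[2] for row in rows)
    min_passenger = min(min_passenger, temp_min)
    max_passenger = max(max_passenger, temp_max)
print(min_passenger, max_passenger)  # 40 90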
Example #9
def render_single_map(flow_dict, flow_dir, base_map, file_name, zone_shape):

    map_rendered = base_map.map_file.copy()
    zone_name = find_names(zone_shape, base_map)
    zone_id = Utils.convert_id(zone_shape.shape_id, inverse=True)
    map_title = '{}_{}_{}_{}_{}'.format(file_name[0], zone_id, zone_name,
                                        flow_dir, file_name[1])
    trips_list = flow_dict[zone_shape]
    min_passenger, max_passenger = Utils.compute_min_max_passengers(
        trips_list, 1)

    colors = []
    for linked_zone in trips_list:
        shape_to_color = linked_zone[0]
        if shape_to_color.shape_id != zone_shape.shape_id:
            weight = linked_zone[1]
            render_color = compute_color(weight, min_passenger, max_passenger)
            shape_to_color.color_fill = render_color
            if render_color not in colors:
                colors.append(render_color)
            shape_to_color.fill_in_shape(map_rendered)
            # we draw again the boundaries of the shape after filling it in
            pts = np.array(shape_to_color.points, np.int32)
            cv2.polylines(map_rendered, [pts], True, (255, 255, 255), 1,
                          cv2.LINE_AA)

    # outline the focused shape
    zone_shape.color_line = [95, 240, 255]
    zone_shape.line_thick = 3
    pts = np.array(zone_shape.points, np.int32)
    cv2.polylines(map_rendered, [pts], True, zone_shape.color_line,
                  zone_shape.line_thick, cv2.LINE_AA)
    # display the legend
    display_specific_text(map_rendered, zone_id, zone_name, flow_dir,
                          min_passenger, max_passenger, colors)

    # save the image
    cv2.imwrite('{}.png'.format(map_title), map_rendered)
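
The drawing calls in isolation: cv2.polylines traces a closed anti-aliased outline from an int32 point array (a minimal sketch on a blank canvas):

import numpy as np
import cv2

canvas = np.zeros((100, 100, 3), np.uint8)
pts = np.array([[10, 10], [90, 10], [50, 90]], np.int32)
cv2.polylines(canvas, [pts], True, (95, 240, 255), 3, cv2.LINE_AA)
cv2.imwrite('outline.png', canvas)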
Example #10
    def create_test_and_train_user_item(self):
        '''
        INPUT:
        self.interactions_df - the interactions dataframe, split chronologically
                               into df_train (first 70%) and df_test (last 30%)

        OUTPUT:
        user_item_train - a user-item matrix of the training dataframe
                          (unique users for each row and unique articles for each column)
        user_item_test - a user-item matrix of the testing dataframe
                         (unique users for each row and unique articles for each column)

        '''
        num_interactions = len(self.interactions_df)
        len_train = int(70*num_interactions/100)  # 70% of the df for train
        len_test = num_interactions - len_train  # 30% of the df for test
        df_train = self.interactions_df.head(len_train)
        df_test = self.interactions_df.tail(len_test)

        # we reuse the create_user_item_matrix we defined earlier
        user_item_train = Utils.create_user_item_matrix(df_train)
        user_item_test = Utils.create_user_item_matrix(df_test)

        return (user_item_train, user_item_test)
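
The chronological 70/30 split in isolation, on a toy frame (head/tail keeps row order, so earlier interactions land in train):

import pandas as pd

interactions_df = pd.DataFrame({'user_id': range(10), 'article_id': range(10)})
len_train = int(70 * len(interactions_df) / 100)
df_train = interactions_df.head(len_train)
df_test = interactions_df.tail(len(interactions_df) - len_train)
assert len(df_train) == 7 and len(df_test) == 3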
Example #11
    def NLP_processing(self, text):
        # remove NaN from text
        text = Utils.remove_NaN(text, 0)

        # initialize count vectorizer object
        vect = CountVectorizer(lowercase=False, tokenizer=self.tokenize)
        # get counts of each token (word) in the cleaned text data
        X = vect.fit_transform(text)

        # initialize tf-idf transformer object
        transformer = TfidfTransformer(smooth_idf=False)
        # use counts from count vectorizer results to compute tf-idf values
        tfidf = transformer.fit_transform(X)

        return tfidf
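
The same count-then-tf-idf pipeline on toy documents, without the custom tokenizer (a minimal, runnable sketch):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ['machine learning with python', 'python for data science']
counts = CountVectorizer(lowercase=False).fit_transform(docs)
tfidf = TfidfTransformer(smooth_idf=False).fit_transform(counts)
print(tfidf.shape)  # (2, number of distinct tokens)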
Example #12
    def make_content_user_recommendations(self, _id, num_recommendations=10):
        '''
        INPUT:
        _id, the id of the user we want recommended articles for
        self.content_similarity_matrix, the similarity matrix of the articles, by default cosine matrix computed separately
        self.interactions_df, the dataframe with the interactions of users with articles
        self.article_content_df, the df containing details about the articles
        num_recommendations, the number of recommendations expected as an output, by default 10

        OUTPUT:
        recommended_articles, a list of recommended articles, given by name
        '''

        # get the articles a user read
        user_articles_id, user_articles_names = self.get_user_articles(_id)

        # filter out the articles that are not in the df of article details
        user_articles_id = [float(i) for i in user_articles_id]
        user_articles = self.article_content_df[self.article_content_df['article_id'].isin(user_articles_id)]['article_id'].values

        # sort the articles_id per number of interactions
        user_article_inter_dict = {}
        for article in user_articles:
            interact = len(self.interactions_df[(self.interactions_df['user_id'] == _id) & (self.interactions_df['article_id'] == article)])
            article_title = self.interactions_df[self.interactions_df['article_id'] == article]['title'].values[0]
            user_article_inter_dict[article] = {'num_interactions': interact, 'title': article_title}

        top_user_articles_df = pd.DataFrame.from_dict(user_article_inter_dict, orient='index')
        top_user_articles_df = top_user_articles_df.sort_values(by='num_interactions', ascending=False)

        # find similar articles in order
        recommended_articles = []
        for article in top_user_articles_df.index:
            articles_sim = self.find_similar_articles(article)
            unread_articles = np.setdiff1d(articles_sim, top_user_articles_df.index, assume_unique=True)
            for unread_article in unread_articles:
                if unread_article not in recommended_articles:
                    recommended_articles.append(unread_article)

            if len(recommended_articles) >= num_recommendations:
                break

        recommended_articles = recommended_articles[:num_recommendations]
        recommended_articles = Utils.get_article_names(recommended_articles, self.article_content_df, 'doc_full_name')

        return recommended_articles
Example #13
    def make_content_article_recommendations(self, _id, num_recommendations=10):
        '''
        INPUT:
        _id, the id of the article we want similar articles for
        self.content_similarity_matrix, the similarity matrix of the articles, by default cosine matrix computed separately
        self.interactions_df, the dataframe with the interactions of users with articles
        self.article_content_df, the df containing details about the articles
        num_recommendations, the number of recommendations expected as an output, by default 10

        OUTPUT:
        recommended_articles, a list of similar articles, given by name
        '''

        recommended_articles = self.find_similar_articles(_id)
        recommended_articles = recommended_articles[:num_recommendations]
        recommended_articles = Utils.get_article_names(recommended_articles, self.article_content_df, 'doc_full_name')

        return recommended_articles
Example #14
    def __init__(self, interactions_df):
        self.interactions_df = interactions_df
        self.article_ids = self.interactions_df['article_id'].unique()
        self.top_articles_df = Utils.get_top_articles_df(self.article_ids, self.interactions_df)
Example #15
def process_query_arg(render_animation_dict):
    period = render_animation_dict['period']
    query_dict = render_animation_dict['query_dict']
    database = render_animation_dict['database']
    specific_weekdays = query_dict['specific_weekdays']
    aggregate_period = render_animation_dict['aggregate_period']
    weekdays = render_animation_dict['weekdays']

    query_results_dict = {}

    if aggregate_period is False and query_dict['date'] == 'loop_through_period':
        # in this case we want the result for each day of the period provided
        # if we have the flag loop_through_period in the query dict, it means the period
        # set for the query is multiple dates

        daterange = pd.date_range(period[0], period[1])

        # we run queries for each date in the daterange specified
        for single_date in daterange:
            date = pd.to_datetime(single_date)

            if specific_weekdays == 'on_specific_weekdays':

                # we check if the date of the daterange matches the weekday(s) we target
                if date.dayofweek in weekdays:
                    single_date = date.date().strftime('%Y-%m-%d')
                    query_dict['date'] = single_date
                    query = prepare_sql_query(query_dict)
                    query_results = Utils.make_sql_query(query, database)
                    query_results_dict[query_dict['date']] = query_results

                else:
                    # if a date in the range is not among the weekdays we want, we skip it
                    continue
            else:
                single_date = date.date().strftime('%Y-%m-%d')
                query_dict['date'] = single_date
                query = prepare_sql_query(query_dict)
                query_results = Utils.make_sql_query(query, database)
                query_results_dict[query_dict['date']] = query_results

    elif aggregate_period is True and query_dict['date'] == 'loop_through_period':
        # in this case, we want to aggregate the results (sum) per week
        daterange = pd.date_range(period[0], period[1])
        start_date = pd.to_datetime(period[0])
        end_date = pd.to_datetime(period[1])

        # let's build a list of all intervals we will want to aggregate the data for
        all_aggr_init = []
        start = start_date
        end = end_date

        # we add one list of dates per week to the list of all intervals
        i = 0
        for date in daterange:
            # we handle separately the first date of the period
            if i == 0:
                curr_week = [start.date().strftime('%Y-%m-%d')]

            if date != start_date and date != end_date:
                start_week_number = start.isocalendar()[1]
                date_week_number = date.isocalendar()[1]

                if date_week_number == start_week_number:
                    curr_week.append(date.date().strftime('%Y-%m-%d'))
                    i += 1
                else:
                    start = date
                    all_aggr_init.append(curr_week)
                    i = 0

        # we handle separately the last date of the period
        if curr_week not in all_aggr_init:
            curr_week.append(end_date.date().strftime('%Y-%m-%d'))
            all_aggr_init.append(curr_week)
        else:
            curr_week = [end_date.date().strftime('%Y-%m-%d')]
            all_aggr_init.append(curr_week)

        # now we keep only the first and last item of each interval

        all_aggr = []
        for interval in all_aggr_init:
            interval_new = [interval[0], interval[-1]]
            all_aggr.append(interval_new)

        # we now query for each interval
        for interval in all_aggr:
            query_dict['date'] = interval
            query = prepare_sql_query(query_dict)
            query_results = Utils.make_sql_query(query, database)
            query_results_dict[query_dict['date'][0]] = query_results

    else:
        # we have a single date to render for, so nothing to aggregate!
        # just in case we check that there is no mismatch between the single day and the
        # argument containing specific weekdays restrictions if any
        if specific_weekdays == 'on_specific_weekdays':

            # we check if the date of the daterange matches the weekday(s) we target
            date = pd.to_datetime(query_dict['date'])

            if date.dayofweek in weekdays:
                query = prepare_sql_query(query_dict)
                query_results = Utils.make_sql_query(query, database)
                query_results_dict[query_dict['date']] = query_results

            else:
                print("The date selected does not match the weekday(s) indicated. Please select either an interval ('time_granularity': 'period') or a valid weekday(s) list.")

        else:
            query = prepare_sql_query(query_dict)
            query_results = Utils.make_sql_query(query, database)
            query_results_dict[query_dict['date']] = query_results

    return query_results_dict
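
The week-bucketing logic above can be checked against a more direct grouping by ISO week number; this stand-alone sketch is an alternative formulation, not the source's method:

import pandas as pd

daterange = pd.date_range('2018-01-01', '2018-01-20')
weeks = {}
for date in daterange:
    weeks.setdefault(date.isocalendar()[1], []).append(date.strftime('%Y-%m-%d'))
# keep only the first and last day of each week, as the query intervals
all_aggr = [[days[0], days[-1]] for days in weeks.values()]
print(all_aggr)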
Example #16
import os
import argparse
from utility import Utils

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Downloading ncodes for imagenet dataset')

    path_to_save = os.path.realpath(os.path.dirname(__file__))

    optional = parser.add_argument_group('optional arguments')
    optional.add_argument('-nc_dir',
                          '--ncodes_dir',
                          default=path_to_save,
                          help='path to saved ncodes data')
    optional.add_argument('-url_data',
                          '--with_url_data',
                          default=False,
                          help='tag whether to download url data')

    args = parser.parse_args()
    util_obj = Utils()

    # download ncodes data csv
    util_obj.download_ncodes_image_net(path_to_save=args.ncodes_dir)

    # download imagenet urls csv, only when the --with_url_data flag is set
    args.with_url_data = eval(str(args.with_url_data))
    if args.with_url_data:
        util_obj.download_image_net_urls(path_to_save=args.ncodes_dir)
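
One design note: eval(str(...)) does turn the CLI string 'False' into the bool False, but an explicit parse avoids executing arbitrary argument text. A minimal sketch of a safer equivalent:

def str_to_bool(value):
    # accept bools or their common string spellings from the command line
    return str(value).strip().lower() in ('true', '1', 'yes')

assert str_to_bool('False') is False
assert str_to_bool(True) is True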
Example #17
import pandas as pd

from classfile import RecommendationEngine
from utility import Utils

# import and clean data sources
interactions_df = pd.read_csv('data/user-item-interactions.csv')
article_content_df = pd.read_csv('data/articles_community.csv')
del interactions_df['Unnamed: 0']
del article_content_df['Unnamed: 0']

email_encoded = Utils.email_mapper(interactions_df['email'])
del interactions_df['email']
interactions_df['user_id'] = email_encoded

# create a matrix of user-article interactions
user_item = Utils.create_user_item_matrix(interactions_df)

# create an instance of the Recommendation Engine that can be used for multiple situations
rec_engine = RecommendationEngine(interactions_df, article_content_df,
                                  user_item)

# test the code for a few situations (expected returned output of 10 article titles)
# recommendations for an article
_id_type = 'article'
_id = 10
recommended_articles = rec_engine.make_recommendations(_id, _id_type)
print('Test article')
print(
    'The following articles are recommended based on your query for {} id {}:'.
    format(_id_type, _id))
Example #18
    'weekdays': [],
    'aggregate_period': False
}

query_dict = build_query_dict(render_heat_map_dict)

if query_dict['date'] == 'loop_through_period':
    # if we have the flag loop_through_period in the query dict, it means the period
    # set for the query is multiple dates, therefore we want the query to return an
    # average on a time interval, and not on a single date
    period = render_heat_map_dict['period']
    daterange = pd.date_range(period[0], period[1])
    query_dict['date'] = period

query = prepare_sql_query(query_dict)
query_results = Utils.make_sql_query(query, database)

for single_map, base_map, projection in base_maps:
    # we process the query results
    outgoing_flow, incoming_flow = process_query_results(
        query_results, base_map)
    print('Rendering {}...'.format(single_map))
    if single_map == 'total':
        if time_granularity == 'weekdays_vs_weekends':
            file_name = ['NYC', '2018_diff_WD_WE']
        else:
            file_name = ['NYC', '2018']
    else:
        if time_granularity == 'weekdays_vs_weekends':
            file_name = ['{}'.format(single_map), '2018_diff_WD_WE']
        else:
            file_name = ['{}'.format(single_map), '2018']
Example #19
    # (opening of this call reconstructed; flag name inferred from args.parallel below)
    optional.add_argument('-p',
                          '--parallel',
                          default=True,
                          help='whether to download in parallel or sequentially')
    optional.add_argument(
        '-v',
        '--verbose',
        default=True,
        help='bool represent whether to display ncode level download stats')
    optional.add_argument(
        '-b',
        '--batch_size',
        default=None,
        help='number of images to download in parallel, if parallel is TRUE')

    args = parser.parse_args()
    args.parallel = eval(str(args.parallel))
    args.with_annotation = eval(str(args.with_annotation))
    args.verbose = eval(str(args.verbose))
    args.batch_size = eval(str(args.batch_size))

    # starting the download process
    util_obj = Utils()
    util_obj.subset_ncodes_to_download(ncodes_data_path)
    util_obj.download_partial_imagenet_dataset(
        path_to_url_dataset=args.url_data,
        path_to_annotations=args.annotations_dir,
        path_to_save_dataset=args.save_dir,
        only_annotations=args.with_annotation,
        parallel=args.parallel,
        verbose=args.verbose,
        batch_size_=args.batch_size)
Example #20
class User:  # class statement restored from the usage below
    username, fname, lname = '', '', ''
    email = ''

    def __init__(self, username, fname, lname, *args, **kwargs):
        self.username = username
        self.fname, self.lname = fname, lname

    @classmethod
    def setEmail(cls, email):
        cls.email = f'{email}@g.net'

        return cls.email

    def setFullName(self):
        return f'{self.fname} {self.lname}'

    def __str__(self):
        return f'Hi {self.username}\nEmail:{self.setEmail(self.username)}\nFull Name:{self.setFullName()}'


usr = User(username='******', fname='John', lname='@Doe')
setmail = usr.setEmail(email='John')

assert Utils.remove_punctuation(usr.setEmail('john')) == 'johngnet'
assert Utils.remove_dollar_white_space(",12") == 12.0
assert Utils.remove_dollar_white_space("$123") == 123.0
assert Utils.remove_comma_and_spaces(" , ,1234") == 1234.0
my_str = "Hello!!!, he said ---and went."
assert Utils.remove_punctuation(my_str) == "Hello he said and went"

print('Assertion Test Complete')
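
The assertions pin down the behavior of the Utils cleaning helpers; stand-alone re-implementations that satisfy them (assumed equivalents, not the source code) could look like:

import string

def remove_punctuation(text):
    # drop every ASCII punctuation character, keep letters, digits and spaces
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_dollar_white_space(value):
    # strip dollar signs, commas and surrounding whitespace, then parse
    return float(value.replace('$', '').replace(',', '').strip())

def remove_comma_and_spaces(value):
    return float(value.replace(',', '').replace(' ', ''))

assert remove_punctuation('john@g.net') == 'johngnet'
assert remove_dollar_white_space(',12') == 12.0
assert remove_comma_and_spaces(' , ,1234') == 1234.0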