Example #1
def calculate_wins_per_team_per_season():
    """Get wins per season per team"""
    df_regular_season = get_table("t_original_regular_season_compact_results")
    df_wins_per_team_per_season = df_regular_season.groupby(
        ["season", "w_team_id"]).size().reset_index()
    df_wins_per_team_per_season.columns = ["season", "team_id", "wins"]
    write_table(df_wins_per_team_per_season, "wins_per_team_per_season")
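The groupby(...).size() idiom above yields one row per (season, winning team) pair with the group's row count; a minimal illustration with made-up rows:

import pandas as pd

df = pd.DataFrame({"season": [2010, 2010, 2010],
                   "w_team_id": [1101, 1101, 1205]})
print(df.groupby(["season", "w_team_id"]).size().reset_index())
#    season  w_team_id  0
# 0    2010       1101  2
# 1    2010       1205  1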
Example #2
def combine_datasets():
    poverty_table, _ = utils.read_table("poverty_data_clean.csv", True)
    crime_table, _ = utils.read_table("crime_data_clean.csv", True)
    rent_table, _ = utils.read_table("rent_data_no_dups.csv", True)
    combined_table = []
    combined_header = ["County", "State",\
        "Pov_Num_All","Pov_Pct_All","Median_Income", \
        "Crime_Rate_per_100000","Murder","Rape","Robbery","Aggravated_Assault","Burglary","Larceny","Vehicle_Theft","Arson",\
        "Population","Mean_Rent","Median_Rent", "Latitude", "Longitude"]

    for poverty_row in poverty_table:
        for crime_row in crime_table:
            for rent_row in rent_table:
                if poverty_row[2] == crime_row[0] == rent_row[3]:
                    new_add = [
                        poverty_row[2], poverty_row[1], poverty_row[3],
                        poverty_row[4], poverty_row[9], crime_row[2],
                        crime_row[3], crime_row[4], crime_row[5], crime_row[6],
                        crime_row[7], crime_row[8], crime_row[9],
                        crime_row[10], crime_row[11], rent_row[9],
                        rent_row[10], rent_row[7], rent_row[8]
                    ]
                    combined_table.append(new_add)

    # de-duplicate on (County, State); the last row for each key wins
    no_dups = dict(((x[0], x[1]), x) for x in combined_table)
    new_table = list(no_dups.values())

    new_table.insert(0, combined_header)
    utils.write_table("combined_data.csv", new_table)
Example #3
def calculate_losses_per_team_per_season():
    """Get losses per season per team"""
    df_regular_season = get_table("t_original_regular_season_compact_results")
    df_losses_per_team_per_season = df_regular_season.groupby(
        ["season", "l_team_id"]).size().reset_index()
    df_losses_per_team_per_season.columns = ["season", "team_id", "losses"]
    write_table(df_losses_per_team_per_season, "losses_per_team_per_season")
Example #4
def calculate_seed_rank_per_team_per_season():
    df_seed_rank_per_team_per_season = get_table(
        "t_original_ncaa_tourney_seeds")
    # strip the leading region letter and the optional "a"/"b" play-in suffix
    # (both might be of interest later on)
    df_seed_rank_per_team_per_season[
        "seed_rank"] = df_seed_rank_per_team_per_season["seed"].apply(
            lambda seed: int(seed[1:3]))
    df_seed_rank_per_team_per_season[
        "seed_region"] = df_seed_rank_per_team_per_season["seed"].apply(
            lambda seed: seed[0])
    df_seed_rank_per_team_per_season.drop("seed", axis=1, inplace=True)
    write_table(df_seed_rank_per_team_per_season,
                "seed_rank_region_per_team_per_season")
Example #5
def normalize_combined():
    combined_table, header = utils.read_table("combined_data.csv", True)
    columns = []
    new_header = []
    for x in range(len(header)):
        if x not in [2, 6, 7, 8, 9, 10, 11, 12, 13, 16]:
            new_header.append(header[x])
            columns.append(utils.get_column(combined_table, x))
    # derived column: annual rent as a percentage of median income (lands at index 9)
    columns.append([
        round(columns[6][i] * 12 * 100 / columns[3][i], 1)
        for i in range(len(columns[0]))
    ])
    new_header.append("Pct_Income_as_Rent")

    columns[2] = normalize_data(columns[2])  # Poverty
    columns[3] = normalize_data(columns[3])  # Median Income
    columns[4] = discretize_data(columns[4], 5)  # Crime Rate
    columns[5] = normalize_data(columns[5])  # Population
    columns[6] = normalize_data(columns[6])  # Rent
    # NOTE: the derived column sits at index 9; 7 and 8 are Latitude/Longitude,
    # which should stay raw (the original indexed 7 here by mistake)
    columns[9] = normalize_data(columns[9])  # Rent as percent of income

    new_table = []
    for x in range(len(columns[0])):
        buffer = []
        for column in columns:
            buffer.append(column[x])
        new_table.append(buffer)

    new_table.insert(0, new_header)
    utils.write_table("combined_data_normalized.csv", new_table)

    columns[2] = discretize_data(columns[2], 3)  # Poverty
    columns[3] = discretize_data(columns[3], 3)  # Median Income
    #columns[4] = discretize_data(columns[4], 3) # Crime Rate
    columns[5] = discretize_data(columns[5], 5)  # Population
    columns[6] = discretize_data(columns[6], 3)  # Rent
    columns[9] = discretize_data(columns[9], 5)  # Rent as percent of income

    new_table = []
    for x in range(len(columns[0])):
        buffer = []
        for column in columns:
            buffer.append(column[x])
        new_table.append(buffer)

    new_table.insert(0, new_header)
    utils.write_table("combined_data_discretized.csv", new_table)
Example #6
def clean_rent():
    table = []
    infile = open("rent_data.csv", "r")

    lines = infile.readlines()
    for line in lines:
        newline = line.strip()  #removes whitespace characters
        values = newline.split(",")  #splits on comma
        #utils.convert_to_float(values)
        if values[2] != "HI" and values[2] != "AK":
            table.append(values)

    utils.write_table("rent_data_clean.csv", table)
    no_dups = dict(((x[1], x[3]), x) for x in table)
    new_table = list(no_dups.values())
    utils.write_table("rent_data_no_dups.csv", new_table)
    infile.close()
Example #7
def calculate_wins_per_team_per_season_by_ot():
    """Get regular season wins split (binary) by OT"""
    df_regular_season = get_table("t_original_regular_season_compact_results")

    # Aggregate: wins decided in regulation
    df_wins_per_team_per_season_no_ot = \
        df_regular_season[df_regular_season["num_ot"] == 0] \
        .groupby(["season", "w_team_id"]).size().reset_index()

    # Cosmetics
    df_wins_per_team_per_season_no_ot.rename(
        columns={"w_team_id": "team_id", 0: "wins_no_ot"}, inplace=True)

    # Aggregate: wins decided in overtime
    df_wins_per_team_per_season_ot = \
        df_regular_season[df_regular_season["num_ot"] > 0] \
        .groupby(["season", "w_team_id"]).size().reset_index()

    # Cosmetics
    df_wins_per_team_per_season_ot.rename(
        columns={"w_team_id": "team_id", 0: "wins_ot"}, inplace=True)

    # join outer(!) to include teams that never or only won via OT
    df_wins_per_team_per_season_by_ot = pd.merge(
        df_wins_per_team_per_season_no_ot,
        df_wins_per_team_per_season_ot,
        on=["season", "team_id"],
        how="outer")

    # Cosmetics
    df_wins_per_team_per_season_by_ot.fillna(0, inplace=True)
    df_wins_per_team_per_season_by_ot["wins_ot"] = \
        df_wins_per_team_per_season_by_ot["wins_ot"].astype(int)
    df_wins_per_team_per_season_by_ot["wins_no_ot"] = \
        df_wins_per_team_per_season_by_ot["wins_no_ot"].astype(int)
    write_table(df_wins_per_team_per_season_by_ot,
                "wins_per_team_per_season_by_ot")
Example #8
def clean_poverty():
    table = []
    infile = open("poverty_data.csv", "r")

    lines = infile.readlines()
    for line in lines:
        newline = remove_quotes(line)
        newline = newline.strip()  #removes whitespace characters
        values = newline.split(",")  #splits on comma
        #utils.convert_to_float(values)
        if values[1] != "HI" and values[1] != "AK":
            table.append(values)

    header = table.pop(0)
    header[0] = header[0][3:]  # drop the first three characters (likely a UTF-8 BOM)
    table.insert(0, header)
    utils.write_table("poverty_data_clean.csv", table)
    infile.close()
Example #9
def calculate_mean_stats_per_team_per_season():
    df_detailed_results = get_table("t_original_ncaa_tourney_detailed_results")
    df_results_winner = df_detailed_results[[
        'season', 'w_team_id', 'w_score', 'wfgm', 'wfga', 'wfgm3', 'wfga3',
        'wftm', 'wfta', 'wor', 'wdr', 'w_ast', 'wto', 'w_stl', 'w_blk', 'wpf'
    ]]
    df_results_loser = df_detailed_results[[
        'season', 'l_team_id', 'l_score', 'lfgm', 'lfga', 'lfgm3', 'lfga3',
        'lftm', 'lfta', 'lor', 'ldr', 'l_ast', 'lto', 'l_stl', 'l_blk', 'lpf'
    ]]
    # NOTE: str.lstrip strips a *set* of characters, not a literal prefix; it
    # happens to yield the intended names for every column selected above
    df_results_winner.columns = [
        x.lstrip("w_") for x in df_results_winner.columns
    ]
    df_results_loser.columns = [
        x.lstrip("l_") for x in df_results_loser.columns
    ]
    df_mean_stats_per_team_per_season = \
        pd.concat([df_results_winner, df_results_loser]) \
        .groupby(["season", "team_id"]).mean().reset_index()
    write_table(df_mean_stats_per_team_per_season,
                "mean_stats_per_team_per_season")
Example #10
def calculate_mean_score_per_team_per_season():
    """Get the average score per team per season"""
    # note: the original bound this frame to the name "pd", shadowing pandas
    df_regular_season = get_table("t_original_regular_season_compact_results")
    # cover case team == winner
    df_scores_winner = df_regular_season[["season", "w_team_id", "w_score"]]
    df_scores_winner.columns = ["season", "team_id", "score"]
    # cover case team == loser
    df_scores_loser = df_regular_season[["season", "l_team_id", "l_score"]]
    df_scores_loser.columns = ["season", "team_id", "score"]
    # combine winner & loser frames
    df_scores_teams = pd.concat([df_scores_winner, df_scores_loser])
    df_mean_scores_per_team_per_season = df_scores_teams.groupby(
        ["season", "team_id"])["score"].mean().reset_index()
    df_mean_scores_per_team_per_season.columns = [
        "season", "team_id", "score_avg"
    ]
    write_table(df_mean_scores_per_team_per_season,
                "mean_score_per_team_per_season")
Example #11
def calculate_ncaa_losses_per_team_by_ot():
    df_ncaa = get_table("t_original_ncaa_tourney_compact_results")

    # Aggregate
    df_losses_per_team_historic_ncaa_no_ot =\
    df_ncaa[df_ncaa["num_ot"] == 0].groupby("l_team_id").size().reset_index()

    # Cosmetics
    df_losses_per_team_historic_ncaa_no_ot.rename(
        columns={"l_team_id": "team_id", 0: "losses_no_ot"}, inplace=True)

    # Aggregate
    df_losses_per_team_historic_ncaa_ot =\
    df_ncaa[df_ncaa["num_ot"] > 0].groupby("l_team_id").size().reset_index()

    # cosmetics
    df_losses_per_team_historic_ncaa_ot.rename(
        columns={"l_team_id": "team_id", 0: "losses_ot"}, inplace=True)

    df_losses_per_team_historic_ncaa_by_ot = \
    pd.merge(
        df_losses_per_team_historic_ncaa_no_ot,
        df_losses_per_team_historic_ncaa_ot,
        on=["team_id"],
        how="outer"
    )

    # cosmetics
    df_losses_per_team_historic_ncaa_by_ot.fillna(0, inplace=True)
    df_losses_per_team_historic_ncaa_by_ot["losses_ot"] = \
        df_losses_per_team_historic_ncaa_by_ot["losses_ot"].astype(int)
    df_losses_per_team_historic_ncaa_by_ot["losses_no_ot"] = \
        df_losses_per_team_historic_ncaa_by_ot["losses_no_ot"].astype(int)
    write_table(df_losses_per_team_historic_ncaa_by_ot,
                "ncaa_losses_per_team_by_ot")
Example #12
def clean_crime():
    table = []
    infile = open("crime_data.csv", "r")

    lines = infile.readlines()
    header = lines.pop(0).strip().split(",")
    header.insert(1, "State_Name")
    table.insert(0, header)

    for line in lines:
        newline = remove_quotes(line)
        newline = newline.strip()  #removes whitespace characters
        values = newline.split(",")  #splits on comma
        values.insert(1, values[0][-2:])  # last two chars of "County, ST" are the state abbreviation
        values[0] = values[0][:-3]  # drop the trailing ", ST" from the county name
        #utils.convert_to_float(values)
        if values[1] != "HI" and values[1] != "AK":
            table.append(values)

    utils.write_table("crime_data_clean.csv", table)
    infile.close()
Example #13
def singular_value_decomposition_pp(data, latent_factors_size, epochs):
    """

        Based on the code available in:

            https://github.com/cheungdaven/recommendation

        Based on the paper of:

            https://people.engr.tamu.edu/huangrh/Spring16/papers_course/matrix_factorization.pdf


    """
    random.seed()

    users_items, users, items = data_treatment.retrieve_guide_features(
        data['Historic Data'])

    matrix_users_items = data_treatment.mount_matrix_user_item(users_items)

    ratings_mean = utils.measure_average_rating(data['Historic Data'])

    # a matrix users x items
    historic_rating_matrix = model.generate_historic_data_matrix(
        data['Historic Data'], 'users', users, items, ratings_mean)

    #users_mean = utils.measure_row_mean(historic_rating_matrix)

    #historic_rating_matrix = utils.subtraction_matrix_row_mean(historic_rating_matrix, users_mean)

    # users latent matrix
    p_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(users))

    # items latent matrix
    q_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(items))

    # prediction matrix
    y_matrix = algebric_operations.generate_random_matrix(
        len(items), latent_factors_size)

    ratings = calculate_first_estimation(users, users_items,
                                         latent_factors_size, y_matrix, items)

    residual_items = [random.uniform(0, 1) for item in range(0, len(items))]

    residual_users = [random.uniform(0, 1) for user in range(0, len(users))]

    for epoch in range(epochs):

        for row in matrix_users_items:

            user, item = row[0], row[1]

            user_index, item_index = users[user], items[item]

            amount_items = len(users_items[user])

            # divide all the values of the user's array by the sqrt of the user's item count
            ratings[user] = list(
                map(lambda value: value / math.sqrt(amount_items),
                    ratings[user]))

            # retrieving all the values of a specific column
            column_array = retrieve_column(p_matrix, users[user])

            ratings[user] = algebric_operations.sum_two_arrays(
                ratings[user], column_array)

            predicted_rating = ratings_mean + residual_items[
                item_index] + residual_users[user_index] + svd_prediction(
                    ratings[user], retrieve_column(q_matrix, item_index))

            measured_error = historic_rating_matrix[user_index][
                item_index] - predicted_rating  # error_metric(historic_rating_matrix[users[user]][item_index], predicted_rating)

            # cost O(n)
            p_matrix = _update_p_matrix(p_matrix, q_matrix, user_index,
                                        item_index, measured_error)

            # cost O(n)
            q_matrix = _update_q_matrix(q_matrix, p_matrix, user_index,
                                        item_index, user, amount_items,
                                        ratings, measured_error)

            # reconstruction matrix - this will be the closest to the original matrix - cost O(n**2)
            y_matrix = _update_y_matrix(y_matrix, q_matrix, users_items, user,
                                        items, amount_items, measured_error)

            # cost O(1)
            residual_items = _update_residual_items(residual_items, item_index,
                                                    measured_error)

            # cost O(1)
            residual_users = _update_residual_users(residual_users, user_index,
                                                    measured_error)

        print(
            svd_rmse(historic_rating_matrix, matrix_users_items, users, items,
                     ratings_mean, residual_users, residual_items, ratings,
                     q_matrix, y_matrix, users_items, latent_factors_size))

    predictions = make_prediction(historic_rating_matrix,
                                  data['Prediction Data'], ratings,
                                  ratings_mean, users, items, q_matrix,
                                  residual_users, residual_items, users_items,
                                  y_matrix, latent_factors_size)

    for index, prediction in enumerate(predictions):

        data['Prediction Data'][index].append(str(prediction))

    data['Prediction Data'].insert(0, ['UserId', 'ItemId', 'Prediction'])

    utils.write_table(data['Prediction Data'], "Outputs/predictions.txt")
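For reference, the prediction rule this loop evaluates is the SVD++ model from the paper linked in the docstring:

\hat{r}_{ui} = \mu + b_u + b_i + q_i^\top \Big( p_u + |N(u)|^{-1/2} \sum_{j \in N(u)} y_j \Big)

where \mu is the global mean (ratings_mean), b_u and b_i are the user and item residuals (residual_users, residual_items), p_u and q_i are columns of p_matrix and q_matrix, N(u) is the set of items rated by user u (users_items[user]), and the |N(u)|^{-1/2} factor is the math.sqrt(amount_items) division inside the loop.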
Example #14
    def tick_attendence(self, img_name=None, save_annotated=True, add_vector=True, n_neighbors=1):
        global Label_test
        global location_list
        global vector_list
        global mode
        def line_select_callback(click,release):
            global Label_test
            global location_list
            global target
            row1,col2,row2,col1 = int(click.ydata),int(release.xdata),int(release.ydata),int(click.xdata)
            Label_test.append(int(target))
            location_list.append((row1,col2,row2,col1))
            print(special_layout(f"Added {self.dict_[str(target)]['name']} to annotated image.\n\
Amount of targets: {len(location_list)}"))
            plt.close()

        def onclick(event):
            global Label_test
            global location_list
            global vector_list
            global mode

            col, row = event.xdata, event.ydata
            for i in range(len(location_list)):
                row1, col2, row2, col1 = location_list[i]
                if row1 < row < row2 and col1 < col < col2:
                        if mode == '2':
                            try:
                                correction = input(special_layout(f"You select {self.dict_[str(Label_test[i])]['name']} ({Label_test[i]})\n***correction -> 1 No correction -> 0")) 
                                if int(correction):

                                        correct_label = input(special_layout(f"Who is this? :\n\n\
{dict_5row_layout(self.dict_,'name',blank = 15,each_row =5,count_value = False)}\n***Please type number"))
                                        print(special_layout(f"{self.dict_[str(Label_test[i])]['name']} -> {self.dict_[str(correct_label)]['name']}"))
                                        Label_test[i] = correct_label
                            except:
                                pass
                            break
                        elif mode == '3':
                            try:
                                delete = input(special_layout(f"You confirm to delete {self.dict_[str(Label_test[i])]['name']} ({Label_test[i]}) on [{row1}:{row2},{col1}:{col2}]?\n***yes -> 1 No no -> 0"))
                            except:
                                pass
                            break
            try:
                # 'delete' is only bound when the user confirmed a deletion in
                # mode '3'; otherwise the NameError lands in the except below
                if int(delete):
                    Label_test.pop(i)
                    location_list.pop(i)
                    vector_list.pop(i)
            except:
                pass
            plt.close()

        def toggle_selector(event):
                toggle_selector.RS.set_active(True)

        def object_mode_change(event):
            global target
            global mode
            if event.key == '1':
                mode = '1'
                print(special_layout(f" Add label on annotated image."))
                try:
                    target = input(special_layout(f"Select target label:\n\n\
{dict_5row_layout(self.dict_,'name',blank = 15,each_row =5,count_value = False)}"))
                    print(f"You will annotate {self.dict_[str(target)]['name']} ({target})")
                    plt.close()
                except:
                    pass
            elif event.key == '2':
                mode = '2'
                print(special_layout(f" Change label on annotated image."))
                plt.close()

            elif event.key == '3':
                mode = '3'
                print(special_layout(f" Delete label on annotated image."))
                plt.close()

            elif event.key == 'q':
                mode = 'q'
                print(special_layout(f"Finish correction..."))
                plt.close()
            
            else:
                print(special_layout(f"Please press the followings key:\n\nAdd annotation -> 1\nClick show label and change label -> 2\nDelete annotation -> 3\n\
Exit correction -> q"))

        # change name
        if img_name is None:
            try:
                img_name = change_image_name(self.classname)
            except:
                img_name = sorted(Path(f"./data/{self.classname}/image/class").glob("*.jpg"))[-1].name

        img_path = f'./data/{self.classname}/image/class/{img_name}'

        print(special_layout(f"Detect and encode faces on image({img_name})"))
        array = load_image(img_path)
        location_list, vector_list = face_location_encoding(array)
        Label_test = list(face_prediction(self.classname, vector_list))
        annotated = draw_box(load_image(img_path), location_list,False,Label_test ,self.dict_)
        count = 0
        while True:
            print(special_layout(f"Show you the annotated image...\nEnter q"))
            fig, ax = plt.subplots(1)
            plt.imshow(annotated)
            plt.show()
            if count % 2 == 0:
                individual = input(special_layout(f"Try individual model?\n1:yes 0:no"))
                if int(individual):
                    Label_test = list(face_prediction(self.classname, vector_list, only_individual=True))
                    annotated = draw_box(load_image(img_path), location_list, False, Label_test, self.dict_)
                else:
                    break
            else:
                back = input(special_layout(f"Try the previous model?\n1:yes 0:no"))
                if int(back):
                    Label_test = list(face_prediction(self.classname, vector_list))
                    annotated = draw_box(load_image(img_path), location_list, False, Label_test, self.dict_)
                    plt.close()
                else:
                    break
            count += 1
        print(special_layout(f"Show you the annotated image...\nPress H to watch instruction:)"))
        mode = '2'
        while True:
            annotated = draw_box(load_image(img_path), location_list,False,Label_test ,self.dict_)
            fig, ax = plt.subplots(1)
            plt.imshow(annotated)

            if mode == '1':
                toggle_selector.RS = RectangleSelector(
                    ax,line_select_callback,
                    drawtype='box',useblit=True,
                    button=[1],minspanx=5,minspany=5,
                    spancoords='pixels',interactive=True
                    )

                plt.connect('key_press_event', toggle_selector)
                plt.connect('key_press_event',object_mode_change)

            elif mode == '2' or mode == '3':
                Cursor(ax,
                horizOn=False, # Controls the visibility of the horizontal line
                vertOn=False, # Controls the visibility of the vertical line
                )
                fig.canvas.mpl_connect('button_press_event', onclick)
                plt.connect('key_press_event',object_mode_change)
            
            plt.show()
            if mode == 'q':
                break
                
        if save_annotated:
            create_annotated_dir(self.classname)
            plt.imsave(img_path.replace('class','annotated'),annotated)
        # write table
        write_table(self.dict_,self.classname,Label_test,img_name)

        # add vector # Modelling
        if add_vector:
            vector_correct = input(special_layout(f"Add all face into our knn model?\n***yes -> 1 no -> 0"))
            if int(vector_correct) - 1:  # truthy when the answer was "no" (0): open the correction UI before adding vectors
                mode = '2'
                while True:
                    annotated = draw_box(load_image(img_path), location_list,False,Label_test ,self.dict_)
                    fig, ax = plt.subplots(1)
                    plt.imshow(annotated)

                    if mode == '1':
                        toggle_selector.RS = RectangleSelector(
                            ax,line_select_callback,
                            drawtype='box',useblit=True,
                            button=[1],minspanx=5,minspany=5,
                            spancoords='pixels',interactive=True
                            )

                        plt.connect('key_press_event', toggle_selector)
                        plt.connect('key_press_event',object_mode_change)

                    elif mode == '2' or mode == '3':
                        Cursor(ax,
                        horizOn=False, # Controls the visibility of the horizontal line
                        vertOn=False, # Controls the visibility of the vertical line
                        )
                        fig.canvas.mpl_connect('button_press_event', onclick)
                        plt.connect('key_press_event',object_mode_change)
                    
                    plt.show()
                    if mode == 'q':
                        break

            add_vector_location_img(self.dict_,self.classname,vector_list,Label_test,location_list,img_name)
            vector_train=[]
            label_train=[]
            print(special_layout(f"Vector amount summary:"))
            print(col_layout('Label','Vector(individual) amount','Vector(class) amount','Total'))
            for label,each_dict in self.dict_.items():
                total = len(each_dict['vector(individual)'])+len(each_dict['vector(class)'])
                print(col_layout(str(label)+'.'+each_dict['name'],len(each_dict['vector(individual)']),len(each_dict['vector(class)']),total))
                vector_train = vector_train + each_dict['vector(individual)']+each_dict['vector(class)']
                label_train += [int(label) for i in range(total)]

            knn_modelling(self.classname,vector_train,label_train,n_neighbors =n_neighbors)

        # Final: output to dir
        with open(json_path, 'w') as doc:  # json_path is defined outside this method
            doc.write(json.dumps(self.dict_))

        # reminder
        print(special_layout(f'Renew label dictionary: {json_path}'))
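knn_modelling itself is not shown in this example. A minimal sketch of what such a helper could look like, assuming scikit-learn; the persistence path and file name are illustrative, not taken from the project:

import pickle
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier

def knn_modelling(classname, vector_train, label_train, n_neighbors=1):
    """Fit a KNN classifier on face-encoding vectors and persist it per class."""
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(vector_train, label_train)
    out_dir = Path(f"./data/{classname}/model")  # hypothetical location
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "knn.pkl", "wb") as f:  # hypothetical file name
        pickle.dump(model, f)
    return model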
Example #15
def calculate_ncaa_losses_per_team():
    """Get all NCAA wins per team"""
    df_ncaa = get_table("t_original_ncaa_tourney_compact_results")
    df_ncaa_losses_per_team = df_ncaa.groupby("l_team_id").size().reset_index()
    df_ncaa_losses_per_team.columns = ["team_id", "losses"]
    write_table(df_ncaa_losses_per_team, "ncaa_losses_per_team")
Example #16
File: test.py Project: fengkaicnic/pyml
                    arg_map = {'name': nick}
                rst = sales_solr.sales_search(arg_map, page_index, countofpage, solr_ip_port)
                print(rst)
                lines = []
                lines.append(subject.strip())
                lines.append(arg_map['name'].strip())
                lines.append(str(rst[0]))
                if rst[0]:
                    total += 1
                num += 1
                result = parse_eml(msg)
                # pdb.set_trace()
                lines.append(result.get(u'联系人', '').strip())
                lines.append(result.get(u'手机', '').strip())
                lines.append(result.get(u'座机', '').strip())
                lines.append(result.get(u'地址', '').strip())
                lines.append(result.get('email', '').strip())
                linecsv.append(','.join(lines))
                # print '*****************************************'
        except:
            pass
    print(total)
    print(num)
    with open('d:/naren/test.csv', 'wb') as file:
        file.writelines('\n'.join(linecsv).encode('utf8'))
    emlutils.write_table(linecsv)
    server.quit()
end = time.time()

print (end - start)
Example #17
def non_negative_matrix_factorization(data,
                                      latent_factors_size,
                                      epochs,
                                      output_file=None,
                                      test=False):
    """

        Based on the code available in:

            https://github.com/cheungdaven/recommendation/blob/master/recSysNMF.py

        We also use as guide:


            https://blog.acolyer.org/2019/02/18/the-why-and-how-of-nonnegative-matrix-factorization/
            Class 08 - Collaborative Filtering: Factorization Matrix

    """
    random.seed()

    users_items, users, items, users_ratings, items_ratings = data_treatment.retrieve_guide_features(
        data['Historic Data'])

    ratings_mean = utils.measure_average_rating(data['Historic Data'])

    # users latent matrix
    p_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(users))

    # items latent matrix
    q_matrix = algebric_operations.generate_random_matrix(
        latent_factors_size, len(items))

    epochs_rmse = []

    for epoch in range(epochs):

        for row in data['Historic Data']:

            user, item, historic_rating = row[0], row[1], row[2]

            user_index, item_index = users[user], items[item]

            # prediction = average of the user bias, item bias, and latent dot product
            error = float(historic_rating) - (
                users_ratings[user] + items_ratings[item] +
                nmf_prediction(retrieve_column(p_matrix, user_index),
                               retrieve_column(q_matrix, item_index))) / 3

            p_matrix = _update_matrixes(p_matrix, q_matrix, user_index,
                                        item_index, error)

            q_matrix = _update_matrixes(q_matrix, p_matrix, item_index,
                                        user_index, error)

        epochs_rmse.append(
            measure_rmse(data['Historic Data'], p_matrix, q_matrix, users,
                         items, users_ratings, items_ratings))

        print(epoch, epochs_rmse[-1])

    predictions = make_prediction(data['Prediction Data'], p_matrix, q_matrix,
                                  ratings_mean, users, items, users_ratings,
                                  items_ratings)

    if test:

        return predictions, epochs_rmse

    for index, prediction in enumerate(predictions):

        data['Prediction Data'][index].append(str(prediction))

    data['Prediction Data'].insert(0, ['UserId', 'ItemId', 'Prediction'])

    utils.write_table(data['Prediction Data'], output_file)
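For reference, the per-rating stochastic-gradient step that _update_matrixes presumably applies (its body is not shown here; the learning rate \gamma and regularization \lambda are assumptions) is the standard update for this family of models:

p_u \leftarrow p_u + \gamma (e_{ui} q_i - \lambda p_u), \qquad q_i \leftarrow q_i + \gamma (e_{ui} p_u - \lambda q_i)

with e_{ui} the error computed in the loop; a non-negative factorization additionally clamps the updated factors at zero.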
Example #18
def main():
    table = utils.read_table("auto-data-clean.txt")
    for row in table:
        del row[-2]  # drop the second-to-last column (the output file name suggests it holds the car names)

    utils.write_table("auto-data-no-names.txt", table)