def normalize_bga_game_categories_relation():
    # import bga_games_category_relation:
    categories_bga_games_relation_path_fuzzy = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_*.json'
    categories_bga_games_relation_path = get_latest_version_of_file(
        categories_bga_games_relation_path_fuzzy)
    categories_bga_games_relation_df = import_json_to_dataframe(
        categories_bga_games_relation_path)
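    # For context: get_latest_version_of_file (defined elsewhere) resolves the
    # '*' wildcard to the newest matching file. A minimal sketch, assuming the
    # timestamp embedded in the filename sorts lexicographically:
    #   import glob
    #   def get_latest_version_of_file(fuzzy_path):
    #       return max(glob.glob(fuzzy_path))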

    # import bga categories:
    categories_bga = pd.read_csv(
        '../Data/BoardGameAtlas/Raw/API/categories/all_bga_categories.csv',
        index_col=0)

    # import game keys:
    game_keys = pd.read_csv(
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv',
        index_col=0)

    # join the game_keys table onto the games_category_relation table to
    # replace the bga game_id column with the game_key column:
    categories_bga_games_relation_df = pd.merge(
        left=categories_bga_games_relation_df,
        right=game_keys,
        left_on='game_id',
        right_on='bga_game_id')

    # normalize by only keeping game_key and category_id
    categories_bga_games_relation_df = categories_bga_games_relation_df[[
        'game_key', 'category_id'
    ]]
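    # e.g. (illustrative toy rows): the relation row
    #   {'game_id': 'abc123', 'category_id': 'cat01'}
    # joined with the key row
    #   {'bga_game_id': 'abc123', 'game_key': 100001}
    # is reduced to {'game_key': 100001, 'category_id': 'cat01'}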

    # export df
    export_path = '../Data/BoardGameAtlas/Processed/API/05_BGA_Game_Categories_Relation_Cleaned.csv'
    export_df_to_csv(categories_bga_games_relation_df, export_path)
def get_ids_of_included_games():
    # check if file already exists:
    filename = '../Data/BoardGameAtlas/Processed/API/BGA_Ids_of_boardgames_included.json'

    # if file doesn't exist call function to create it:
    if not os.path.isfile(filename):
        create_id_list_of_included_bga_games()

    # import data
    ids_df = import_json_to_dataframe(filename)
    ids_list = ids_df.iloc[:, 0].tolist()

    return ids_list
def merge_scraped_bga_designer_data_and_api_data():
    # import scraped bga designer data
    import_path_1 = '../Data/BoardGameAtlas/Raw/Scrapy/designers/bga_designers.json'
    designers_scrapy = import_json_to_dataframe(import_path_1)

    # import api bga designer data
    import_path_2_fuzzy = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation_*.json'
    import_path_2 = get_latest_version_of_file(import_path_2_fuzzy)
    designers_game_relation = import_json_to_dataframe(import_path_2)

    # unwrap the single-element list around each designer image url:
    designers_scrapy = designers_scrapy.explode('designer_bga_image_url')
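    # e.g. (illustrative): a cell holding ['https://.../designer.png'] becomes
    # the plain string; rows with multi-element lists would be duplicated,
    # one row per list element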

    # merge both dataframes:
    designers_merged = pd.merge(left=designers_scrapy,
                                right=designers_game_relation,
                                on='designer_url')

    # export df
    export_path = '../Data/BoardGameAtlas/Processed/API/bga_designers_scrapy_and_api_data_merged.csv'
    export_df_to_csv(designers_merged, export_path)
def create_list_of_all_bga_designers():
    # import bga designers
    fuzzy_import_path = '../Data/BoardGameAtlas/Processed/API/03_BGA_Game_designers_Relation*.json'
    import_path = get_latest_version_of_file(fuzzy_import_path)
    bga_designers_game_relation = import_json_to_dataframe(import_path)

    # extract designer ids and designer urls
    designers = bga_designers_game_relation[['designer_id', 'designer_url']]

    # keep only unique designers:
    designers = designers.drop_duplicates(subset='designer_id', keep='first')

    # export designers to csv:
    export_path = '../Data/BoardGameAtlas/Processed/API/BGA_All_Unique_designers.csv'
    export_df_to_csv(designers, export_path)
def integrate_game_name_relation_tables():
    # Import BGA game_name_relation table:
    fuzzy_import_path_1 = '../Data/BoardGameAtlas/Processed/API/06_BGA_Game_Names_Relation_*.json'
    import_path_1 = get_latest_version_of_file(fuzzy_import_path_1)
    names_bga = import_json_to_dataframe(import_path_1)

    # Import BGG game_name_relation table:
    fuzzy_import_path_2 = '../Data/BoardGameGeeks/Processed/GameInformation/06_BGG_Game_Name_Relation_*.csv'
    import_path_2 = get_latest_version_of_file(fuzzy_import_path_2)
    names_bgg = pd.read_csv(import_path_2, index_col=0)

    # Import Game keys:
    import_path_3 = '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv'
    game_keys = pd.read_csv(import_path_3, index_col=0)

    # Replace bga 'game_id' with 'game_key'
    names_bga = pd.merge(left=names_bga,
                         right=game_keys,
                         left_on='game_id',
                         right_on='bga_game_id')
    names_bga = names_bga[['game_key', 'game_name']]

    # Replace bgg 'game_id' with 'game_key'
    names_bgg = pd.merge(left=names_bgg,
                         right=game_keys,
                         on='bgg_game_id')
    names_bgg = names_bgg[['game_key', 'game_name']]

    # Concatenate both dataframes:
    names_combined = pd.concat([names_bga, names_bgg]).sort_values('game_key')

    # Remove duplicates:
    print(
        'Number of duplicate game names in GameNameTranslation table found and dropped: '
        + str(len(names_combined) - len(names_combined.drop_duplicates())))
    names_combined.drop_duplicates(inplace=True)

    # Export result:
    export_path = '../Data/Joined/Results/GameNameTranslation.csv'
    export_df_to_csv(names_combined, export_path)
def create_id_list_of_included_bga_games():
    """
    Extracts ids of bga games previously obtained from bga api and create a JSON file containing the IDs.
    This list will later be used for the BGA Review API requests.
    """

    # import file that contains information on bga_games
    # since the exact filename is unknown, we have to find file and its latest version first:
    filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')

    data = import_json_to_dataframe(filename)

    # extract ids
    ids = data['bga_game_id']

    # export data to json file
    filename_export = '../Data/BoardGameAtlas/Processed/API/BGA_Ids_of_boardgames_included.json'
    export_df_to_json(ids, filename_export)
def clean_bga_game_information_scraper():
    '''
    Cleans the data collected by the BoardGameInfoSpider.
    The cleaned dataframe is then saved to a JSON file.
    '''

    df = import_json_to_dataframe(
        '../Data/BoardGameAtlas/Raw/Scrapy/BoardGameInformationScraper.json')

    # remove unwanted symbols (especially brackets);
    # regex=False is required since '[' is not a valid regular expression
    unwanted_symbols = ['[', ']', "'"]
    for col in df.columns:
        df[col] = df[col].astype(str)
        for symbol in unwanted_symbols:
            df[col] = df[col].str.replace(symbol, '', regex=False)

    # strip white spaces from the beginning and end of the column names
    df.columns = df.columns.str.strip()

    # clean gameID "game-foRKR22fGQ" -> "foRKR22fGQ"
    df['bga_game_id'] = df['bga_game_id'].str.split('game-').str[1]

    # remove "Rank:" from rank: "Rank: 1" -> "1"
    df['rank'] = df['rank'].str.replace("Rank:", '')

    # split num_players_and_play_time
    num_players_and_play_time = df.num_players_and_play_time.str.split(
        ",",
        expand=True,
    )
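    # e.g. (assumed raw format) a value like '2-4 Players, 30-60 Min' splits
    # into two columns: '2-4 Players' and ' 30-60 Min'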
    df[['num_players', 'play_time']] = num_players_and_play_time
    df.drop('num_players_and_play_time', axis=1, inplace=True)

    # remove all duplicates
    print('Number of duplicates removed: ' + str(
        len(df) - len(df.drop_duplicates(subset='bga_game_id', keep="first"))))
    df = df.drop_duplicates(subset='bga_game_id', keep="first")

    # create json file
    export_df_to_json(
        df,
        '../Data/BoardGameAtlas/Processed/bga_GameInformation_scrapy_Cleaned.json'
    )
def clean_bga_api_review_data():

    filename = '../Data/BoardGameAtlas/Processed/API/bga_all_reviews_for_games_with_more_than_2_reviews.json'
    # if the file doesn't exist yet, call the function that creates it:
    if not os.path.isfile(filename):
        gather_bga_api_review_data()

    # import data
    df = import_json_to_dataframe(filename)

    # remove reviews with a rating < 1: these ratings are errors, since the rating scale on
    # boardgameatlas.com goes from 1 to 5, not 0 to 5.
    df = df[df['rating'] >= 1]

    # adjust ratings so that they correspond to a scale from 1-10 instead of 1-5:
    # a BGA rating of 1 should result in a 1 on the new scale:  1 * 2.25 - 1.25 =  2.25 - 1.25 =  1
    # a BGA rating of 5 should result in a 10 on the new scale: 5 * 2.25 - 1.25 = 11.25 - 1.25 = 10
    df['rating'] = 2.25 * df['rating'] - 1.25
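    # (derivation of the coefficients: a linear map from [1, 5] to [1, 10]
    #  needs slope (10 - 1) / (5 - 1) = 2.25 and intercept 1 - 2.25 * 1 = -1.25)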

    # drop column review_title
    del df['review_title']

    # add column has_review_text (0 = only rating, no text; 1 = rating + text)
    df['has_review_text'] = np.where(df['review_text'].isnull(), 0, 1)

    # add column that states the origin of the comment (datasource)
    df['review_origin'] = 'bga'

    # rename columns:
    df.rename(columns={
        'username': '******',
        'date': 'review_date'
    },
              inplace=True)

    export_df_to_csv(
        df,
        '../Data/BoardGameAtlas/Processed/API/bga_all_reviews_for_games_with_more_than_2_reviews_CLEANED.csv'
    )
def create_json_with_games_that_fulfill_ratings_amount():
    '''
    Creates a small json that only includes information on games with a certain amount of user_ratings.
    Information is GameID, Name and num_user_ratings.
    '''
    filename = '../Data/BoardGameAtlas/Processed/API/bga_all_games_ids_names_numRatings.json'
    df = import_json_to_dataframe(filename)

    # cut off games that have fewer than 3 ratings:
    # this results in 8246 games and 162,045 reviews, which is equal to 91.26% of all user ratings.
    max_amount_ratings = 999999
    min_amount_ratings = 3
    df = df[(df.num_user_ratings >= min_amount_ratings)
            & (df.num_user_ratings <= max_amount_ratings)]
    df = df.sort_values(by=['num_user_ratings'],
                        ascending=False).reset_index(drop=True)

    # export games:
    filename_export = '../Data/BoardGameAtlas/Processed/API/bga_games_with_more_or_equal_3_reviews.json'
    export_df_to_json(df, filename_export)
def match_game_names():
    """
    This function matches bga and bgg boardgames based on their game names and the year in which they were published.
    This is how it works:
    - We calculate n-grams with n=3 for each boardgamename.
    - By removing stopwords that appear in many games that don't add much meaning to the game title we can
    reduce the number of false-positives and false-negatives.
    Examples: the stopwords 'board' and 'game' are removed:
        bga_name = '7 Wonders'
        bgg_name = '7 Wonders - The Board Game'
        -> this would result in a rather low jaccard score without removing the stopwords.

        bga_name = 'Settlers - The Board Game'
        bgg_name = '7 Wonder - The Board Game'
        -> this would result in a rather high jaccard score considering that both do not refer to the same game.
    - We then compare the similarity of a bga candidate and a bgg candidate by calculating the jaccard similarity.
    - The candidate with the highest jaccard score is chosen. Only if the jaccard score of that candidate exceeds our
    threshold the games are matched.

    Scalability Challenge:
    - However, there is one issue with that strategy: Computing the jaccard similarity requires comparisons of
    ca. 8,000 bga games and ca. 19,000 bgg games [ O(n) = n^2 ]. Comparing all bga_games and all bgg_games
    would lead to an extremely long run time, which we want to avoid.
    -> 8,000 x 19,000 = 152,000,000 comparisons.

    Therefore we adjusted our approach:
    1) First, we find games that can be matched exactly. By this we mean games that have exactly the same name in both
     datasets. Since there are some games with duplicate game names that do not refer to the same game, we also include
     the year of publication. Therefore only games with exactly the same name and exactly the same year of publication
     are matched in this step. We can then subtract these games from the their datasets to decrease
    the sizes of games that have to be compared to: ca. 3,000 bga games and ca. 15,000 bgg games.
    -> 3,000 x 15,000 = 45,000,000 (complexity reduced by ~70%)
    2) This is still quite a lot of comparisons. However, we made another observation. We also tried matching games
    by only their game_name (not also taking the year_published into consideration). In the set of games that could
    be matched exactly, in almost all cases the publish years are the same, which makes sense obviously.
    3) Therefore we can further reduce complexity by grouping by publish years and comparing only games that have
    the same publish year. To make sure we don't lose games because the publish years deviate by one year, we also
    compare to games published in the years one year before and after.
    This further reduces the number of comparisons to: ~ 1,000,000
    Hence, by applying the similarity function only to the most promising pairs we reduced the number of required
    comparisons by 98%.
    """

    # Import bgg and bga data:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    bgg_names = bgg_df['name'].tolist()

    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)
    bga_names = bga_df['name'].tolist()

    # Create lists with bga and bgg ids:
    bgg_ids = bgg_df['bgg_game_id'].tolist()
    bga_ids = bga_df['bga_game_id'].tolist()

    # Check duplicate names:
    bgg_duplicate_names = set([x for x in bgg_names if bgg_names.count(x) > 1])
    bga_duplicate_names = set([x for x in bga_names if bga_names.count(x) > 1])
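    # (note: list.count inside a set comprehension is O(n^2); an equivalent
    #  O(n) alternative would be collections.Counter, e.g.
    #  {name for name, cnt in Counter(bgg_names).items() if cnt > 1})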

    ## find exact matches (game_name, year_published):
    exact_matches_join_df = pd.merge(left=bgg_df,
                                     right=bga_df,
                                     on=['name', 'year_published'])

    # create list of ids of exactly matched games:
    exact_matches_bgg_ids = exact_matches_join_df['bgg_game_id'].tolist()
    exact_matches_bga_ids = exact_matches_join_df['bga_game_id'].tolist()

    # subtract exact matches from datasets to reduce their size:
    subset_bgg_df = bgg_df[~bgg_df['bgg_game_id'].isin(exact_matches_bgg_ids)]
    subset_bga_df = bga_df[~bga_df['bga_game_id'].isin(exact_matches_bga_ids)]
    subset_bgg_df = subset_bgg_df.rename(
        columns={'year_published': 'year_published_bgg'})
    subset_bga_df = subset_bga_df.rename(
        columns={'year_published': 'year_published_bga'})

    ## Next, we apply name matching. Our first task is to find candidates so that we don't have to compare
    ## every game from one dataset with every game from the other dataset. We do so by grouping the games by
    ## their year of publication.
    ## First, a few preprocessing steps are needed to set up the candidates:

    # Extract years from bga dataset:
    # A lot of type casting due to unexpected errors with float and set
    all_years = subset_bga_df['year_published_bga'].dropna().tolist()
    all_years = list(map(int, all_years))
    years = list(set(all_years))
    years.sort(reverse=True)

    # Exclude games with a missing publish_year from name matching
    # (dropna is restricted to the year column so that games with other missing fields are kept):
    print('Dropped ' + str(subset_bgg_df['year_published_bgg'].isna().sum()) +
          ' rows of the bgg dataset from name matching')
    print('Dropped ' + str(subset_bga_df['year_published_bga'].isna().sum()) +
          ' rows of the bga dataset from name matching')
    subset_bgg_df = subset_bgg_df.dropna(subset=['year_published_bgg'])
    subset_bga_df = subset_bga_df.dropna(subset=['year_published_bga'])

    # strip off the '.0' at the end of each year by converting to int: 2018.0 -> 2018
    subset_bga_df["year_published_bga"] = subset_bga_df[
        "year_published_bga"].astype(int)

    # create dictionaries that group the bgg and bga games by their year of publication;
    # during the name matching process we only compare the names of games with the same publication year
    bgg_dic_grouped_by_year = {}
    bga_dic_grouped_by_year = {}

    # fill the previously created dictionaries with the games published in each year
    for year in years:
        bgg_dic_grouped_by_year[year] = subset_bgg_df[
            subset_bgg_df['year_published_bgg'] == year].to_dict('records')
        bga_dic_grouped_by_year[year] = subset_bga_df[
            subset_bga_df['year_published_bga'] == year].to_dict('records')

    ## Now we get to the interesting part:
    ## We iterate over all bga_games which we found no exact bgg_matches for. We then create a list with potential
    ## candidates including all bgg_games that were published in the same year or one year before or after.
    ## For these candidates we then apply name_matching using the jaccard similarity.
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            input_string = bga_game['name']

            candidate_list = []
            # create candidate_list with all bgg games that were published in the same year as the bga_game:
            for bgg_game in bgg_dic_grouped_by_year[year]:
                candidate_list.append(bgg_game['name'])

            # also check bgg games that were published in the previous year and one year later:
            if year + 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year + 1]:
                    candidate_list.append(bgg_game['name'])
            if year - 1 in bgg_dic_grouped_by_year:
                for bgg_game in bgg_dic_grouped_by_year[year - 1]:
                    candidate_list.append(bgg_game['name'])

            # Try to match the input_string (the target bga game name) with one of the games in the
            # candidate_list (bgg games). The match with the highest jaccard similarity is returned.
            # If there is no match, or the jaccard threshold is not exceeded, an empty string is returned.
            match = find_match(input_string, candidate_list,
                               JACCARD_THRESHOLD_GAME_NAME)
            bga_game['match'] = match['name']
            bga_game['jaccard_score'] = match['jaccard_score']

    global COMPARISONS
    print('Number of comparisons: ' + str(COMPARISONS))

    bga_list_matches = []
    for year in years:
        for bga_game in bga_dic_grouped_by_year[year]:
            bga_list_matches.append(bga_game)

    # turn list of dictionaries back to data frame:
    jaccard_matches_df = pd.DataFrame(bga_list_matches)

    # just for debugging and inspecting results:
    analyse_df = pd.DataFrame(bga_list_matches)
    analyse_df = analyse_df[analyse_df['jaccard_score'] != '']
    analyse_df = analyse_df[['name', 'match', 'jaccard_score']]
    analyse_df = analyse_df.sort_values('jaccard_score', ascending=False)

    ## We have now successfully found a large number of games that could be matched. All that's left to do is
    #  to create a dataframe that contains the matched BGA and BGG IDs. We do so in three steps:
    # 1) Prepare a DF containing the BGA and BGG IDs of games that were matched exactly by name and year_published
    # 2) Prepare a DF containing the BGA and BGG IDs of games that were matched by string matching (jaccard method)
    # 3) Concatenate both data frames and export the result

    # 1) Exact matches
    # Keep only ID columns:
    exact_matches_join_df = exact_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 2) Jaccard matches
    # Cut off rows where the jaccard threshold wasn't reached (-> no match)
    jaccard_matches_df = jaccard_matches_df[jaccard_matches_df['match'] != '']
    jaccard_matches_df = jaccard_matches_df[[
        'bga_game_id', 'name', 'year_published_bga', 'match', 'jaccard_score'
    ]]
    jaccard_matches_df.rename(columns={'name': 'bga_name'}, inplace=True)

    # Join both datasets
    jaccard_matches_join_df = pd.merge(
        left=bgg_df[['bgg_game_id', 'name', 'year_published']],
        right=jaccard_matches_df,
        left_on=['name', 'year_published'],
        right_on=['match', 'year_published_bga'])
    jaccard_matches_join_df = jaccard_matches_join_df[[
        'bgg_game_id', 'bga_game_id'
    ]]

    # 3) Concat both dfs
    matched_game_ids_df = pd.concat(
        [exact_matches_join_df, jaccard_matches_join_df])

    # Export the result to csv:
    export_df_to_csv(
        matched_game_ids_df,
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv'
    )
def merge_game_information():
    '''
    Merges the game information of the previously matched games.

    For the matched games there are four types of columns:
        a) columns that exist in both datasets, of which we only keep one (values can conflict):
            (e.g. name, year_published, min_players, ...)
            In this case we chose to keep the bgg columns! ["trust your friends" conflict-avoidance strategy:
            in case of contradicting values we keep the value of the preferred data source]
        b) columns that exist in both datasets and of which we want to keep both:
            (e.g. bga_game_id/bgg_game_id, num_user_ratings, average_user_rating, bga_rank/bgg_rank, ...)
        c) columns that exist only in the bgg dataset:
            (e.g. num_user_comments, bgg_average_weight, ...)
        d) columns that exist only in the bga dataset:
            (e.g. reddit_all_time_count, bga_game_url, ...)
    '''
    # import data:
    # bgg game information dataset:
    bgg_filename = get_latest_version_of_file(
        '../Data/BoardGameGeeks/Processed/GameInformation/01_BGG_Game_Information_*.csv'
    )
    bgg_df = pd.read_csv(bgg_filename, index_col=0)
    # bga game information dataset:
    bga_filename = get_latest_version_of_file(
        '../Data/BoardGameAtlas/Processed/API/01_BGA_Game_Information_*.json')
    bga_df = import_json_to_dataframe(bga_filename)

    # 1) The previous matching step leaves us with three groups:
    # a) Matched Games
    # b) BGG Games that could not be matched
    # c) BGA Games that could not be matched

    # 1a) matched games:
    ids_matched_games_df = pd.read_csv(
        '../Data/Joined/Integration/GameInformation/matched_bga_and_bgg_ids.csv',
        index_col=0)
    bgg_subset_matches = bgg_df[bgg_df['bgg_game_id'].isin(
        ids_matched_games_df['bgg_game_id'])]
    bga_subset_matches = bga_df[bga_df['bga_game_id'].isin(
        ids_matched_games_df['bga_game_id'])]
    # 1b) BGG games not matched:
    bgg_subset_no_matches = bgg_df[~bgg_df['bgg_game_id'].
                                   isin(ids_matched_games_df['bgg_game_id'])]
    # 1c) BGA games not matched:
    bga_subset_no_matches = bga_df[~bga_df['bga_game_id'].
                                   isin(ids_matched_games_df['bga_game_id'])]

    # 2) Handle the four column types described in the docstring above:

    # 2a) drop columns from bga dataset:
    drop_bga_columns = [
        'name', 'year_published', 'min_players', 'max_players', 'min_playtime',
        'max_playtime', 'min_age', 'game_description', 'image_url',
        'thumbnail_url'
    ]
    bga_subset_matches = bga_subset_matches.drop(columns=drop_bga_columns)

    # add the matched 'bgg_game_id' column:
    bga_subset_matches = pd.merge(left=bga_subset_matches,
                                  right=ids_matched_games_df,
                                  on='bga_game_id')

    # merge both datasets:
    matched_games_df = pd.merge(left=bgg_subset_matches,
                                right=bga_subset_matches,
                                on='bgg_game_id')

    # Handle duplicate ids in matched_games_df:
    # remove duplicates:
    # duplicate bgg_ids:
    matched_games_df.drop_duplicates(subset=['bgg_game_id'],
                                     keep='first',
                                     inplace=True)
    # duplicate bga_ids:
    matched_games_df.drop_duplicates(subset=['bga_game_id'],
                                     keep='first',
                                     inplace=True)

    # In a last (union) step we concatenate all three dataframes into one big dataframe.
    # (DataFrame.append was removed in pandas 2.0, so pd.concat is used here:)
    games_df = pd.concat(
        [matched_games_df, bgg_subset_no_matches, bga_subset_no_matches],
        ignore_index=True,
        sort=False)

    # reorder columns:
    cols_to_order = [
        'name', 'bgg_game_id', 'bga_game_id', 'year_published', 'min_players',
        'max_players', 'min_playtime', 'max_playtime', 'min_age',
        'bgg_average_user_rating', 'bga_average_user_rating',
        'bgg_num_user_ratings', 'bga_num_user_ratings'
    ]
    new_columns = cols_to_order + (
        games_df.columns.drop(cols_to_order).tolist())
    games_df = games_df[new_columns]

    # create new unique key_column:
    games_df.insert(0, 'game_key', range(100001, 100001 + len(games_df)))

    # create key_csv that contains bga_game_id, bgg_game_id and game_key:
    key_df = games_df[['game_key', 'bga_game_id', 'bgg_game_id']]

    # check if there are any duplicates in game_df:
    games_df_duplicates = len(games_df) - len(games_df.drop_duplicates())

    if games_df_duplicates > 0:
        print('Warning. ' + str(games_df_duplicates) +
              ' duplicates found in BoardGameTable: ')
        games_df.drop_duplicates(inplace=True)
        print('Duplicates removed!')

    # check if there are any duplicates in key_df:
    count_duplicates_bgg = len(key_df[~key_df['bgg_game_id'].isnull()]) - len(
        key_df[~key_df['bgg_game_id'].isnull()].drop_duplicates(
            subset='bgg_game_id'))
    count_duplicates_bga = len(key_df[~key_df['bga_game_id'].isnull()]) - len(
        key_df[~key_df['bga_game_id'].isnull()].drop_duplicates(
            subset='bga_game_id'))

    if (count_duplicates_bgg + count_duplicates_bga) > 0:
        print('Warning. Duplicates found: ')
        print('BGG_game_ids: ' + str(count_duplicates_bgg))
        print('BGA_game_ids: ' + str(count_duplicates_bga))
        key_df = key_df.drop_duplicates()
        print('Duplicates removed!')

    # Fix badly encoded symbols in the game descriptions by replacing
    # HTML entities with plain-text equivalents. The patterns are applied
    # in order as regular expressions:
    entity_replacements = {
        # quotation marks:
        r'&quot;': "'",
        r'&rdquo;': "'",
        r'&rsquo;': "'",
        r'&ldquo;': "'",
        r'&amp;': '&',
        r'&eacute;': 'e',
        # Umlaute:
        r'&auml;': 'ä',
        r'&Uuml;': 'ü',
        r'&uuml;': 'ü',
        r'&ouml;': 'ö',
        r'&szlig;': 'ß',
        # dashes & non-breaking space:
        r'&ndash;': '-',
        r'&mdash;': '-',
        r'&nbsp;': ' ',
        r'&times;': 'x',
        r'&shy;': '-',
        # kick leftover entity fragments; note that '.' matches any
        # character, so these also strip other text ending in a semicolon:
        r'&#...;': '',
        r'&#..;': ' ',
        r'&#.;': '',
        r'.....;': '',
        r'....;': '',
        r'...;': '',
        r'..;': '',
        r'.;': '',
        # double semicolons:
        r';;': ' ',
    }
    for pattern, replacement in entity_replacements.items():
        games_df['game_description'] = games_df['game_description'].str.replace(
            pattern, replacement, regex=True)

    # collapse multiple spaces and strip leading/trailing whitespace:
    games_df['game_description'] = games_df['game_description'].str.replace(
        r' +', ' ', regex=True)
    games_df['game_description'] = games_df['game_description'].str.strip()

    # export to csv:
    export_df_to_csv(games_df, '../Data/Joined/Results/BoardGames.csv')
    export_df_to_csv(
        key_df,
        '../Data/Joined/Integration/GameKeys/Keys_All_Games_Integrated.csv')