예제 #1
0
class AreaEvents():
    def __init__(self, oltp_uri, stagedb_uri):
        self._db = DatabaseHelper(oltp_uri)
        self._dw = DatabaseHelper(stagedb_uri)

    def etl(self):
        try:
            df = self.__extract()
            transformed_df = self.__transform(df)
            self.__load(transformed_df)
            print('data loaded successfully')
        except Exception as e:
            print('error occured !!' + str(e))

    def __extract(self):
        return self._db.extract_table_to_pandas("musicbrainz.l_area_event",
                                                columns="id, entity0, entity1")

    def __transform(self, df):
        df["area_id"] = df["entity0"]
        df["event_id"] = df["entity1"]
        df = df.drop('entity0', 1)
        df = df.drop('entity1', 1)
        return df.sort_values(by=['id'], ascending=True)

    def __load(self, transformed_df):
        self._dw.load_df_into_dwh(transformed_df, "dim.area_events", "public")
예제 #2
0
class Genders():
    def __init__(self, oltp_uri, stagedb_uri):
        self._db = DatabaseHelper(oltp_uri)
        self._dw = DatabaseHelper(stagedb_uri)

    def etl(self):
        try:
            df = self.__extract()
            transformed_df = self.__transform(df)
            self.__load(transformed_df)
            print('data loaded successfully')
        except Exception as e:
            print('error occured !!' + str(e))

    def __extract(self):
        return self._db.extract_table_to_pandas(
            "musicbrainz.gender", columns="id, name, parent, child_order")

    def __transform(self, df):
        top_row = pd.DataFrame({
            'id': [0],
            'name': ['Unknown'],
            'parent': [None],
            'child_order': [0]
        })
        df = pd.concat([top_row, df]).reset_index(
            drop=True)  #Trick to add row to the line of base data framw
        return df.sort_values(by=['id'], ascending=True)

    def __load(self, transformed_df):
        self._dw.load_df_into_dwh(transformed_df, "dim.genders", "public")
예제 #3
0
class Places():
    def __init__(self, oltp_uri, stagedb_uri):
        self._db = DatabaseHelper(oltp_uri)
        self._dw = DatabaseHelper(stagedb_uri)

    def etl(self):
        try:
            df = self.__extract()
            transformed_df = self.__transform(df)
            self.__load(transformed_df)
            print('data loaded successfully')
        except Exception as e:
            print('error occured !!' + str(e))

    def __extract(self):
        return self._db.extract_table_to_pandas(
            "musicbrainz.place", columns="id, gid, name, type, area, ended")

    def __transform(self, df):
        df = df[df["gid"] != None]
        df['type'] = df.type.fillna(0)
        df['area'] = df.type.fillna(0)
        df["is_active"] = df["ended"]
        df = df.drop('gid', 1)
        return df.sort_values(by=['id'], ascending=True)

    def __load(self, transformed_df):
        self._dw.load_df_into_dwh(transformed_df, "dim.places", "public")

    def __create_index(self):
        self._dw.run_command(
            "CREATE INDEX index_name ON table_name (column_name);")
예제 #4
0
class Artists():
    def __init__(self, oltp_uri, stagedb_uri):       
        self._db = DatabaseHelper(oltp_uri)
        self._dw = DatabaseHelper(stagedb_uri)
        
    def etl(self):
        try:
            df = self.__extract()
            transformed_df = self.__transform(df)
            self.__load(transformed_df)
            print('data loaded successfully')
        except Exception as e:
            print('error occured !!' + str(e))
    
    def __extract(self):
        return self._db.extract_table_to_pandas("musicbrainz.artist", columns="id, gid, name, sort_name, type, area, gender, ended")
    
    def __transform(self, df):
        df = df[df["gid"] != None]
        df['type'] = df.type.fillna(0)
        df['area'] = df.type.fillna(0)
        df['gender'] = df.type.fillna(0)
        df["is_active"] =  df.ended.apply(lambda x : not(x))
        df = df.drop('ended', 1)
        return df.sort_values(by=['id'], ascending=True)

    def __load(self, transformed_df):
        self._dw.load_df_into_dwh(transformed_df, "dim.artists", "public")
예제 #5
0
class LinkTypes():
    def __init__(self, oltp_uri, stagedb_uri):
        self._db = DatabaseHelper(oltp_uri)
        self._dw = DatabaseHelper(stagedb_uri)

    def etl(self):
        try:
            df = self.__extract()
            transformed_df = self.__transform(df)
            self.__load(transformed_df)
            print('data loaded successfully')
        except Exception as e:
            print('error occured !!' + str(e))

    def __extract(self):
        return self._db.extract_table_to_pandas(
            "musicbrainz.link_type", columns="id, parent, name, child_order")

    def __transform(self, df):
        return df.sort_values(by=['id'], ascending=True)

    def __load(self, transformed_df):
        self._dw.load_df_into_dwh(transformed_df, "dim.link_types", "public")
예제 #6
0
class Events():
    def __init__(self, oltp_uri, stagedb_uri):
        self._db = DatabaseHelper(oltp_uri)
        self._dw = DatabaseHelper(stagedb_uri)

    def etl(self):
        try:
            df = self.__extract()
            transformed_df = self.__transform(df)
            return self.__load(transformed_df)
            print('data loaded successfully')
        except Exception as e:
            print('error occured !!' + str(e))

    def __extract(self):
        return self._db.extract_table_to_pandas(
            "musicbrainz.event",
            columns=
            "id, gid, begin_date_year, begin_date_month, begin_date_day, end_date_year, end_date_month, end_date_day, time, type, cancelled, ended"
        )

    def __transform(self, df):
        df = df[df["gid"] != None]
        df['begin_date_year'] = df.begin_date_year.fillna(0)
        df['end_date_year'] = df.end_date_year.fillna(0)
        df['begin_date_month'] = df.begin_date_month.fillna(0)
        df['end_date_month'] = df.end_date_month.fillna(0)

        df = df[df["begin_date_year"] != 0]
        df = df[df["end_date_year"] != 0]
        df = df[df["begin_date_month"] != 0]
        df = df[df["end_date_month"] != 0]

        df['begin_date_day'] = df.begin_date_day.fillna(1)
        df['end_date_day'] = df.end_date_day.fillna(1)

        df['begin_date_tmp'] = df['begin_date_year'].astype('int64').astype(
            str) + '/' + df['begin_date_month'].astype('int64').astype(
                str) + '/' + df['begin_date_day'].astype('int64').astype(str)
        df['begin_date'] = df['begin_date_tmp'].apply(lambda x: parse(x))
        df['begin_date_id'] = df['begin_date'].apply(
            lambda x: x.strftime('%Y%m%d'))

        df['end_date_tmp'] = df['end_date_year'].astype('int64').astype(
            str) + '/' + df['end_date_month'].astype('int64').astype(
                str) + '/' + df['end_date_day'].astype('int64').astype(str)
        df['end_date'] = df['end_date_tmp'].apply(lambda x: parse(x))
        df['end_date_id'] = df['end_date'].apply(
            lambda x: x.strftime('%Y%m%d'))

        df['duration'] = df['end_date'] - df['begin_date']  #).dt.days

        df = df.drop('begin_date_year', 1)
        df = df.drop('begin_date_month', 1)
        df = df.drop('begin_date_day', 1)
        df = df.drop('begin_date_tmp', 1)
        df = df.drop('begin_date', 1)

        df = df.drop('end_date_year', 1)
        df = df.drop('end_date_month', 1)
        df = df.drop('end_date_day', 1)
        df = df.drop('end_date_tmp', 1)
        df = df.drop('end_date', 1)

        df.begin_date_id = df.begin_date_id.astype('int64')
        df.end_date_id = df.end_date_id.astype('int64')

        return df.sort_values(by=['id'], ascending=True)

    def __load(self, transformed_df):
        self._dw.load_df_into_dwh(transformed_df, "dim.events", "public")