Exemplo n.º 1
0
    def __init__(self):
        """Open the database connections and prepare the IMDb API client.

        Creates the source and target connections, an HTTPS connection to
        the RapidAPI IMDb host together with its request headers, an empty
        category list, and the mapping from category name to the encoded
        genre chart path.
        """
        imdb_host = "imdb8.p.rapidapi.com"

        self.sourceConnection = SourceConnection()
        self.targetConnection = TargetConnection()
        self.imdbConnection = http.client.HTTPSConnection(imdb_host)

        # Headers sent with every request made through imdbConnection.
        self.imdbHeaders = dict((
            ('x-rapidapi-host', imdb_host),
            ('x-rapidapi-key',
             "fee7fa13a7mshd61881af4799557p172871jsndb85a36967f3"),
        ))

        self.categories = list()

        # Category name -> encoded genre chart path.
        self.availableEndpoints = dict((
            ("Action", "%252Fchart%252Fpopular%252Fgenre%252Faction"),
            ("Animation", "%252Fchart%252Fpopular%252Fgenre%252Fanimation"),
            ("Comedy", "%252Fchart%252Fpopular%252Fgenre%252Fcomedy"),
            ("Documentary", "%252Fchart%252Fpopular%252Fgenre%252Fdocumentary"),
            ("Drama", "%252Fchart%252Fpopular%252Fgenre%252Fdrama"),
            ("Family", "%252Fchart%252Fpopular%252Fgenre%252Ffamily"),
            ("Horror", "%252Fchart%252Fpopular%252Fgenre%252Fhorror"),
            ("Music", "%252Fchart%252Fpopular%252Fgenre%252Fmusic"),
            ("Sci-Fi", "%252Fchart%252Fpopular%252Fgenre%252Fsci_fi"),
            ("Sport", "%252Fchart%252Fpopular%252Fgenre%252Fsport"),
        ))
Exemplo n.º 2
0
class ETL2:
    """Load the most-rented film categories per country and year.

    A temporary table holding the rental count per (country, category, year)
    is created on the source connection. For every (country, year) pair the
    categories whose rental count equals the maximum for that pair are then
    inserted into ``rents_by_category_country_and_year`` on the target
    connection.
    """

    sourceConnection: "SourceConnection"
    targetConnection: "TargetConnection"

    # Number of (country, year) rows fetched from the source per round.
    _PAGE_SIZE = 1000

    def __init__(self):
        """Create the source and target connections."""
        self.sourceConnection = SourceConnection()
        self.targetConnection = TargetConnection()

    @staticmethod
    def _quote(text):
        """Return *text* as a single-quoted SQL string literal.

        Backslashes and single quotes are escaped so that a value such as
        ``Cote d'Ivoire`` cannot terminate the literal early.
        """
        # NOTE(review): assumes MySQL's default sql_mode, where backslash is
        # an escape character inside string literals -- confirm. Bound
        # parameters would be preferable if runQuery() supports them.
        escaped = text.replace("\\", "\\\\").replace("'", "''")
        return "'" + escaped + "'"

    def startETL2(self):
        """Run the whole process: build the temporary table, then load."""
        self.startTemporalDatabases()
        self.startTransformationsAndLoads()

    def startTemporalDatabases(self):
        """Create the temporary rental-count table on the source connection."""
        query = """
        CREATE TEMPORARY TABLE rents_by_category_country_and_year
        (SELECT COUNT(rental.rental_date) as rental, YEAR(rental.rental_date)as year, category.name as category, country.country as country
            FROM rental 
            LEFT JOIN inventory ON(rental.inventory_id = inventory.inventory_id)
            LEFT JOIN film ON (inventory.film_id = film.film_id)
            LEFT JOIN film_category ON (film.film_id = film_category.film_id)
            LEFT JOIN category ON (film_category.category_id = category.category_id)
            LEFT JOIN customer ON (rental.customer_id = customer.customer_id)
            LEFT JOIN address ON ( customer.address_id = address.address_id )
            LEFT JOIN city ON (address.city_id = city.city_id)
            LEFT JOIN country ON (city.country_id = country.country_id)
            GROUP BY country, category, year ORDER BY country ASC, year DESC, rental DESC);"""

        self.sourceConnection.runQuery(query)

    def startTransformationsAndLoads(self):
        """Insert the top categories per (country, year) into the target.

        The (country, year) maxima are read in pages of ``_PAGE_SIZE`` rows.
        For each row the matching categories are looked up in the temporary
        table and one row per category is inserted and committed.
        """
        count_query = "SELECT COUNT(*) FROM rents_by_category_country_and_year;"
        row_count = self.sourceConnection.runQuery(count_query)[0][0]

        # The temporary table has at least as many rows as the grouped query
        # below returns, so this number of rounds always covers every page.
        rounds = int(row_count // self._PAGE_SIZE) + 1

        for page in range(rounds):
            page_query = (
                "SELECT MAX(rental) as rental, country, year FROM rents_by_category_country_and_year GROUP BY year, country ORDER BY country ASC, year DESC"
                + " LIMIT " + str(page * self._PAGE_SIZE)
                + ", " + str(self._PAGE_SIZE))

            for rental, country, year in self.sourceConnection.runQuery(page_query):
                country_literal = self._quote(country)

                query_category = (
                    "SELECT category FROM rents_by_category_country_and_year WHERE year = "
                    + str(year) + " AND country = " + country_literal
                    + " AND rental = " + str(rental) + ";")

                categories = self.sourceConnection.runQuery(query_category)

                # Several categories can tie for the maximum; each gets a row.
                for category in categories:
                    query_insert = (
                        "INSERT INTO rents_by_category_country_and_year (rental, year, category, country) VALUES ("
                        + str(rental) + ", " + str(year) + ", "
                        + self._quote(category[0]) + ", "
                        + country_literal + " )")

                    self.targetConnection.runQuery(query_insert)
                    self.targetConnection.commitChanges()
Exemplo n.º 3
0
class ETL3:
    """Load the money lost on rentals that were never returned.

    Rentals whose ``return_date`` is NULL are aggregated per film on the
    source connection, and one row per film is inserted into
    ``money_lost_on_rentals`` on the target connection.
    """

    sourceConnection: "SourceConnection"
    targetConnection: "TargetConnection"

    def __init__(self):
        """Create the source and target connections."""
        self.sourceConnection = SourceConnection()
        self.targetConnection = TargetConnection()

    @staticmethod
    def _sqlLiteral(value):
        """Render *value* as SQL text for use inside a VALUES list.

        ``None`` becomes ``NULL``, strings become escaped single-quoted
        literals, and anything else is rendered with ``str()``.
        """
        if value is None:
            # The LEFT JOINs can yield NULL columns (e.g. no matching payment).
            return "NULL"
        if isinstance(value, str):
            # NOTE(review): assumes MySQL's default sql_mode, where backslash
            # is an escape character inside string literals -- confirm. Bound
            # parameters would be preferable if runQuery() supports them.
            escaped = value.replace("\\", "\\\\").replace("'", "''")
            return "'" + escaped + "'"
        return str(value)

    def startETL3(self):
        """Run the whole process."""
        self.startTransformationsAndLoads()

    def startTransformationsAndLoads(self):
        """Aggregate unreturned rentals per film and insert them into the target.

        The aggregate query carries no LIMIT/OFFSET and returns every row in
        one call, so it is executed exactly once; running it once per
        1000-row "round" would insert every row again on each round.
        """
        # NOTE(review): payment is joined on payment.payment_id rather than
        # payment.rental_id -- confirm this is intended against the schema.
        query_costs = """
            SELECT COUNT(rental.rental_id) as total_copies_lost, film.title, SUM(payment.amount) as total_rental_lost, SUM(film.replacement_cost) as total_replacement_cost
                FROM rental 
                LEFT JOIN inventory ON(rental.inventory_id = inventory.inventory_id)
                LEFT JOIN film ON(inventory.film_id = film.film_id)
                LEFT JOIN payment ON(rental.rental_id = payment.payment_id)
                WHERE return_date IS NULL
                GROUP BY film.title, film.special_features ORDER BY total_copies_lost DESC, total_replacement_cost DESC;"""

        costs = self.sourceConnection.runQuery(query_costs)

        for cost in costs:
            values = ", ".join(self._sqlLiteral(column) for column in cost[:4])
            query_insert = (
                "INSERT INTO money_lost_on_rentals (total_copies_lost, title, total_rental_lost, total_replacement_cost) VALUES "
                + "(" + values + ")")

            self.targetConnection.runQuery(query_insert)
            self.targetConnection.commitChanges()
Exemplo n.º 4
0
 def __init__(self):
     """Create the source and target connections used by this instance."""
     source = SourceConnection()
     self.sourceConnection = source

     target = TargetConnection()
     self.targetConnection = target
Exemplo n.º 5
0
class ETL4:
    """Compare the top Sakila films per category with IMDb's popular titles.

    For every category in the source database, up to five Sakila films are
    paired with five IMDb rows for the same category, and each pair is
    inserted into ``top_5_popularity_comparison`` on the target connection.
    Categories without an IMDb endpoint are paired with placeholder rows.
    """

    sourceConnection: "SourceConnection"
    targetConnection: "TargetConnection"
    imdbConnection: "http.client.HTTPSConnection"
    imdbHeaders: dict
    availableEndpoints: dict
    categories: list

    # Number of IMDb rows always returned per category.
    _TOP_SIZE = 5

    def __init__(self):
        """Create the database connections and prepare the IMDb API client."""
        self.sourceConnection = SourceConnection()
        self.targetConnection = TargetConnection()

        self.imdbConnection = http.client.HTTPSConnection(
            "imdb8.p.rapidapi.com")
        # NOTE(review): the API key is committed in source. Consider loading
        # it from configuration and rotating this key.
        self.imdbHeaders = {
            'x-rapidapi-host': "imdb8.p.rapidapi.com",
            'x-rapidapi-key':
            "fee7fa13a7mshd61881af4799557p172871jsndb85a36967f3"
        }

        self.categories = []

        # Category name -> encoded genre chart path used as the "genre"
        # query value of the popular-movies endpoint.
        self.availableEndpoints = {
            "Action": "%252Fchart%252Fpopular%252Fgenre%252Faction",
            "Animation": "%252Fchart%252Fpopular%252Fgenre%252Fanimation",
            "Comedy": "%252Fchart%252Fpopular%252Fgenre%252Fcomedy",
            "Documentary": "%252Fchart%252Fpopular%252Fgenre%252Fdocumentary",
            "Drama": "%252Fchart%252Fpopular%252Fgenre%252Fdrama",
            "Family": "%252Fchart%252Fpopular%252Fgenre%252Ffamily",
            "Horror": "%252Fchart%252Fpopular%252Fgenre%252Fhorror",
            "Music": "%252Fchart%252Fpopular%252Fgenre%252Fmusic",
            "Sci-Fi": "%252Fchart%252Fpopular%252Fgenre%252Fsci_fi",
            "Sport": "%252Fchart%252Fpopular%252Fgenre%252Fsport",
        }

    @staticmethod
    def _quote(text):
        """Return *text* as a single-quoted SQL string literal.

        Backslashes and single quotes are escaped so that a title such as
        ``Ocean's Eleven`` cannot terminate the literal early.
        """
        # NOTE(review): assumes MySQL's default sql_mode, where backslash is
        # an escape character inside string literals -- confirm. Bound
        # parameters would be preferable if runQuery() supports them.
        escaped = text.replace("\\", "\\\\").replace("'", "''")
        return "'" + escaped + "'"

    def _imdbGet(self, path):
        """Send a GET request for *path* to the IMDb API and decode its JSON body."""
        self.imdbConnection.request("GET", path, headers=self.imdbHeaders)

        response = self.imdbConnection.getresponse()
        return json.loads(response.read().decode("utf-8"))

    def startETL4(self):
        """Run the whole process and insert one comparison row per film pair."""
        self.loadCategories()

        self.loadTemporaryTable()

        for category in self.categories:
            top5Sakila = self.getTop5PerCategorySakila(category)
            top5IMDb = self.getTop5PerCategoryIMDb(category)

            # The IMDb list always has five rows, so pairing stops at the
            # number of Sakila rows available for the category.
            for sakila_row, imdb_row in zip(top5Sakila, top5IMDb):
                query_insert = (
                    "INSERT INTO top_5_popularity_comparison "
                    "(sakila_film_title, sakila_film_popularity, sakila_category, imdb_film_title, imdb_film_popularity, imdb_film_category) VALUES ("
                    + self._quote(sakila_row[0]) + ", "
                    + str(sakila_row[1]) + ","
                    + self._quote(sakila_row[2]) + ","
                    + self._quote(imdb_row[0]) + ","
                    # Rendered unquoted: a number, or the bare word NULL.
                    + str(imdb_row[1]) + ","
                    + self._quote(imdb_row[2])
                    + ");")

                self.targetConnection.runQuery(query_insert)
                self.targetConnection.commitChanges()

    def getPopularMoviesByCategoryIMDb(self, category):
        """Return the decoded popular-movies response for the genre value *category*."""
        return self._imdbGet(
            "/title/get-popular-movies-by-genre?genre=" + category)

    def getMovieDetailsIMDb(self, name):
        """Return the decoded details response for the title id *name*."""
        return self._imdbGet("/title/get-details?tconst=" + name)

    def getRatingsPerMovieIMDb(self, name):
        """Return the decoded ratings response for the title id *name*."""
        return self._imdbGet("/title/get-ratings?tconst=" + name)

    def loadCategories(self):
        """Append every category name from the source database to ``self.categories``."""
        query_categories = "SELECT name FROM category"

        categories = self.sourceConnection.runQuery(query_categories)

        self.categories.extend(category[0] for category in categories)

    def loadTemporaryTable(self):
        """Create the temporary per-film table queried by getTop5PerCategorySakila."""
        query_temporary = """ 
        CREATE TEMPORARY TABLE top_5_popularity_comparison
            (SELECT film.title, film.rental_rate, category.name as category
                FROM film 
                LEFT JOIN film_category ON (film.film_id = film_category.film_id)
                LEFT JOIN category ON (film_category.category_id = category.category_id)
                
                ORDER BY name ASC, rental_rate DESC);
        """

        self.sourceConnection.runQuery(query_temporary)

    def getTop5PerCategorySakila(self, category):
        """Return up to five rows of the temporary table for *category*.

        The query has no ORDER BY of its own, so which rows are returned
        depends on the order in which the temporary table yields them.
        """
        query_top = ("SELECT * FROM top_5_popularity_comparison WHERE category = "
                     + self._quote(category) + " LIMIT 0, 5;")

        return self.sourceConnection.runQuery(query_top)

    def getTop5PerCategoryIMDb(self, category):
        """Return exactly five ``(title, rate, category)`` tuples from IMDb.

        Categories without an entry in ``availableEndpoints`` yield five
        ``('NULL', 'NULL', category)`` placeholders. When the API returns
        fewer than five titles, or a payload that is not a list, the missing
        positions are filled with the same placeholder.
        """
        placeholder = ('NULL', 'NULL', category)

        if category not in self.availableEndpoints:
            return [placeholder] * self._TOP_SIZE

        movies = self.getPopularMoviesByCategoryIMDb(
            self.availableEndpoints[category])
        if not isinstance(movies, list):
            # Anything other than a list of title paths (for example an error
            # object) is treated as "no titles".
            movies = []

        top5IMDb = []
        for movie_path in movies[:self._TOP_SIZE]:
            # Entries look like "/title/<id>/"; the id is the third segment.
            movie_id = movie_path.split("/")[2]

            movie = self.getMovieDetailsIMDb(movie_id)
            rating = self.getRatingsPerMovieIMDb(movie_id)

            title = movie['title'] if 'title' in movie else 'NULL'
            # The rating is truncated to an integer before being halved.
            rate = (int(rating['rating']) / 2) if 'rating' in rating else 'NULL'

            top5IMDb.append((title, rate, category))

        top5IMDb.extend([placeholder] * (self._TOP_SIZE - len(top5IMDb)))
        return top5IMDb