# (INSERT statement, key into the scraped info dict) for every relation table
# that links a movie to one of its attributes. Each listed value is inserted
# as the row (value, movie_id).
_RELATION_INSERTS = (
    ("INSERT INTO `director_movie` VALUES (%s, %s)", "director_id"),
    ("INSERT INTO `actor_movie` VALUES (%s, %s)", "actor_id"),
    ("INSERT INTO `writer_movie` VALUES (%s, %s)", "screenwriter_id"),
    ("INSERT INTO `genre_movie` VALUES (%s, %s)", "类型"),  # genre
    ("INSERT INTO `region_movie` VALUES (%s, %s)", "制片国家/地区"),  # country/region
    ("INSERT INTO `language_movie` VALUES (%s, %s)", "语言"),  # language
)


def extract_data():
    """Scrape every movie id listed in ``movie.txt`` and store it in MySQL.

    Reads one movie id per line from ``movie.txt`` (blank lines are skipped),
    then for each id: waits 3 seconds, calls ``DoubanAPI.search`` and
    ``DoubanAPI.info``, inserts one row into ``movie`` and one row per
    director / actor / screenwriter / genre / region / language into the
    matching relation table. When all ids are processed it sleeps and then
    calls ``filter_data(db)``.

    Raises:
        Exception: "Database creation failed!" if ``create_db`` raises, or
            "Ip banned!" if the search or the info lookup raises. The
            original error is logged and attached as ``__cause__``.
    """
    # Load the id list; the context manager guarantees the file is closed.
    with open("movie.txt", "r") as id_file:
        id_list = [line.strip() for line in id_file]
    # A blank line (e.g. a trailing newline at EOF) is not a movie id.
    id_list = [movie_id for movie_id in id_list if movie_id]

    # setup
    logger = Logger("file").getLogger()

    with MyDb("localhost", "root", "", 3306) as db, DoubanAPI() as D:
        # create database "movie_info", comment this block if database already created
        try:
            create_db(db)
        except Exception as e:
            logger.error(e)
            raise Exception("Database creation failed!") from e

        # connect to "movie_info"
        db.connect_to_db("movie_info")

        # begin extraction and insertion
        for movie_id in id_list:
            time.sleep(3)  # simulate user behaviour
            # extraction
            try:
                D.search(movie_id)
                logger.info("Search success for id: " + str(movie_id))
            except Exception as e:  # any failure here is treated as an ip ban
                logger.error(e)
                raise Exception("Ip banned!") from e
            try:
                # Must be the API object `D`; the lowercase `d` used previously
                # was an unrelated (and initially unbound) loop variable.
                info = D.info()
            except Exception as e:  # any failure here is treated as an ip ban
                logger.error(e)
                raise Exception("Ip banned!") from e

            # insert into entity 'movie'
            query = "INSERT INTO `movie` VALUES (%s, %s, %s, %s, %s)"
            param = (movie_id, ",".join(info["上映日期"]), info["rating"],
                     info["rating_count"], ",".join(info["rating_weight"]))
            try:
                db.execute(query, param)
            except Exception as e:  # e.g. duplicate id in the txt file
                logger.error(e)
                logger.info("Failure in id: " + str(movie_id))
                # Without the movie row there is nothing to attach relations to.
                continue

            # insert into relations; a failing row is logged and skipped so the
            # remaining rows of this movie are still attempted
            for query, info_key in _RELATION_INSERTS:
                for value in info[info_key]:
                    try:
                        db.execute(query, (value, movie_id))
                    except Exception as e:
                        logger.error(e)

            # if success
            logger.info("Success in id: " + str(movie_id))
        logger.info("Finish insertion")

        # NOTE(review): this blocks for 3000 seconds (50 minutes). It was added
        # to "wait until insertion finish successfully"; confirm whether
        # db.execute is really asynchronous before keeping such a long pause.
        time.sleep(3000)
        filter_data(db)
Exemplo n.º 2
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from logger_class import Logger

# Build a logger from the project-local Logger class. The argument looks like
# a log-file path relative to the working directory — TODO confirm against
# logger_class.Logger.
logger_object = Logger("var/log/class_logger.log")

# Emit one sample message through the logger's error() method.
logger_object.error("This is a error message")

Exemplo n.º 3
0
def region_info(db):
    """Run the three region-related queries and hand each to ``db.execute_to_csv``.

    In order, the exports are:

    1. ``./data/region_avg.csv`` - per region: number of movies and their
       average rating.
    2. ``./data/region_pred_avg.csv`` - per movie: the mean of the average
       ratings of the regions the movie belongs to.
    3. ``./data/region_empty.csv`` - id and own rating of every movie that
       has no row in ``region_movie``. It reuses the header of export 2.

    Args:
        db: object exposing ``execute_to_csv(query, param, path, header)``.
    """
    per_region_sql = (
        "SELECT region, COUNT(movie_ID) AS movie_count, AVG(rating) AS avg_rating FROM `region_movie` JOIN movie "
        "WHERE movie_ID = ID GROUP BY region "
    )
    per_movie_sql = (
        "SELECT ID, AVG(avg_r) AS avg_rating FROM movie JOIN region_movie JOIN (SELECT region, COUNT(movie_ID) AS "
        "movie_count, AVG(rating) AS avg_r FROM `region_movie` JOIN movie WHERE movie_ID = ID GROUP BY region) AS "
        "T WHERE ID = region_movie.movie_ID AND region_movie.region = T.region GROUP BY ID "
    )
    no_region_sql = "SELECT ID, rating FROM `movie` WHERE ID NOT IN (SELECT movie_ID FROM region_movie)"

    region_columns = ("region", "movie_count", "avg_rating")
    movie_columns = ("movie_id", "region_rating")

    exports = (
        (per_region_sql, "./data/region_avg.csv", region_columns),
        (per_movie_sql, "./data/region_pred_avg.csv", movie_columns),
        (no_region_sql, "./data/region_empty.csv", movie_columns),
    )
    # None of the queries takes parameters, hence the constant None.
    for sql, csv_path, columns in exports:
        db.execute_to_csv(sql, None, csv_path, columns)


if __name__ == "__main__":
    # Script entry point: set up logging and the output directory, then run
    # every export against the "movie_info" database.
    logger = Logger("file").getLogger()
    make_dir()
    with MyDb("localhost", "root", "", 3306, "movie_info") as db:
        # filter_data runs outside the try block, so a failure there propagates.
        filter_data(db)
        try:
            # Steps run in this order; the first one that raises stops the
            # rest, and its error is logged rather than re-raised.
            for export_step in (
                get_movie,
                actor_info,
                director_info,
                writer_info,
                genre_info,
                language_info,
                region_info,
            ):
                export_step(db)
        except Exception as e:
            logger.error(e)