Exemplo n.º 1
0
def unite_imdb_profiles(verbose):
    """Unite all movie profiles in the IMDB profile directory.

    Every ``.json`` file in the IMDB profile directory is loaded, the
    resulting table is passed through the ``_decompose_dict_column`` and
    ``_dummy_list_column`` helpers, and the result is written to
    ``imdb_dataset.csv`` in the dataset directory.

    If the profile directory does not exist, or contains no ``.json``
    profiles, a message is printed and nothing is written.

    Parameters
    ----------
    verbose : bool
        If true, a start message and a progress bar are printed.
    """
    if verbose:
        print("Uniting IMDB movie profiles to one csv file...")
    if not os.path.exists(_IMDB_DIR_PATH):
        print("No IMDB profiles to unite!")
        return
    profiles = []
    profile_files = os.listdir(_IMDB_DIR_PATH)
    if verbose:
        profile_files = tqdm(profile_files)
    for profile_file in profile_files:
        if verbose:
            profile_files.set_description('Reading {}'.format(profile_file))
        file_path = os.path.join(_IMDB_DIR_PATH, profile_file)
        _, ext = os.path.splitext(file_path)
        if ext == '.json':
            with open(file_path, 'r') as json_file:
                profiles.append(json.load(json_file))
    if not profiles:
        # An existing directory without any profile would produce a frame
        # with no columns; the column helpers below are handed column names
        # that would then be missing (presumably a KeyError), so bail out the
        # same way as for a missing directory.
        print("No IMDB profiles to unite!")
        return
    df = pd.DataFrame(profiles)
    df = _decompose_dict_column(df, 'avg_rating_per_demo', _DEMOGRAPHICS)
    df = _decompose_dict_column(df, 'votes_per_demo', _DEMOGRAPHICS)
    df = _decompose_dict_column(df, 'rating_freq',
                                [str(i) for i in range(1, 11)])
    df = _dummy_list_column(df, 'genres')
    dataset_dir = _get_dataset_dir_path()
    # Make sure the target directory exists before writing, as build_csv does.
    os.makedirs(dataset_dir, exist_ok=True)
    unison_fpath = os.path.join(dataset_dir, 'imdb_dataset.csv')
    df.to_csv(unison_fpath, index=False)
Exemplo n.º 2
0
def build_united_profiles(verbose):
    """Build movie profiles with data from all resources.

    For every profile name returned by ``_prof_names_in_all_resources`` the
    IMDB and Metacritic JSON profiles are loaded and merged into a single
    dict (Metacritic values win on key clashes), which is then written to
    the united profiles directory under the same file name.

    Parameters
    ----------
    verbose : bool
        If true, a start message and a progress bar are printed.
    """
    def _load_profile(dir_path, fname):
        # Read one JSON profile from the given resource directory.
        with open(os.path.join(dir_path, fname), 'r') as source:
            return json.load(source)

    os.makedirs(_UNITED_DIR_PATH, exist_ok=True)
    names = sorted(_prof_names_in_all_resources())
    if verbose:
        print("Building movie profiles with data from all resources.")
        names = tqdm(names)
    for name in names:
        json_name = name + '.json'
        imdb_data = _load_profile(_IMDB_DIR_PATH, json_name)
        meta_data = _load_profile(_METACRITIC_DIR_PATH, json_name)
        merged = {**imdb_data, **meta_data}
        target_path = os.path.join(_UNITED_DIR_PATH, json_name)
        with open(target_path, 'w+') as target:
            json.dump(merged, target, indent=2, sort_keys=True)
Exemplo n.º 3
0
 def test_load_unsupported_type(self):
     """Testing load of an object tagged with an unsupported type.

     The nested object carries a ``__type__`` value the test expects to be
     left alone, so it should come back as a plain dict with the tag intact.
     """
     with open('tests/unsupported_type.json', 'r') as source:
         loaded = morejson.load(source)
     pet = {
         "name": "Trippy Jack",
         "age": 20762,
         "__type__": "hyperdimensional.hamster",
     }
     self.assertEqual({"name": "Kevin", "age": 21, "pet": pet}, loaded)
Exemplo n.º 4
0
 def test_load_bad_datetime_arg(self):
     """Testing load of a date object that carries an unexpected argument.

     The ``datetime.date``-tagged object includes a ``bad_arg`` entry; the
     test expects it to come back as a plain dict with all entries intact.
     """
     with open('tests/bad_datetime_arg.json', 'r') as source:
         loaded = morejson.load(source)
     closing_date = {
         "bad_arg": 12,
         "month": 10,
         "year": 2013,
         "day": 18,
         "__type__": "datetime.date",
     }
     self.assertEqual(
         {"release_day": 2, "closing_date": closing_date}, loaded)
Exemplo n.º 5
0
 def test_dump_monkey(self):
     """Testing dump and load of a custom type via user-supplied hooks.

     A ``_Monkey`` instance is written with a custom ``default`` encoder and
     read back with a custom ``object_hook``; the result must equal the
     original dict.
     """
     try:
         _build_test_dirs()
         original = {"my_pet": TestDump._Monkey("Johnny", 54)}
         with open(_TEST_FILE, 'w+') as out_file:
             morejson.dump(
                 original, out_file,
                 default=TestDump._monkey_default_encoder)
         with open(_TEST_FILE, 'r') as in_file:
             restored = morejson.load(
                 in_file, object_hook=TestDump._monkey_object_hook)
         self.assertEqual(original, restored)
     finally:
         # Always remove the test directories, even when the test fails.
         _dismantle_test_dirs()
Exemplo n.º 6
0
 def test_dumps_date(self):
     """Testing that a date survives a dump/load round trip through a file.

     The date is stored next to ordinary JSON values so that those are
     checked to round-trip unchanged as well.
     """
     try:
         _build_test_dirs()
         original = {'date': datetime.date.today()}
         # Ordinary JSON values that accompany the special one.
         original.update({
             'array': [1, 2, 3],
             'string': 'trololo',
             'int': 1,
             'float': 4.32,
             'true': True,
             'false': False,
             'null': None,
         })
         with open(_TEST_FILE, 'w+') as out_file:
             morejson.dump(original, out_file)
         with open(_TEST_FILE, 'r') as in_file:
             restored = morejson.load(in_file)
         self.assertEqual(original, restored)
     finally:
         # Always remove the test directories, even when the test fails.
         _dismantle_test_dirs()
Exemplo n.º 7
0
 def test_dumps_complex(self):
     """Testing that complex numbers survive a dump/load round trip.

     Two complex values are stored next to ordinary JSON values so that
     those are checked to round-trip unchanged as well.
     """
     try:
         _build_test_dirs()
         original = {
             'complex1': complex(1, 34.2),
             'complex2': complex(-98.213, 91823),
         }
         # Ordinary JSON values that accompany the special ones.
         original.update({
             'array': [1, 2, 3],
             'string': 'trololo',
             'int': 1,
             'float': 4.32,
             'true': True,
             'false': False,
             'null': None,
         })
         with open(_TEST_FILE, 'w+') as out_file:
             morejson.dump(original, out_file)
         with open(_TEST_FILE, 'r') as in_file:
             restored = morejson.load(in_file)
         self.assertEqual(original, restored)
     finally:
         # Always remove the test directories, even when the test fails.
         _dismantle_test_dirs()
Exemplo n.º 8
0
 def test_dumps_timedelta(self):
     """Testing that timedeltas survive a dump/load round trip.

     Three timedelta values are stored next to ordinary JSON values so that
     those are checked to round-trip unchanged as well.
     """
     try:
         _build_test_dirs()
         original = {
             'timedelta1': datetime.timedelta(days=392),
             'timedelta2': datetime.timedelta(weeks=2, hours=23),
             'timedelta3': datetime.timedelta(microseconds=27836),
         }
         # Ordinary JSON values that accompany the special ones.
         original.update({
             'array': [1, 2, 3],
             'string': 'trololo',
             'int': 1,
             'float': 4.32,
             'true': True,
             'false': False,
             'null': None,
         })
         with open(_TEST_FILE, 'w+') as out_file:
             morejson.dump(original, out_file)
         with open(_TEST_FILE, 'r') as in_file:
             restored = morejson.load(in_file)
         self.assertEqual(original, restored)
     finally:
         # Always remove the test directories, even when the test fails.
         _dismantle_test_dirs()
Exemplo n.º 9
0
 def test_dumps_datetime_with_fold(self):
     """Testing dump and load of datetime objects that carry ``fold``.

     The ``fold`` argument of ``datetime.datetime`` is only available from
     Python 3.6 on, so the test is reported as skipped on older
     interpreters instead of silently passing.
     """
     # Compare as a tuple: checking major and minor separately misjudges
     # any major version above 3 whose minor version is below 6.
     if sys.version_info < (3, 6):
         self.skipTest("datetime 'fold' requires Python 3.6 or later")
     try:
         _build_test_dirs()
         dt = datetime.datetime(year=2012, month=10, day=10, fold=1)
         dicti = {
             'datetime': dt,
             'array': [1, 2, 3],
             'string': 'trololo',
             'int': 1,
             'float': 4.32,
             'true': True,
             'false': False,
             'null': None
         }
         with open(_TEST_FILE, 'w+') as fileobj:
             morejson.dump(dicti, fileobj)
         with open(_TEST_FILE, 'r') as fileobj:
             self.assertEqual(dicti, morejson.load(fileobj))
     finally:
         # Always remove the test directories, even when the test fails.
         _dismantle_test_dirs()
Exemplo n.º 10
0
def build_csv(verbose):
    """Build movie dataset from united profiles.

    Loads every ``.json`` file of the united profiles directory, keeps only
    the rows that have an opening weekend date, adds per-source review
    statistics and opening-date columns, and writes the table to
    ``movies_dataset.csv`` in the dataset directory.

    Parameters
    ----------
    verbose : bool
        If true, a progress bar is shown while the profiles are read.
    """
    # Collect the united profiles.
    loaded = []
    entries = os.listdir(_UNITED_DIR_PATH)
    if verbose:
        entries = tqdm(entries)
    for entry in entries:
        if verbose:
            entries.set_description('Reading {}'.format(entry))
        entry_path = os.path.join(_UNITED_DIR_PATH, entry)
        if os.path.splitext(entry_path)[1] != '.json':
            continue
        with open(entry_path, 'r') as entry_file:
            loaded.append(json.load(entry_file))

    # Flatten some dict or array columns.
    df = pd.DataFrame(loaded)
    has_opening = df['opening_weekend_date'].notnull()
    df = df[has_opening]
    rating_keys = [str(i) for i in range(1, 11)]
    for dict_col, keys in (
            ('avg_rating_per_demo', _DEMOGRAPHICS),
            ('votes_per_demo', _DEMOGRAPHICS),
            ('rating_freq', rating_keys)):
        df = holcrawl.imdb_crawl._decompose_dict_column(df, dict_col, keys)
    df = holcrawl.imdb_crawl._dummy_list_column(df, 'genres')

    # Review statistics: the same four columns for every review source.
    review_sources = (
        ('mc_critic', 'mc_pro_critic_reviews'),
        ('mc_user', 'mc_user_reviews'),
        ('imdb_user', 'imdb_user_reviews'),
    )
    for tag, reviews_col in review_sources:
        df['num_' + tag] = df.apply(
            lambda row: len(row[reviews_col]), axis=1)
        df['avg_' + tag] = df.apply(
            _avg_review_generator(reviews_col), axis=1)
        df['num_' + tag + '_by_opening'] = df.apply(
            _num_reviews_by_opening_generator(reviews_col), axis=1)
        df['avg_' + tag + '_by_opening'] = df.apply(
            _avg_review_by_opening_generator(reviews_col), axis=1)

    # Columns derived from the opening weekend date.
    opening_dates = df['opening_weekend_date']
    df['opening_month'] = opening_dates.map(lambda date: date.month)
    df['opening_day'] = opening_dates.map(lambda date: date.day)
    df['opening_day_of_year'] = opening_dates.map(
        lambda date: date.timetuple().tm_yday)

    # Save to file.
    out_dir = holcrawl.shared._get_dataset_dir_path()
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, 'movies_dataset.csv'), index=False)