def featurize():
    """Build feature vectors for every movie and write train/test JSON splits.

    Movies with year < 2009 form the training set; 2009 and later form the
    test set.  Categorical fields (genres, mpaa, month) are one-hot encoded
    in place, and people/company entities are replaced by their ERM score
    from ERM2009.json, falling back to the median ERM for unseen names.
    Writes featurized4_train.json and featurized4_test.json.
    """
    zero_dict_map = generate_zero_dict_map()
    full_data = imdb.load_json('parsed.json')
    # list(...) keeps this correct if filter()/map() return iterators
    # (Python 3); under Python 2 it is a cheap copy.
    train = list(filter(lambda x: x['year'] < 2009, full_data))
    test = list(filter(lambda x: x['year'] >= 2009, full_data))

    ERM2009 = imdb.load_json('ERM2009.json')
    # Median ERM over all known names -- fallback value for unknown names.
    # dict.values() replaces zip(*items())[1], which breaks on Python 3
    # (zip objects are not subscriptable).
    ERM_AVG = median(list(ERM2009.values()))
    print('ERM_MED %s' % ERM_AVG)

    def dvectorize(kind, data):
        # One-hot encode `data` against the zeroed template for `kind`.
        # `data` may be a scalar or a list; values absent from the template
        # are silently ignored (their slot simply stays 0).
        zdict = deepcopy(zero_dict_map[kind])
        if not isinstance(data, list):
            data = [data]
        for feature in data:
            if feature in zdict:
                zdict[feature] = 1
        return zdict

    def apply_featurization(movie):
        # Mutates and returns `movie`: flattens one-hot categories into the
        # movie dict, adds per-star ERM columns, collapses entity lists to
        # their mean ERM, then drops raw fields listed in DELETE_THESE.
        for case in ['genres', 'mpaa', 'month']:
            case_dict = dvectorize(case, movie[case])
            for key in case_dict:
                movie[key] = case_dict[key]
        # Per-star ERM features: actor0, actor1, ...
        for j, star in enumerate(movie['stars']):
            movie['actor' + str(j)] = ERM2009.get(star['name'], ERM_AVG)
        # Each entity field (e.g. directors/writers) becomes one mean-ERM value.
        for e_field in E_FIELDS:
            vals = [ERM2009.get(entity['name'], ERM_AVG)
                    for entity in movie[e_field]]
            movie[e_field] = avg_float_list(vals)
        for thing in DELETE_THESE:
            del movie[thing]
        return movie

    featurized_train = list(map(apply_featurization, train))
    featurized_test = list(map(apply_featurization, test))
    with open('featurized4_train.json', 'w') as f:
        json.dump(featurized_train, f, sort_keys=True, indent=4)
    with open('featurized4_test.json', 'w') as f:
        json.dump(featurized_test, f, sort_keys=True, indent=4)
# NOTE(review): this is a duplicate definition of featurize() -- it silently
# shadows the earlier one.  Kept for now so callers see no change; one of the
# two copies should eventually be deleted.
def featurize():
    """Build feature vectors for every movie and write train/test JSON splits.

    Movies with year < 2009 form the training set; 2009 and later form the
    test set.  Categorical fields (genres, mpaa, month) are one-hot encoded
    in place, and people/company entities are replaced by their ERM score
    from ERM2009.json, falling back to the median ERM for unseen names.
    Writes featurized4_train.json and featurized4_test.json.
    """
    zero_dict_map = generate_zero_dict_map()
    full_data = imdb.load_json('parsed.json')
    # list(...) keeps this correct if filter()/map() return iterators
    # (Python 3); under Python 2 it is a cheap copy.
    train = list(filter(lambda x: x['year'] < 2009, full_data))
    test = list(filter(lambda x: x['year'] >= 2009, full_data))

    ERM2009 = imdb.load_json('ERM2009.json')
    # Median ERM over all known names -- fallback value for unknown names.
    # dict.values() replaces zip(*items())[1], which breaks on Python 3.
    ERM_AVG = median(list(ERM2009.values()))
    print('ERM_MED %s' % ERM_AVG)

    def dvectorize(kind, data):
        # One-hot encode `data` against the zeroed template for `kind`.
        # Accepts a scalar or a list; unknown values leave their slot at 0.
        zdict = deepcopy(zero_dict_map[kind])
        if not isinstance(data, list):
            data = [data]
        for feature in data:
            if feature in zdict:
                zdict[feature] = 1
        return zdict

    def apply_featurization(movie):
        # Mutates and returns `movie`: flattens one-hot categories, adds
        # per-star ERM columns, collapses entity lists to mean ERM, then
        # drops raw fields listed in DELETE_THESE.
        for case in ['genres', 'mpaa', 'month']:
            case_dict = dvectorize(case, movie[case])
            for key in case_dict:
                movie[key] = case_dict[key]
        for j, star in enumerate(movie['stars']):
            movie['actor' + str(j)] = ERM2009.get(star['name'], ERM_AVG)
        for e_field in E_FIELDS:
            vals = [ERM2009.get(entity['name'], ERM_AVG)
                    for entity in movie[e_field]]
            movie[e_field] = avg_float_list(vals)
        for thing in DELETE_THESE:
            del movie[thing]
        return movie

    featurized_train = list(map(apply_featurization, train))
    featurized_test = list(map(apply_featurization, test))
    with open('featurized4_train.json', 'w') as f:
        json.dump(featurized_train, f, sort_keys=True, indent=4)
    with open('featurized4_test.json', 'w') as f:
        json.dump(featurized_test, f, sort_keys=True, indent=4)
def filter_data():
    """Load master.json, normalize year fields, and return movies by year.

    The last entry of the raw year-sorted data is dropped (a known record
    with no fields).  Survivors are decided by the module-level `my_filter`.
    Returns the filtered movies sorted ascending by parsed integer year.
    """
    data = imdb.load_json('master.json')
    newdata = sorted(data, key=lambda k: k['year'])
    newdata.pop()  # get rid of last movie with no fields

    def assign_year(movie):
        # Year arrives as a string; chars [1:5] hold the 4 digits
        # (presumably wrapped in parens/brackets -- confirm against master.json).
        movie['year'] = int(str(movie['year'][1:5]))
        return movie

    filtered = sorted(filter(my_filter, map(assign_year, newdata)),
                      key=lambda k: k['year'])
    # Single-string print is valid under both Python 2 and Python 3
    # (the original `print '...' % n` statement is a py3 syntax error).
    print('filtered length of %d' % len(filtered))
    return filtered
def make_nice_graph(cls):
    """Plot per-timestep classification accuracy for class label `cls`.

    Reads classify_results.json, which maps a model's repr string to
    per-class score series, and draws one accuracy line per model.
    """
    results = imdb.load_json('classify_results.json')
    for model_repr, per_class in results.items():
        label = model_repr.split('(')[0]  # strip "(params...)" from the repr
        scores = per_class[cls]
        plt.plot(range(len(scores)), scores, label=label, linewidth=2)
    title = ('< 100M Revenue (Class Label 0)' if '0' in cls
             else '100M+ Revenue (Class Label 1)')
    plt.legend()
    plt.suptitle(title)
    plt.xlabel('Time Step')
    plt.ylabel('Classification Accuracy')
    plt.show()
import json
import math

import matplotlib.pyplot as plt
#from textblob import TextBlob
from copy import deepcopy
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

import imdb
from featurize import avg_float_list, MONTH_LIST

# All movies, sorted chronologically by ordinal release date.
data = sorted(imdb.load_json('parsed.json'), key=lambda k: k['ordinaldate'])

# Entity-valued movie fields (lists of people/companies).
fields = ['stars', 'directors', 'writers', 'production']

# Categorical feature family -> ordered list of its possible values.
kmap = {
    'month': MONTH_LIST,
    'mpaa': ['G', 'PG_13', 'R', 'PG'],
    'genres': imdb.load_json('genres.json'),
}

# Revenue thresholds: $100M and $50M.
M100, M50 = 100000000, 50000000


def labelbinarize(data):
    """Return {value: index} mapping each value in `data` to its position."""
    return {value: i for i, value in enumerate(data)}


def getLBMap():
    """Return {category: {value: index}} for every category in kmap."""
    return {key: labelbinarize(kmap[key]) for key in kmap}
def generate_zero_dict_map():
    """Build zeroed one-hot templates for each categorical feature family."""
    categories = {
        'genres': imdb.load_json('genres.json'),
        'month': MONTH_LIST,
        'mpaa': ['G', 'PG_13', 'R', 'PG'],
    }
    return {name: zero_dict_from_list(values)
            for name, values in categories.items()}
import imdb, json, math #from textblob import TextBlob from sklearn.svm import SVC import matplotlib.pyplot as plt from copy import deepcopy from featurize import avg_float_list, MONTH_LIST from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB, BernoulliNB from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso data = sorted(imdb.load_json('parsed.json'), key=lambda k: k['ordinaldate']) fields = ['stars', 'directors', 'writers', 'production'] kmap = { 'month': MONTH_LIST, 'mpaa': ['G', 'PG_13', 'R', 'PG'], 'genres': imdb.load_json('genres.json') } M100, M50 = 100000000, 50000000 def labelbinarize(data): imap = {} for i in range(len(data)): imap[data[i]] = i return imap def getLBMap():