예제 #1
0
def featurize():
    """Featurize every parsed movie and write train/test splits to JSON.

    Loads ``parsed.json``, splits it on the ``year`` field (before 2009 is
    the training set, 2009 and later is the test set), featurizes each
    movie dict in place and dumps the results to
    ``featurized4_train.json`` and ``featurized4_test.json``.
    """
    zero_dict_map = generate_zero_dict_map()

    full_data = imdb.load_json('parsed.json')
    # Real lists rather than filter(): on Python 3 filter()/map() are lazy
    # iterators, which json.dump cannot serialize.
    train = [movie for movie in full_data if movie['year'] < 2009]
    test = [movie for movie in full_data if movie['year'] >= 2009]

    ERM2009 = imdb.load_json('ERM2009.json')

    # Fallback score for any name missing from ERM2009: the median of all
    # scores that are present.
    erm_median = median(list(ERM2009.values()))
    # Single pre-formatted argument so the output is the same whether print
    # is a statement (Python 2) or a function (Python 3).
    print('ERM_MED %s' % (erm_median,))

    def dvectorize(kind, data):
        """Return a copy of the zero dict for *kind* with *data* entries set to 1."""
        zdict = deepcopy(zero_dict_map[kind])
        # A single value is treated as a one-element list.
        if not isinstance(data, list):
            data = [data]
        for feature in data:
            # Values that are not known keys are ignored.
            if feature in zdict:
                zdict[feature] = 1
        return zdict

    def apply_featurization(movie):
        """Featurize *movie* in place and return it."""
        # Merge the indicator entries for each categorical field into the movie.
        for case in ('genres', 'mpaa', 'month'):
            movie.update(dvectorize(case, movie[case]))

        # One 'actor<j>' entry per star, holding that star's ERM2009 score.
        for j, star in enumerate(movie['stars']):
            movie['actor' + str(j)] = ERM2009.get(star['name'], erm_median)

        # Replace each entity list by the average score of its members.
        for e_field in E_FIELDS:
            scores = [ERM2009.get(entity['name'], erm_median)
                      for entity in movie[e_field]]
            movie[e_field] = avg_float_list(scores)

        for thing in DELETE_THESE:
            del movie[thing]

        return movie

    featurized_train = [apply_featurization(movie) for movie in train]
    featurized_test = [apply_featurization(movie) for movie in test]

    with open('featurized4_train.json', 'w') as f:
        json.dump(featurized_train, f, sort_keys=True, indent=4)

    with open('featurized4_test.json', 'w') as f:
        json.dump(featurized_test, f, sort_keys=True, indent=4)
예제 #2
0
def featurize():
    """Featurize every parsed movie and write train/test splits to JSON.

    Loads ``parsed.json``, splits it on the ``year`` field (before 2009 is
    the training set, 2009 and later is the test set), featurizes each
    movie dict in place and dumps the results to
    ``featurized4_train.json`` and ``featurized4_test.json``.
    """
    zero_dict_map = generate_zero_dict_map()

    full_data = imdb.load_json('parsed.json')
    # Real lists rather than filter(): on Python 3 filter()/map() are lazy
    # iterators, which json.dump cannot serialize.
    train = [movie for movie in full_data if movie['year'] < 2009]
    test = [movie for movie in full_data if movie['year'] >= 2009]

    ERM2009 = imdb.load_json('ERM2009.json')

    # Fallback score for any name missing from ERM2009: the median of all
    # scores that are present.
    erm_median = median(list(ERM2009.values()))
    # Single pre-formatted argument so the output is the same whether print
    # is a statement (Python 2) or a function (Python 3).
    print('ERM_MED %s' % (erm_median,))

    def dvectorize(kind, data):
        """Return a copy of the zero dict for *kind* with *data* entries set to 1."""
        zdict = deepcopy(zero_dict_map[kind])
        # A single value is treated as a one-element list.
        if not isinstance(data, list):
            data = [data]
        for feature in data:
            # Values that are not known keys are ignored.
            if feature in zdict:
                zdict[feature] = 1
        return zdict

    def apply_featurization(movie):
        """Featurize *movie* in place and return it."""
        # Merge the indicator entries for each categorical field into the movie.
        for case in ('genres', 'mpaa', 'month'):
            movie.update(dvectorize(case, movie[case]))

        # One 'actor<j>' entry per star, holding that star's ERM2009 score.
        for j, star in enumerate(movie['stars']):
            movie['actor' + str(j)] = ERM2009.get(star['name'], erm_median)

        # Replace each entity list by the average score of its members.
        for e_field in E_FIELDS:
            scores = [ERM2009.get(entity['name'], erm_median)
                      for entity in movie[e_field]]
            movie[e_field] = avg_float_list(scores)

        for thing in DELETE_THESE:
            del movie[thing]

        return movie

    featurized_train = [apply_featurization(movie) for movie in train]
    featurized_test = [apply_featurization(movie) for movie in test]

    with open('featurized4_train.json', 'w') as f:
        json.dump(featurized_train, f, sort_keys=True, indent=4)

    with open('featurized4_test.json', 'w') as f:
        json.dump(featurized_test, f, sort_keys=True, indent=4)
예제 #3
0
파일: filter.py 프로젝트: rs93/predictimdb
def filter_data():
    """Load ``master.json``, parse each movie's year and return the filtered list.

    Returns:
        The movies accepted by ``my_filter``, sorted by their integer
        ``year``. Each returned movie dict has had its ``year`` field
        replaced in place by that integer.
    """
    data = imdb.load_json('master.json')
    newdata = sorted(data, key=lambda k: k['year'])
    # Drop the last entry of the sorted list; per the original author's
    # note it is a movie with no fields.
    newdata.pop()

    def assign_year(movie):
        """Replace the raw year string with the int parsed from characters 1-4."""
        # Presumably the raw value looks like '(2009)' -- confirm against master.json.
        movie['year'] = int(str(movie['year'][1:5]))
        return movie

    filtered = sorted(filter(my_filter, map(assign_year, newdata)),
                      key=lambda k: k['year'])
    # Parenthesised single argument: valid and identical output on both
    # Python 2 (print statement) and Python 3 (print function).
    print('filtered length of %d' % len(filtered))
    return filtered
예제 #4
0
def filter_data():
    """Load ``master.json``, parse each movie's year and return the filtered list.

    Returns:
        The movies accepted by ``my_filter``, sorted by their integer
        ``year``. Each returned movie dict has had its ``year`` field
        replaced in place by that integer.
    """
    data = imdb.load_json('master.json')
    newdata = sorted(data, key=lambda k: k['year'])
    # Drop the last entry of the sorted list; per the original author's
    # note it is a movie with no fields.
    newdata.pop()

    def assign_year(movie):
        """Replace the raw year string with the int parsed from characters 1-4."""
        # Presumably the raw value looks like '(2009)' -- confirm against master.json.
        movie['year'] = int(str(movie['year'][1:5]))
        return movie

    filtered = sorted(filter(my_filter, map(assign_year, newdata)),
                      key=lambda k: k['year'])
    # Parenthesised single argument: valid and identical output on both
    # Python 2 (print statement) and Python 3 (print function).
    print('filtered length of %d' % len(filtered))
    return filtered
예제 #5
0
def make_nice_graph(cls):
    """Plot the score series of every model for the class key *cls*.

    Reads ``classify_results.json``. Each top-level key is plotted as one
    line, labelled with the part of the key before its first '('; the
    plotted values are the sequence stored under ``cls`` for that key.
    The title depends on whether *cls* contains the character '0'.
    """
    results = imdb.load_json('classify_results.json')

    for model_key, per_class in results.items():
        accuracies = per_class[cls]
        steps = range(len(accuracies))
        legend_name = model_key.split('(')[0]
        plt.plot(steps, accuracies, label=legend_name, linewidth=2)

    title = ('< 100M Revenue (Class Label 0)' if '0' in cls
             else '100M+ Revenue (Class Label 1)')

    plt.legend()
    plt.suptitle(title)
    plt.xlabel('Time Step')
    plt.ylabel('Classification Accuracy')
    plt.show()
예제 #6
0
def make_nice_graph(cls):
    """Plot the score series of every model for the class key *cls*.

    Reads ``classify_results.json``. Each top-level key is plotted as one
    line, labelled with the part of the key before its first '('; the
    plotted values are the sequence stored under ``cls`` for that key.
    The title depends on whether *cls* contains the character '0'.
    """
    results = imdb.load_json('classify_results.json')

    for model_key, per_class in results.items():
        accuracies = per_class[cls]
        steps = range(len(accuracies))
        legend_name = model_key.split('(')[0]
        plt.plot(steps, accuracies, label=legend_name, linewidth=2)

    title = ('< 100M Revenue (Class Label 0)' if '0' in cls
             else '100M+ Revenue (Class Label 1)')

    plt.legend()
    plt.suptitle(title)
    plt.xlabel('Time Step')
    plt.ylabel('Classification Accuracy')
    plt.show()
예제 #7
0
import imdb, json, math
#from textblob import TextBlob
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from copy import deepcopy
from featurize import avg_float_list, MONTH_LIST
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

# Every record from parsed.json, ordered by its 'ordinaldate' field.
data = sorted(
    imdb.load_json('parsed.json'),
    key=lambda movie: movie['ordinaldate']
)

# Movie field names grouped for later use (presumably the entity-list
# fields -- confirm at the use sites).
fields = ['stars', 'directors', 'writers', 'production']

# Known label values for each categorical field.
kmap = {
    'month': MONTH_LIST,
    'mpaa': ['G', 'PG_13', 'R', 'PG'],
    'genres': imdb.load_json('genres.json')
}

# 100 million and 50 million (presumably revenue thresholds -- confirm).
M100 = 100000000
M50 = 50000000

def labelbinarize(data):
    """Map each label in *data* to its position.

    Args:
        data: Iterable of hashable labels.

    Returns:
        A dict ``{label: index}``. When a label occurs more than once, its
        last position wins.
    """
    # enumerate() replaces the range(len(...)) index loop and also accepts
    # iterables that support neither len() nor indexing.
    return {label: index for index, label in enumerate(data)}

def getLBMap():
    """Return ``{field: labelbinarize(values)}`` for every entry of ``kmap``."""
    return {field: labelbinarize(values) for field, values in kmap.items()}
예제 #8
0
def generate_zero_dict_map():
    """Return one ``zero_dict_from_list`` result per categorical field.

    The keys are 'genres' (built from ``genres.json``), 'month' (built from
    ``MONTH_LIST``) and 'mpaa' (built from the four rating labels).
    Presumably each value maps every label to 0 -- confirm against
    ``zero_dict_from_list``.
    """
    label_sources = (
        ('genres', imdb.load_json('genres.json')),
        ('month', MONTH_LIST),
        ('mpaa', ['G', 'PG_13', 'R', 'PG']),
    )
    return {kind: zero_dict_from_list(labels) for kind, labels in label_sources}
예제 #9
0
import imdb, json, math
#from textblob import TextBlob
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from copy import deepcopy
from featurize import avg_float_list, MONTH_LIST
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso

# Every record from parsed.json, ordered by its 'ordinaldate' field.
parsed_movies = imdb.load_json('parsed.json')
data = sorted(parsed_movies, key=lambda movie: movie['ordinaldate'])
del parsed_movies  # keep the module namespace the same as before

# Movie field names grouped for later use (presumably the entity-list
# fields -- confirm at the use sites).
fields = ['stars', 'directors', 'writers', 'production']

# Known label values for each categorical field.
kmap = {}
kmap['month'] = MONTH_LIST
kmap['mpaa'] = ['G', 'PG_13', 'R', 'PG']
kmap['genres'] = imdb.load_json('genres.json')

# 100 million and 50 million (presumably revenue thresholds -- confirm).
M100 = 100000000
M50 = 50000000


def labelbinarize(data):
    """Map each label in *data* to its position.

    Args:
        data: Iterable of hashable labels.

    Returns:
        A dict ``{label: index}``. When a label occurs more than once, its
        last position wins.
    """
    # enumerate() replaces the range(len(...)) index loop and also accepts
    # iterables that support neither len() nor indexing.
    return {label: index for index, label in enumerate(data)}


def getLBMap():