예제 #1
0
def traverse_movies_OLS():
    """Evaluate an ordinary-least-squares revenue model one movie at a time.

    Iterates over ``data`` (a name defined outside this function) in order.
    For every movie whose index is greater than 100, a new
    ``LinearRegression`` is fit on the vectors and revenues of all earlier
    movies and asked to predict the current movie's revenue. The absolute
    error and the error relative to the true revenue are recorded. Only
    after scoring is the movie added to the training set and ``DMAP``
    refreshed through ``update``.

    Prints three lines, each prefixed with ``all``: the mean relative
    error, the mean absolute error, and the mean absolute error of the
    recorded predictions from position 3200 onward.
    """
    LBMAP = getLBMap()
    DMAP = createEmpty()

    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []

    for i, movie in enumerate(data):
        m_rev = movie['revenue']

        # Vectorize with the DMAP state *before* this movie is folded in.
        myvector = vectorizeMovie(movie, LBMAP, DMAP)

        if i > 100:
            model = LinearRegression()
            model.fit(training_data, training_response)

            # NOTE(review): assumes LinearRegression is scikit-learn's.
            # predict() takes a 2-D (n_samples, n_features) input and returns
            # one prediction per row, so wrap the single sample and unwrap
            # the single result.
            predicted = model.predict([myvector])[0]
            raw = math.fabs(predicted - m_rev)
            ERRORS.append(raw)

            # A zero revenue has no meaningful relative error; skip it here
            # instead of dividing by zero (the absolute error is still kept).
            if m_rev != 0:
                P_ERRORS.append(round(raw / m_rev, 4))

        training_data.append(myvector)
        training_response.append(m_rev)

        DMAP = update(movie, DMAP)

    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('%s %s' % ('all', avg_float_list(P_ERRORS)))
    print('%s %s' % ('all', avg_float_list(ERRORS)))
    print('%s %s' % ('all', avg_float_list(ERRORS[3200:])))
예제 #2
0
def traverse_movies_OLS():
    """Run a walk-forward evaluation of a linear revenue regression.

    Movies are taken from ``data`` (defined outside this function) in
    sequence. Once the index exceeds 100, each movie is first predicted by a
    ``LinearRegression`` freshly fit on every previously seen movie, and the
    absolute and relative prediction errors are stored. Afterwards the
    movie's vector and revenue join the training set and ``DMAP`` is replaced
    by the result of ``update(movie, DMAP)``.

    Output: three printed lines labelled ``all`` holding, in order, the
    average relative error, the average absolute error, and the average
    absolute error over the recorded errors from index 3200 onward.
    """
    LBMAP = getLBMap()
    DMAP = createEmpty()

    P_ERRORS, ERRORS = [], []

    training_data, training_response = [], []

    for i, movie in enumerate(data):
        m_rev = movie['revenue']

        # Features are computed before DMAP learns about this movie.
        myvector = vectorizeMovie(movie, LBMAP, DMAP)

        if i > 100:
            model = LinearRegression()
            model.fit(training_data, training_response)

            # NOTE(review): presumably scikit-learn's estimator. Its
            # predict() expects a 2-D array of samples and returns an array,
            # so pass a one-row batch and read back its only element.
            prediction = model.predict([myvector])[0]
            raw = math.fabs(prediction - m_rev)
            ERRORS.append(raw)

            # Relative error is undefined for zero revenue; record only the
            # absolute error in that case rather than raising.
            if m_rev != 0:
                P_ERRORS.append(round(raw / m_rev, 4))

        training_data.append(myvector)
        training_response.append(m_rev)

        DMAP = update(movie, DMAP)

    # Formatting into one string keeps the printed text identical whether
    # print is the Python 2 statement or the Python 3 function.
    for errors in (P_ERRORS, ERRORS, ERRORS[3200:]):
        print('%s %s' % ('all', avg_float_list(errors)))
예제 #3
0
def vectorizeMovie(movie, LBMap, DMAP, Discrete=False, Sentiment=False):
    """Build the flat feature list for a single movie.

    The returned list is assembled in this order:

    1. ``getVector(field, movie[field], LBMap)`` for ``month``, ``mpaa`` and
       ``genres``, concatenated.
    2. ``discretize`` of the runtime (against 90) and of the budget
       (against ``M50``).
    3. For each star, ``discretize`` of that star's ``high`` and ``avg``
       entries in ``DMAP['stars']`` (against ``M100``).
    4. For ``directors``, ``writers`` and ``production``, ``discretize`` of
       the ``avg_float_list`` of the ``avg`` entries of the listed entities.
    5. Only when ``Sentiment`` is true: 1 if the tagline is empty/missing or
       its TextBlob polarity is >= 0.0, otherwise 0.

    ``Discrete`` is forwarded unchanged to every ``discretize`` call.
    NOTE(review): M50 / M100 are presumably 50 and 100 million -- confirm
    against their definitions.
    """
    features = []

    for categorical in ('month', 'mpaa', 'genres'):
        features.extend(getVector(categorical, movie[categorical], LBMap))

    features.append(discretize(movie['runtime'], 90, Discrete))
    features.append(discretize(movie['budget'], M50, Discrete))

    # Two values per star, so the vector grows with the number of stars.
    for star in movie['stars']:
        star_name = star['name']
        record = DMAP['stars'][star_name]
        features.append(discretize(record['high'], M100, Discrete))
        features.append(discretize(record['avg'], M100, Discrete))

    # One value per crew field: the mean of its members' 'avg' entries.
    for crew_field in ('directors', 'writers', 'production'):
        member_names = [member['name'] for member in movie[crew_field]]
        averages = [DMAP[crew_field][name]['avg'] for name in member_names]
        features.append(
            discretize(avg_float_list(averages), M100, Discrete))

    if Sentiment:
        tagline = movie['tagline']
        # A missing or empty tagline is treated like a non-negative one.
        flag = 1
        if tagline:
            ascii_text = tagline.strip().encode('ascii', 'ignore')
            polarity = float(TextBlob(ascii_text).sentiment.polarity)
            flag = 1 if polarity >= 0.0 else 0
        features.append(flag)

    return features
예제 #4
0
def vectorizeMovie(movie, LBMap, DMAP, Discrete=False, Sentiment=False):
    """Turn one movie record into a list of numeric features.

    Args:
        movie: mapping read for the keys ``month``, ``mpaa``, ``genres``,
            ``runtime``, ``budget``, ``stars``, ``directors``, ``writers``,
            ``production`` and (when ``Sentiment`` is set) ``tagline``.
            The entity lists hold mappings with a ``name`` key.
        LBMap: passed through to ``getVector`` for the categorical fields.
        DMAP: nested mapping ``DMAP[field][name]`` giving per-entity
            statistics; ``avg`` is read for every field and ``high``
            additionally for stars.
        Discrete: forwarded as the third argument of each ``discretize``.
        Sentiment: when true, append a tagline polarity flag.

    Returns:
        A list: categorical vectors, runtime and budget features, two
        features per star, one feature per crew field and, optionally, the
        sentiment flag (1 for polarity >= 0.0 or no tagline, else 0).
    """

    def scaled(value, reference):
        # Every numeric feature goes through discretize with the same flag.
        return discretize(value, reference, Discrete)

    vector = []

    for name in ['month', 'mpaa', 'genres']:
        vector += getVector(name, movie[name], LBMap)

    vector += [scaled(movie['runtime'], 90), scaled(movie['budget'], M50)]

    for person in movie['stars']:
        who = person['name']
        stats = DMAP['stars'][who]
        for stat_key in ('high', 'avg'):
            vector.append(scaled(stats[stat_key], M100))

    for group in ['directors', 'writers', 'production']:
        group_avgs = []
        for person in movie[group]:
            who = person['name']
            group_avgs.append(DMAP[group][who]['avg'])
        vector.append(scaled(avg_float_list(group_avgs), M100))

    if not Sentiment:
        return vector

    tag = movie['tagline']
    if not tag:
        # No tagline: counted the same as a non-negative tagline.
        vector.append(1)
        return vector

    blob = TextBlob(tag.strip().encode('ascii', 'ignore'))
    if float(blob.sentiment.polarity) >= 0.0:
        vector.append(1)
    else:
        vector.append(0)

    return vector
예제 #5
0
def graph_regression():
    """Plot the running mean of the error column for four prediction groups.

    Reads ``combined.csv`` line by line. The first three comma-separated
    fields of each line are used: the true label and the predicted label
    (compared as strings) and an error value parsed as a float. Each line is
    one time step and may belong to several groups:

    * ``Overall``        -- every line
    * ``All Correct``    -- true label equals predicted label
    * ``100M+ Correct``  -- correct and the true label is ``'1'``
    * ``<100M Correct``  -- correct and the true label is anything else

    For each group the running mean of the errors of its member lines is
    tracked, and its current value is recorded at every time step so all
    four curves share the same x axis. The average of each curve is printed
    and the curves are shown with matplotlib.
    """
    labels = ['Overall', 'All Correct', '<100M Correct', '100M+ Correct']

    # Per group: [number of member lines so far, running mean, mean history]
    results = {label: [0, 0, []] for label in labels}

    with open('combined.csv', 'r') as f:
        for line in f:
            # Tolerate blank lines such as a trailing newline at end of file.
            if not line.strip():
                continue

            vals = line.split(',')
            true, pred, err = vals[0], vals[1], float(vals[2])

            correct = true == pred
            member_of = {
                'Overall': True,
                'All Correct': correct,
                '<100M Correct': correct and true != '1',
                '100M+ Correct': correct and true == '1',
            }

            for label in labels:
                stats = results[label]

                if member_of[label]:
                    n, ravg = stats[0], stats[1]
                    stats[0] = n + 1
                    stats[1] = ((n * ravg) + err) / float(n + 1)

                # The count advances only for member lines; counting skipped
                # lines would make the next update treat the old mean as
                # extra samples and bias the result. The history still gets
                # one point per time step (the unchanged mean when skipped).
                stats[2].append(stats[1])

    for label in labels:
        scores = results[label][2]
        # One pre-formatted argument prints identically under the Python 2
        # print statement and the Python 3 print function.
        print('%s %s' % (label, avg_float_list(scores)))
        plt.plot(range(len(scores)), scores, label=label, linewidth=2)

    plt.legend()
    plt.suptitle('Regression Performance on Correct Classifications')
    plt.xlabel('Time Step')
    plt.ylabel('Mean Absolute Error')
    plt.ylim(10000000, 60000000)
    plt.show()
예제 #6
0
def graph_regression():
    """Chart running mean error curves for overall and correct predictions.

    Each line of ``combined.csv`` supplies, as its first three
    comma-separated fields, a true label, a predicted label and an error
    value (parsed with ``float``). Labels are compared as raw strings. Four
    curves are built, one point per line read:

    * ``Overall``: mean error of all lines so far.
    * ``All Correct``: mean error of lines where true == predicted.
    * ``100M+ Correct``: as above, restricted to true label ``'1'``.
    * ``<100M Correct``: as above, restricted to any other true label.

    A curve repeats its previous value on lines that do not belong to its
    group (0 until the first member line arrives). The mean of every curve
    is printed next to its name, then the figure is displayed.
    """
    order = ['Overall', 'All Correct', '<100M Correct', '100M+ Correct']

    # name -> [member line count, current mean, mean at every time step]
    results = {}
    for name in order:
        results[name] = [0, 0, []]

    with open('combined.csv', 'r') as f:

        for line in f:

            # Skip empty lines instead of failing on the missing fields.
            if not line.strip():
                continue

            vals = line.split(',')
            true, pred, err = vals[0], vals[1], float(vals[2])

            all_c = (true == pred)
            c1_c = all_c and true == '1'
            c0_c = all_c and true != '1'

            for key in order:

                included = ((key == 'Overall')
                            or (key == 'All Correct' and all_c)
                            or (key == '<100M Correct' and c0_c)
                            or (key == '100M+ Correct' and c1_c))

                count, mean, history = results[key]

                if included:
                    # Standard incremental mean over the member lines only.
                    mean = ((count * mean) + err) / float(count + 1)
                    count += 1
                    results[key][0], results[key][1] = count, mean

                # Lines outside the group leave the count untouched; bumping
                # it there would weight the stale mean as if it were new
                # observations and distort later updates.
                history.append(mean)

    for key in order:
        model, scores = key, results[key][2]
        # Formatted as one string so the output is the same with the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (model, avg_float_list(scores)))
        plt.plot(range(len(scores)), scores, label=model, linewidth=2)

    plt.legend()
    plt.suptitle('Regression Performance on Correct Classifications')
    plt.xlabel('Time Step')
    plt.ylabel('Mean Absolute Error')
    plt.ylim(10000000, 60000000)
    plt.show()