-
Notifications
You must be signed in to change notification settings - Fork 0
/
reviewCorrelation.py
111 lines (96 loc) · 3.76 KB
/
reviewCorrelation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import logging
import os
import numpy as np
import matplotlib.pylab as plt
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import func
from models import Review
from models import Genre
def getCorrelationCoef(setA, setB, numTests):
corrCoef = 0
for i in range(numTests):
bestMusic = np.random.choice(setA, size=len(setB))
corrCoef += np.corrcoef(bestMusic, setB)[0][1]
return corrCoef / numTests
# enviornment variables setup
load_dotenv()
PSQL_CONNECTION_STRING = os.getenv("PSQL_CONNECTION_STRING")
# logging setup. See sqlalchemy.log to view generated sql statements
handler = logging.FileHandler("sqlalchemy.log")
handler.setLevel(logging.INFO)
logger = logging.getLogger("sqlalchemy")
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
if __name__ == "__main__":
engine = create_engine(PSQL_CONNECTION_STRING, echo=False)
Session = sessionmaker(bind=engine)
session = Session()
# query get average score of ratings for each year
averageScores = session.query(func.avg(Review.score), Review.pub_year).group_by(Review.pub_year).all()
# query get best new music scores are scores for somgs that weren't best new music
bestNewMusicScores = session.query(Review.score).filter(Review.best_new_music == 1).all()
bestNewMusicOrNotScores = (
session.query(Review.best_new_music, func.array_agg(Review.score)).group_by(Review.best_new_music).all()
)
bestNewMusicByYear = (
session.query(Review.pub_year, func.array_agg(Review.score))
.group_by(Review.pub_year)
.filter(Review.best_new_music == 1)
.all()
)
# Join query for Genre and Review to see reviews by genre
averageReviewsScoreByGenre = (
session.query(Genre.genre, func.avg(Review.score))
.filter(Genre.genre != None, Review.best_new_music == 1)
.join(Review, Genre.reviewid == Review.reviewid)
.group_by(Genre.genre)
.all()
)
# Lists of scores by genre
allScoresByGenre = (
session.query(Genre.genre, func.array_agg(Review.score))
.filter(Genre.genre != None)
.join(Review, Genre.reviewid == Review.reviewid)
.group_by(Genre.genre)
.all()
)
# Lists of scores by year
allScoresByYear = session.query(Review.pub_year, func.array_agg(Review.score)).group_by(Review.pub_year).all()
# query get average score of ratings for each author type
averageScoresByAuthorType = (
session.query(Review.author_type, func.array_agg(Review.score))
.group_by(Review.author_type)
.filter(Review.author_type != None)
.all()
)
for i in averageScoresByAuthorType:
print("Author Type: ", i[0])
print("bestNewMusicScores vs ", i[0], ": ", getCorrelationCoef(i[1], bestNewMusicOrNotScores[1][1], 1000))
print("NotBestNewMusic vs ", i[0], ": ", getCorrelationCoef(i[1], bestNewMusicOrNotScores[0][1], 1000))
print("---")
"""
for i in allScoresByGenre:
print("Genre: ", i[0])
print("bestNewMusicScores vs ", i[0], ": ", getCorrelationCoef(i[1], bestNewMusicOrNotScores[1][1], 1000))
print("NotBestNewMusic vs ", i[0], ": ", getCorrelationCoef(i[1], bestNewMusicOrNotScores[0][1], 1000))
print("---")
for i in allScoresByYear:
scoresForYear = []
for j in bestNewMusicByYear:
if j[0] == i[0]:
scoresForYear = j
if scoresForYear == []:
continue
print("Year: ", i[0])
print(
"bestNewMusicScores in ",
scoresForYear[0],
" vs ",
i[0],
" scores: ",
getCorrelationCoef(i[1], scoresForYear[1], 1000),
)
print("---")
"""