def clean_review(review):
    """Convert one raw review dict into normalized review objects.

    Args:
        review: Dict describing a single review. Reads the 'class',
            'comment' and 'date' keys, plus the optional 'r_clarity',
            'r_easy' and 'r_interest' rating keys.

    Returns:
        An empty dict when review['class'] is None or does not contain
        exactly one recognizable course code. Otherwise a dict with the
        keys 'professor_id', 'course_id', 'course_review' (an
        m.CourseReview) and 'professor_review' (an m.ProfessorReview).

    Raises:
        KeyError: If a required key ('class', 'comment', 'date') is missing.
        ValueError: If review['date'] does not match '%m/%d/%y'.
    """
    course = review['class']
    if course is None:
        return {}

    # A course code is a run of letters (department), lazily followed by
    # three digits with an optional letter suffix, which must not be
    # followed by another digit.
    # TODO(mack): investigate if we are missing any good courses with
    # this regex
    matches = re.findall(
        r'([a-z]+).*?([0-9]{3}[a-z]?)(?:[^0-9]|$)', course.lower())
    # Only accept text naming exactly one course. With two capture groups,
    # every findall() item is already a 2-tuple, so no length check on the
    # tuple itself is needed.
    if len(matches) != 1:
        return {}

    # Both groups are already lower case because the input was lowered.
    department_id, course_number = matches[0]
    course_id = department_id + course_number

    # NOTE(review): `data` is not defined inside this function; it is
    # presumably supplied by an enclosing or module-level scope -- confirm.
    prof_name = get_prof_name(data['prof_name'])
    prof_id = m.Professor.get_id_from_name(
        prof_name['first_name'], prof_name['last_name'])

    def normalize_rating(menlo_rating):
        # normalize 1..5 to Yes/No:
        # 1,2 => No (0), 3 => None, 4,5 => Yes (1)
        try:
            rating = int(menlo_rating)
        except (TypeError, ValueError, OverflowError):
            # Unparseable rating: treat as "no opinion" without hiding
            # unrelated errors such as KeyboardInterrupt.
            return None
        if rating <= 2:
            return 0
        if rating >= 4:
            return 1
        return None

    course_review = m.CourseReview()
    professor_review = m.ProfessorReview()

    # TODO(mack): include 'r_helpful'?
    if 'r_clarity' in review:
        professor_review.clarity = normalize_rating(review['r_clarity'])
    if 'r_easy' in review:
        course_review.easiness = normalize_rating(review['r_easy'])
    if 'r_interest' in review:
        course_review.interest = normalize_rating(review['r_interest'])

    professor_review.comment = review['comment']
    professor_review.comment_date = datetime.strptime(
        review['date'], '%m/%d/%y')

    return {
        'professor_id': prof_id,
        'course_id': course_id,
        'course_review': course_review,
        'professor_review': professor_review,
    }
def generic_stats():
    """Gather site-wide usage counters into a single dict.

    Returns:
        A dict of counts (users, signups over the last 24 hours, users with
        imported transcripts/schedules, ratings, reviews, user-courses and
        courses, including how many have been rated or reviewed) together
        with the 'num_signups_start_time' and 'epoch' timestamps.
    """
    user_course_total = m.UserCourse.objects().count()
    user_total = m.User.objects.count()
    transcript_user_total = m.User.objects(
        transcripts_imported__gt=0).count()
    schedule_user_total = m.User.objects(schedules_imported__gt=0).count()

    # A review counts when its comment is not the empty string.
    course_review_total = m.UserCourse.objects(
        course_review__comment__ne='').count()
    professor_review_total = m.UserCourse.objects(
        professor_review__comment__ne='').count()
    course_total = m.Course.objects.count()

    # One "field is not None" filter per rating field.
    # TODO(david): Make rating_fields a class method
    course_filters = [
        {'course_review__%s__ne' % field: None}
        for field in m.CourseReview().rating_fields()
    ]
    professor_filters = [
        {'professor_review__%s__ne' % field: None}
        for field in m.ProfessorReview().rating_fields()
    ]

    course_rating_total = sum(
        m.UserCourse.objects(**kwargs).count() for kwargs in course_filters)
    professor_rating_total = sum(
        m.UserCourse.objects(**kwargs).count()
        for kwargs in professor_filters)

    # OR every rating filter and both comment filters together to select
    # user-courses carrying at least one rating or review.
    feedback_filters = course_filters + professor_filters + [
        {'course_review__comment__ne': ''},
        {'professor_review__comment__ne': ''},
    ]
    has_feedback = Q()
    for kwargs in feedback_filters:
        has_feedback |= Q(**kwargs)

    rated_reviewed = m.UserCourse.objects.filter(has_feedback)
    rated_reviewed_total = rated_reviewed.count()

    # Distinct courses with at least one rating or review.
    # TODO(mduan): Verify that every one of these ids belongs to a known
    # course.
    course_ids = set(rated_reviewed.distinct('course_id'))

    # Distinct users who left at least one rating or review.
    # TODO(mduan): Verify that every one of these ids belongs to a known
    # user.
    user_ids = set(rated_reviewed.distinct('user_id'))

    window_start = datetime.now() - timedelta(hours=24)
    recent_signups = users_joined_after(window_start)

    return {
        'num_users': user_total,
        'num_signups_today': recent_signups,
        'num_users_with_transcript': transcript_user_total,
        'num_users_with_schedule': schedule_user_total,
        'num_ratings': course_rating_total + professor_rating_total,
        'num_reviews': course_review_total + professor_review_total,
        'num_ucs': user_course_total,
        'num_ucs_rated_reviewed': rated_reviewed_total,
        'num_courses': course_total,
        'num_courses_rated_reviewed': len(course_ids),
        'num_users_rated_reviewed': len(user_ids),
        'num_signups_start_time': window_start,
        'epoch': datetime.now(),
    }