def main():
    """
    Plot token-pair/score correlations.

    Reuses the correlations file when it already exists on disk; otherwise
    computes the correlations from the professor vector file, writes them
    out, and then plots them. With '-save' in the command line the plot is
    also given an output file name.
    """
    args = sys.argv
    if '-h' in args:
        print_usage_message()
        exit()

    vect_path = fn.create_prof_vect_name(args)
    corr_path = fn.create_correlations_name(args)

    if os.path.exists(corr_path):
        # Cached result from an earlier run.
        correlations = read.token_correlations(corr_path)
    else:
        token_vects = read.word_vects(vect_path)
        if token_vects is None:
            print("Specified vector file not found.")
            print("To create vectors use 'createProfVectors.py'")
            exit()
        rating_vect = read.overall_rating_vect(vect_path)
        vocab_vect = read.vocab_from_vect_file(vect_path)
        correlations = stat.find_correlations(token_vects, rating_vect,
                                              vocab_vect)
        write.token_correlations(correlations, corr_path)

    plot_path = (fn.create_correlations_plot_name(args)
                 if '-save' in args else None)

    # Plot correlations
    plot_title = plot.create_token_pair_score_correlation_name(args)
    plot.tuple_pair_score_correlation(correlations,
                                      title=plot_title,
                                      saveFile=plot_path)
def create_prof_vectors(tokenSchema, argv, profDicts=None, profTokenDict=None):
    """
    Create token count vectors for the aggregate reviews of each professor.

    profDicts and profTokenDict are read from disk when not supplied.

    Returns a pair (profVects, pidsNotIncluded): the vectors whose
    'token_vect' entry is not None, and the sorted 'pid' values of the
    vectors that were left out because their 'token_vect' was None.
    """
    if profDicts is None:
        profDicts = read.prof_dicts()
    if profTokenDict is None:
        profTokenDict = read.prof_token_dicts(
            fn.create_prof_token_dict_name(argv))

    schema_index = value_idx_dict(tokenSchema)
    kept = []
    skipped_pids = []
    for prof in profDicts:
        merged_counts = count.combine_rev_counters(profTokenDict[prof['pid']])
        vect = create_prof_vector(prof, merged_counts, schema_index)
        if vect['token_vect'] is not None:
            kept.append(vect)
        else:
            skipped_pids.append(vect['pid'])

    skipped_pids.sort()
    return kept, skipped_pids
def main():
    """
    Build the professor token vectors for the token schema selected on the
    command line and write them, together with the excluded pids and the
    schema, to the professor vector file.
    """
    args = sys.argv
    if '-h' in args:
        print_usage_message()
        exit()

    schema = vp.create_token_schema(args)
    vects, excluded_pids = vp.create_prof_vectors(schema, args)
    out_path = fn.create_prof_vect_name(args, True)
    write.prof_vects(vects, excluded_pids, schema, out_path)
def process_token_vectors(vects, argv):
    """
    Optionally re-weight the rows of vects according to command-line flags.

    '-tf' applies to_tf_vect to every row; otherwise '-tfidf' applies
    to_tf_idf_vect to every row using an IDF vector built from the vocab of
    the professor vector file. '-tf' wins when both flags are present. With
    neither flag the input is returned untouched.
    """
    if '-tf' in argv:
        return np.apply_along_axis(to_tf_vect, 1, vects)

    if '-tfidf' not in argv:
        return vects

    vocab = read.vocab_from_vect_file(fn.create_prof_vect_name(argv))
    idf = create_idf_vect(vocab, vects.shape[0], argv)
    print(idf.shape, vects.shape)

    def weight_row(row):
        return to_tf_idf_vect(row, idf)

    return np.apply_along_axis(weight_row, 1, vects)
def main():
    """
    Plot raw, per-review and per-professor token counts.

    Each of the three counts is read from its count file; any count that
    comes back as None is regenerated from the professor token dictionary
    via grab_token_count. With '-ss' a Lancaster stemmer and stopword list
    are used when building that dictionary. With '-save' the plot is given
    an output file name.
    """
    if '-h' in sys.argv:
        print_usage_message()
        exit()

    stmr = None
    stopwords = None
    if '-ss' in sys.argv:
        stmr = LancasterStemmer()
        stopwords = read.stopwords(stmr)

    countNames = fn.create_token_count_names(sys.argv)
    rawTokenCountName = countNames[0]
    revTokenCountName = countNames[1]
    profTokenCountName = countNames[2]

    rawTokens = read.token_count(rawTokenCountName, True)
    revTokens = read.token_count(revTokenCountName, True)
    profTokens = read.token_count(profTokenCountName, True)

    # Identity checks, not '== None': equality can be overloaded by the
    # loaded object (containers, arrays), while 'is None' cannot.
    if rawTokens is None or revTokens is None or profTokens is None:
        # The token dictionary is only built when at least one count is
        # missing.
        profTokenDict = grab_prof_token_dict(stopwords, stmr)
        if rawTokens is None:
            rawTokens = grab_token_count(profTokenDict,
                                         count.num_tokens,
                                         rawTokenCountName)
        if revTokens is None:
            revTokens = grab_token_count(profTokenDict,
                                         count.num_reviews_with_token,
                                         revTokenCountName)
        if profTokens is None:
            profTokens = grab_token_count(profTokenDict,
                                          count.num_profs_with_token,
                                          profTokenCountName)

    plotName = create_plot_name()
    plotFileName = None
    if '-save' in sys.argv:
        plotFileName = fn.create_count_plot_name(sys.argv)

    plot.token_counts(rawTokens, revTokens, profTokens, plotFileName,
                      plotName)
def token_schema_from_count(argv):
    """
    Build a sorted token schema from the review-level token counts.

    Keeps every token whose count is at least the integer that follows
    '-minCount' in argv. Prints a hint and exits when the count file
    cannot be read.
    """
    count_names = fn.create_token_count_names(argv)
    rev_count_path = count_names[1]  # Num revs token appears in
    counts = read.token_count(rev_count_path)
    if counts is None:
        print("Token count file not found.")
        print("Create token count file using 'countTokens.py'")
        exit()

    threshold = int(argv[argv.index('-minCount') + 1])
    return sorted(tok for tok, n in counts.items() if n >= threshold)
def create_idf_vect(vocab, numProfs, argv):
    """
    Build a weight vector log(numProfs / count) aligned with vocab.

    vocab is expected to be a python list. The counts come from the third
    token count file named by fn.create_token_count_names(argv).

    Prints a hint and exits when that count file cannot be read, matching
    token_schema_from_count; previously a missing file surfaced as a
    TypeError from subscripting None.
    """
    countFileName = fn.create_token_count_names(argv)[2]
    tokCounts = read.token_count(countFileName)
    if tokCounts is None:
        print("Token count file not found.")
        print("Create token count file using 'countTokens.py'")
        exit()

    # One count per vocab entry, in vocab order.
    countVect = np.array([tokCounts[word] for word in vocab], dtype=float)
    return np.log(numProfs / countVect)
def create_rev_vectors(tokenSchema, argv, profDicts=None, profTokenDict=None):
    """
    Create one token vector per review, across all professors.

    profDicts and profTokenDict are read from disk when not supplied.
    profTokenDict is loaded for parity with create_prof_vectors but is not
    consulted when building the review vectors.

    Returns the list of vectors produced by create_rev_vector, in
    professor order and then review order. Previously the list was built
    and then dropped, so the function always returned None.
    """
    if profDicts is None:
        profDicts = read.prof_dicts()
    if profTokenDict is None:
        ptdName = fn.create_prof_token_dict_name(argv)
        profTokenDict = read.prof_token_dicts(ptdName)

    schemaDict = value_idx_dict(tokenSchema)
    revVects = [create_rev_vector(rev, schemaDict)
                for prof in profDicts
                for rev in prof['reviews']]
    return revVects
def token_schema_from_correlations(argv):
    """
    Build a sorted token schema from the token-pair correlations file.

    The two values following '-corr' in argv are a minimum count (int) and
    a minimum score (float). Both tokens of every correlation tuple whose
    third field meets the count and whose fourth field meets the absolute
    value of the score are included. Prints a hint and exits when the
    correlations file cannot be read.
    """
    corr_path = fn.create_correlations_name(argv)
    corr_tups = read.token_correlations(corr_path)
    if corr_tups is None:
        print("Correlations file not found")
        print("Create correlations file with 'findCorrelations.py'")
        exit()

    flag_pos = argv.index('-corr')
    min_count = int(argv[flag_pos + 1])
    # NOTE(review): abs() is applied to the threshold, not to the stored
    # score, so negative scores never pass -- confirm this is intended.
    score_floor = abs(float(argv[flag_pos + 2]))

    tokens = set()
    for corr in corr_tups:
        if corr[2] >= min_count and corr[3] >= score_floor:
            first, second = corr[0], corr[1]
            tokens.add(first)
            tokens.add(second)

    return sorted(tokens)
def grab_prof_token_dict(stopwords, stmr):
    """
    Return the professor token dictionary, using a pickle file as a cache.

    When the pickle file already exists it is loaded and returned.
    Otherwise the dictionary is built from the professor dicts with a
    tokenizer chosen by command-line flag ('-tup', then '-stup', else
    single tokens), pickled to that file, and returned.
    """
    cache_path = fn.create_prof_token_dict_name(sys.argv)

    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as cached:
            return pickle.load(cached)

    if '-tup' in sys.argv:
        tokenizer = count.create_tuple_tokens
    elif '-stup' in sys.argv:
        tokenizer = count.create_single_tuple_tokens
    else:
        tokenizer = count.create_single_tokens

    def token_f(text):
        return tokenizer(text, stopwords, stmr)

    built = count.create_prof_token_dict(read.prof_dicts(), token_f)
    with open(cache_path, 'wb') as out:
        pickle.dump(built, out)
    return built
def main():
    """
    Run kNN rating prediction and plot its error.

    Predictions are read from the predictions file when available.
    Otherwise they are computed from the similarity matrix (itself read
    from file or built from the processed token vectors and written out)
    and then written to the predictions file.

    Flags read from sys.argv:
      -d           use difficulty ratings instead of overall ratings
      -cos/-pear   similarity metric (default inverse euclidean distance)
      -unweighted  turn off weighting in the kNN prediction
      -maxK N      keep only the first N prediction columns
      -save        give the plot an output file name
    """
    if '-h' in sys.argv:
        print_usage_message()
        exit()

    vectFileName = fn.create_prof_vect_name(sys.argv, True)
    simMatFileName = fn.create_sim_mat_name(sys.argv)
    predsFileName = fn.create_preds_name(sys.argv)
    print(vectFileName)
    print(simMatFileName)
    print(predsFileName)

    # Grab the ratings vector
    if '-d' in sys.argv:
        ratings = read.difficulty_rating_vect(vectFileName)
    else:
        ratings = read.overall_rating_vect(vectFileName)

    # Assign similarity metric
    sim_f = vp.inverse_euclidean_distance
    if '-cos' in sys.argv:
        sim_f = vp.cosine_similarity
    elif '-pear' in sys.argv:
        sim_f = vp.abs_pearson_correlation

    # Set if weighted or not
    weighted = '-unweighted' not in sys.argv

    # Grab predictions or create them if not available
    predictions = read.knn_predictions(predsFileName)
    if predictions is None:
        simMat = read.similarity_matrix(simMatFileName)
        if simMat is None:
            wordVects = read.word_vects(vectFileName)
            if wordVects is None:
                print("Vector file " + vectFileName + " does not exist")
                exit()
            wordVects = vp.process_token_vectors(wordVects, sys.argv)
            simMat = knn.get_similarity_matrix(wordVects, sim_f)
            write.similarity_matrix(simMat, simMatFileName)
        # MaxK is not defined in this function; it is expected to come
        # from the enclosing module.
        predictions = knn.knn_dataset(ratings, MaxK, simMat, weighted)
        write.knn_predictions(predictions, predsFileName)

    if '-maxK' in sys.argv:
        maxK = int(sys.argv[sys.argv.index('-maxK') + 1])
        predictions = predictions[:, :maxK]

    # Index groups that get their own error sub-plots.
    pidVect = read.pid_vect(vectFileName)
    singleRevIdxs = vp.pids_to_idxs(pidVect,
                                    read.pids_file(fn.PidsSingleRevFile))
    smallLenIdxs = vp.pids_to_idxs(pidVect,
                                   read.pids_file(fn.PidsSmallRevLenFile))

    plotFileName = None
    if '-save' in sys.argv:
        plotFileName = fn.create_knn_accuracy_plot_name(sys.argv)

    # Output results of the run
    plot.knn_error(
        predictions,
        ratings,
        title=plot.create_knn_error_title(sys.argv),
        idxToPlot=[singleRevIdxs, smallLenIdxs],
        subTitles=[
            "Error with profs with one review",
            "Error with profs with aggrigate review " +
            "lengths one std div above the mean " +
            "review length or less"
        ],
        saveFile=plotFileName)
def main():
    """
    Train a feed-forward network on professor token vectors and plot the
    training history.

    Professors outside the single-review / small-review-length groups form
    the training set; the union of those two groups is the validation set.
    '-d' selects difficulty ratings, '-deep' selects the deep model, and
    '-save' gives the plot an output file name.
    """
    args = sys.argv
    if '-h' in args:
        print_usage_message()
        exit()

    vect_path = fn.create_prof_vect_name(args, True)
    token_vects = read.word_vects(vect_path)
    if token_vects is None:
        print("Could not find token vects")
        print("Use 'createProfVectors.py' to create vectors")
        exit()
    token_vects = vp.process_token_vectors(token_vects, args)

    read_ratings = (read.difficulty_rating_vect if '-d' in args
                    else read.overall_rating_vect)
    ratings = read_ratings(vect_path)

    # Create Training and validation sets
    pid_vect = read.pid_vect(vect_path)
    train_idxs = ffnn.non_single_small_idxs(pid_vect)
    single_idxs = vp.pids_to_idxs(pid_vect,
                                  read.pids_file(fn.PidsSingleRevFile))
    small_idxs = vp.pids_to_idxs(pid_vect,
                                 read.pids_file(fn.PidsSmallRevLenFile))
    valid_idxs = np.array(sorted(set(single_idxs).union(set(small_idxs))))

    x_train = token_vects[train_idxs, :]
    y_train = ratings[train_idxs]
    x_valid = token_vects[valid_idxs, :]
    y_valid = ratings[valid_idxs]
    print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

    # Unused alternative: a random 70/30 split of (tokenVects, ratings)
    # via train_test_split(..., test_size=0.3).

    # Select and train model
    build_model = ffnn.deep_model if '-deep' in args else ffnn.shallow_model
    model = build_model(token_vects.shape[1])
    history = model.fit(x_train,
                        y_train,
                        epochs=10,
                        batch_size=5,
                        validation_data=(x_valid, y_valid))

    plot_title = plot.ffnn_error_title(args)
    outfile = fn.create_ffnn_plot_name(args) if '-save' in args else None
    plot.ffnn_error(history, title=plot_title, filename=outfile)