def main():
    """Plot token/rating correlations, building the correlations file if absent.

    Options are read from ``sys.argv``:

    * ``-h``    -- print the usage message and exit.
    * ``-save`` -- pass a plot file name (from
      ``fn.create_correlations_plot_name``) to the plotting call; otherwise
      ``None`` is passed as ``saveFile``.

    If the correlations file named by ``fn.create_correlations_name`` already
    exists it is read back. Otherwise the correlations are computed from the
    professor vector file and written out so later runs can reuse them.

    Exits with status 1 when the vector file cannot be read.
    """
    if '-h' in sys.argv:
        print_usage_message()
        # sys.exit() rather than the interactive-only ``exit()`` helper that
        # the ``site`` module injects.
        sys.exit()

    vectorFileName = fn.create_prof_vect_name(sys.argv)
    corrFileName = fn.create_correlations_name(sys.argv)

    if os.path.exists(corrFileName):
        # Reuse correlations computed by an earlier run.
        corrTups = read.token_correlations(corrFileName)
    else:
        tokenVects = read.word_vects(vectorFileName)
        if tokenVects is None:
            print("Specified vector file not found.")
            print("To create vectors use 'createProfVectors.py'")
            # This is a failure, so report it through the exit status too.
            sys.exit(1)
        ratingVect = read.overall_rating_vect(vectorFileName)
        vocabVect = read.vocab_from_vect_file(vectorFileName)
        corrTups = stat.find_correlations(tokenVects, ratingVect, vocabVect)
        write.token_correlations(corrTups, corrFileName)

    corrPlotFileName = None
    if '-save' in sys.argv:
        corrPlotFileName = fn.create_correlations_plot_name(sys.argv)

    # Plot correlations
    plot.tuple_pair_score_correlation(
        corrTups,
        title=plot.create_token_pair_score_correlation_name(sys.argv),
        saveFile=corrPlotFileName)
# Example #2
# 0
def create_prof_vectors(tokenSchema, argv, profDicts=None, profTokenDict=None):
    """ Create token count vectors for the aggregate reviews of each
        professor.

        tokenSchema is turned into a value -> index mapping with
        value_idx_dict. profDicts and profTokenDict are loaded through the
        read module when they are not supplied (argv is used to name the
        professor token dict file).

        Returns a pair (vectors, skippedPids): the vectors whose
        'token_vect' entry is not None, and the sorted 'pid' values of the
        vectors that were left out because their 'token_vect' was None.
    """
    if profDicts is None:
        profDicts = read.prof_dicts()

    if profTokenDict is None:
        profTokenDict = read.prof_token_dicts(
            fn.create_prof_token_dict_name(argv))

    tokenToIdx = value_idx_dict(tokenSchema)

    keptVects = []
    skippedPids = []
    for profEntry in profDicts:
        # Merge the per-review counters of this professor into one counter.
        mergedCounts = count.combine_rev_counters(
            profTokenDict[profEntry['pid']])
        vect = create_prof_vector(profEntry, mergedCounts, tokenToIdx)

        if vect['token_vect'] is not None:
            keptVects.append(vect)
        else:
            skippedPids.append(vect['pid'])

    return keptVects, sorted(skippedPids)
def main():
    """Create professor token vectors and write them to the vector file.

    Options are read from ``sys.argv``; ``-h`` prints the usage message and
    exits. The token schema and the vectors are built by the ``vp`` module,
    and the result (vectors, the pids that were not included, and the schema)
    is written to the file named by ``fn.create_prof_vect_name``.
    """
    if '-h' in sys.argv:
        print_usage_message()
        # sys.exit() rather than the interactive-only ``exit()`` helper that
        # the ``site`` module injects.
        sys.exit()

    tokenSchema = vp.create_token_schema(sys.argv)
    profVects, pidsNotIncl = vp.create_prof_vectors(tokenSchema, sys.argv)
    profVectFileName = fn.create_prof_vect_name(sys.argv, True)
    write.prof_vects(profVects, pidsNotIncl, tokenSchema, profVectFileName)
# Example #4
# 0
def process_token_vectors(vects, argv):
    """Apply the weighting selected in argv to every row of vects.

    '-tf' maps to_tf_vect over the rows. Otherwise '-tfidf' maps
    to_tf_idf_vect over the rows, using an idf vector built from the
    vocabulary stored in the professor vector file. '-tf' takes precedence
    when both flags are present. With neither flag, vects is returned
    untouched.
    """
    if '-tf' in argv:
        return np.apply_along_axis(to_tf_vect, 1, vects)

    if '-tfidf' not in argv:
        return vects

    vectFileName = fn.create_prof_vect_name(argv)
    vocab = read.vocab_from_vect_file(vectFileName)
    idfVect = create_idf_vect(vocab, vects.shape[0], argv)
    print(idfVect.shape, vects.shape)

    def weigh_row(row):
        return to_tf_idf_vect(row, idfVect)

    return np.apply_along_axis(weigh_row, 1, vects)
# Example #5
# 0
def main():
    """Plot raw, per-review and per-professor token counts.

    Options are read from ``sys.argv``:

    * ``-h``    -- print the usage message and exit.
    * ``-ss``   -- use a Lancaster stemmer and the stopword list read with it.
    * ``-save`` -- pass a plot file name (from ``fn.create_count_plot_name``)
      to the plotting call; otherwise ``None`` is passed.

    Each of the three counts is first read from its file. Any count that
    ``read.token_count`` returns as ``None`` is obtained through
    ``grab_token_count`` from the professor token dict instead.
    """
    if '-h' in sys.argv:
        print_usage_message()
        # sys.exit() rather than the interactive-only ``exit()`` helper that
        # the ``site`` module injects.
        sys.exit()

    stmr = None
    stopwords = None
    if '-ss' in sys.argv:
        stmr = LancasterStemmer()
        stopwords = read.stopwords(stmr)

    countNames = fn.create_token_count_names(sys.argv)
    rawTokenCountName = countNames[0]
    revTokenCountName = countNames[1]
    profTokenCountName = countNames[2]

    rawTokens = read.token_count(rawTokenCountName, True)
    revTokens = read.token_count(revTokenCountName, True)
    profTokens = read.token_count(profTokenCountName, True)

    # Identity checks: `== None` dispatches to the object's __eq__, which is
    # not guaranteed to yield a plain bool for arbitrary container types.
    if rawTokens is None or revTokens is None or profTokens is None:
        # The token dict is only needed when at least one count is missing.
        profTokenDict = grab_prof_token_dict(stopwords, stmr)

        if rawTokens is None:
            rawTokens = grab_token_count(profTokenDict, count.num_tokens,
                                         rawTokenCountName)
        if revTokens is None:
            revTokens = grab_token_count(profTokenDict,
                                         count.num_reviews_with_token,
                                         revTokenCountName)
        if profTokens is None:
            profTokens = grab_token_count(profTokenDict,
                                          count.num_profs_with_token,
                                          profTokenCountName)

    plotName = create_plot_name()
    plotFileName = None
    if '-save' in sys.argv:
        plotFileName = fn.create_count_plot_name(sys.argv)

    plot.token_counts(rawTokens, revTokens, profTokens, plotFileName, plotName)
# Example #6
# 0
def token_schema_from_count(argv):
    """Return the sorted tokens whose count is at least the '-minCount' value.

    Counts come from the token count file at index 1 of
    fn.create_token_count_names(argv) (the number of reviews a token appears
    in). The threshold is the integer that follows '-minCount' in argv.
    Prints a hint and exits when the count file cannot be read.
    """
    revCountFileName = fn.create_token_count_names(argv)[1]
    tokenCounts = read.token_count(revCountFileName)
    if tokenCounts is None:
        print("Token count file not found.")
        print("Create token count file using 'countTokens.py'")
        exit()

    threshold = int(argv[argv.index('-minCount') + 1])
    return sorted(
        token for token, seen in tokenCounts.items() if seen >= threshold)
# Example #7
# 0
def create_idf_vect(vocab, numProfs, argv):
    """ Build the idf vector ``log(numProfs / count)`` for every token in vocab.

        vocab is expected to be a python list. The counts are read from the
        token count file at index 2 of fn.create_token_count_names(argv)
        (this appears to be the number of professors each token occurs
        for -- confirm against countTokens.py).

        Returns a float numpy array with one entry per token, in vocab order.
        Prints a hint and exits with status 1 when the count file cannot be
        read. A vocab token missing from the counts raises KeyError.
    """
    countFileName = fn.create_token_count_names(argv)[2]
    tokCounts = read.token_count(countFileName)
    if tokCounts is None:
        # read.token_count reports a missing file as None; fail with the same
        # message token_schema_from_count uses, not a TypeError further down.
        print("Token count file not found.")
        print("Create token count file using 'countTokens.py'")
        # Built-in exception, so no reliance on `sys` or the site `exit()`.
        raise SystemExit(1)

    countVect = np.zeros(len(vocab), dtype=float)
    for idx, word in enumerate(vocab):
        countVect[idx] = tokCounts[word]

    return np.log(numProfs / countVect)
# Example #8
# 0
def create_rev_vectors(tokenSchema, argv, profDicts=None, profTokenDict=None):
    """ Create one vector per individual review.

        tokenSchema is turned into a value -> index mapping with
        value_idx_dict. profDicts is loaded with read.prof_dicts() when it is
        not supplied. Every entry of each professor's 'reviews' list is passed
        to create_rev_vector together with that mapping.

        Returns the list of review vectors, in professor order and then
        review order.
    """
    if profDicts is None:
        profDicts = read.prof_dicts()

    if profTokenDict is None:
        # NOTE(review): profTokenDict is loaded here, as in
        # create_prof_vectors, but nothing below consults it -- confirm
        # whether the load can be dropped.
        ptdName = fn.create_prof_token_dict_name(argv)
        profTokenDict = read.prof_token_dicts(ptdName)

    schemaDict = value_idx_dict(tokenSchema)

    revVects = []
    for prof in profDicts:
        for rev in prof['reviews']:
            revVects.append(create_rev_vector(rev, schemaDict))

    # The result was previously built and then discarded, so callers always
    # received None.
    return revVects
# Example #9
# 0
def token_schema_from_correlations(argv):
    """Return the sorted tokens that occur in sufficiently strong correlations.

    The correlation tuples are read from the correlations file. The two values
    after '-corr' in argv give the minimum count (int) and the minimum score
    (float). A tuple qualifies when its element 2 is at least the minimum
    count and its element 3 is at least abs(minimum score). Elements 0 and 1
    of every qualifying tuple are added to the schema.
    Prints a hint and exits when the correlations file cannot be read.
    """
    corrFileName = fn.create_correlations_name(argv)
    corrTups = read.token_correlations(corrFileName)
    if corrTups is None:
        print("Correlations file not found")
        print("Create correlations file with 'findCorrelations.py'")
        exit()

    flagPos = argv.index('-corr')
    minCount = int(argv[flagPos + 1])
    scoreFloor = abs(float(argv[flagPos + 2]))

    # NOTE(review): element 3 is compared with its sign intact, so negative
    # scores never pass -- confirm that this is intended.
    tokens = set()
    for corr in corrTups:
        if corr[2] >= minCount and corr[3] >= scoreFloor:
            tokens.add(corr[0])
            tokens.add(corr[1])

    return sorted(tokens)
# Example #10
# 0
def grab_prof_token_dict(stopwords, stmr):
    """Return the professor token dict, loading it from its pickle if present.

    The pickle path comes from fn.create_prof_token_dict_name(sys.argv). When
    that file does not exist, the dict is built from read.prof_dicts() and
    written to the same path before being returned. The tokenizer is chosen
    from sys.argv: '-tup' selects tuple tokens, '-stup' selects single+tuple
    tokens, and anything else selects single tokens.
    """
    cachePath = fn.create_prof_token_dict_name(sys.argv)

    if os.path.exists(cachePath):
        # NOTE: pickle.load must only ever be pointed at files this project
        # wrote itself.
        with open(cachePath, 'rb') as cacheFile:
            return pickle.load(cacheFile)

    if '-tup' in sys.argv:
        def tokenize(t):
            return count.create_tuple_tokens(t, stopwords, stmr)
    elif '-stup' in sys.argv:
        def tokenize(t):
            return count.create_single_tuple_tokens(t, stopwords, stmr)
    else:
        def tokenize(t):
            return count.create_single_tokens(t, stopwords, stmr)

    profs = read.prof_dicts()
    builtDict = count.create_prof_token_dict(profs, tokenize)

    with open(cachePath, 'wb') as cacheFile:
        pickle.dump(builtDict, cacheFile)

    return builtDict
# Example #11
# 0
def main():
    """Run (or reload) kNN rating predictions and plot their error.

    Options are read from ``sys.argv``:

    * ``-h``          -- print the usage message and exit.
    * ``-d``          -- use the difficulty ratings instead of the overall
      ratings.
    * ``-cos``        -- use cosine similarity.
    * ``-pear``       -- use absolute Pearson correlation (only when ``-cos``
      is absent). The default metric is inverse Euclidean distance.
    * ``-unweighted`` -- pass ``weighted=False`` to the kNN run.
    * ``-maxK N``     -- keep only the first N columns of the predictions.
    * ``-save``       -- pass a plot file name to the plotting call.

    Predictions and the similarity matrix are each read from their file when
    available, and computed and written otherwise. ``MaxK`` is not defined
    in this function and is expected to be a module-level name.

    Exits with status 1 when the vector file is needed but cannot be read.
    """
    if '-h' in sys.argv:
        print_usage_message()
        # sys.exit() rather than the interactive-only ``exit()`` helper that
        # the ``site`` module injects.
        sys.exit()

    vectFileName = fn.create_prof_vect_name(sys.argv, True)
    simMatFileName = fn.create_sim_mat_name(sys.argv)
    predsFileName = fn.create_preds_name(sys.argv)

    print(vectFileName)
    print(simMatFileName)
    print(predsFileName)

    # Grab the ratings vector
    if '-d' in sys.argv:
        ratings = read.difficulty_rating_vect(vectFileName)
    else:
        ratings = read.overall_rating_vect(vectFileName)

    # Assign similarity metric
    sim_f = vp.inverse_euclidean_distance
    if '-cos' in sys.argv:
        sim_f = vp.cosine_similarity
    elif '-pear' in sys.argv:
        sim_f = vp.abs_pearson_correlation

    # Set if weighted or not
    weighted = '-unweighted' not in sys.argv

    # Grab predictions or create them if not available
    predictions = read.knn_predictions(predsFileName)
    if predictions is None:

        simMat = read.similarity_matrix(simMatFileName)
        if simMat is None:
            wordVects = read.word_vects(vectFileName)
            if wordVects is None:
                print("Vector file " + vectFileName + " does not exist")
                # This is a failure, so report it through the exit status.
                sys.exit(1)
            wordVects = vp.process_token_vectors(wordVects, sys.argv)
            simMat = knn.get_similarity_matrix(wordVects, sim_f)
            write.similarity_matrix(simMat, simMatFileName)

        predictions = knn.knn_dataset(ratings, MaxK, simMat, weighted)
        write.knn_predictions(predictions, predsFileName)

    if '-maxK' in sys.argv:
        maxK = int(sys.argv[sys.argv.index('-maxK') + 1])
        predictions = predictions[:, :maxK]

    # Index sets for the two sub-plots.
    pidVect = read.pid_vect(vectFileName)
    singleRevIdxs = vp.pids_to_idxs(pidVect,
                                    read.pids_file(fn.PidsSingleRevFile))
    smallLenIdxs = vp.pids_to_idxs(pidVect,
                                   read.pids_file(fn.PidsSmallRevLenFile))

    plotFileName = None
    if '-save' in sys.argv:
        plotFileName = fn.create_knn_accuracy_plot_name(sys.argv)

    # Output results of the run
    plot.knn_error(
        predictions,
        ratings,
        title=plot.create_knn_error_title(sys.argv),
        idxToPlot=[singleRevIdxs, smallLenIdxs],
        subTitles=[
            "Error with profs with one review",
            "Error with profs with aggrigate review " +
            "lengths one std div above the mean " + "review length or less"
        ],
        saveFile=plotFileName)
# Example #12
# 0
def main():
    """Train a feed-forward network on professor token vectors and plot error.

    Options are read from ``sys.argv``:

    * ``-h``    -- print the usage message and exit.
    * ``-d``    -- use the difficulty ratings instead of the overall ratings.
    * ``-deep`` -- use ``ffnn.deep_model`` instead of ``ffnn.shallow_model``.
    * ``-save`` -- pass a plot file name to the plotting call.

    The rows selected by ``ffnn.non_single_small_idxs`` form the training
    set. The union of the single-review and small-review-length index sets
    forms the validation set.

    Exits with status 1 when the token vectors cannot be read.
    """
    if '-h' in sys.argv:
        print_usage_message()
        # sys.exit() rather than the interactive-only ``exit()`` helper that
        # the ``site`` module injects.
        sys.exit()

    vectorFileName = fn.create_prof_vect_name(sys.argv, True)
    tokenVects = read.word_vects(vectorFileName)
    if tokenVects is None:
        print("Could not find token vects")
        print("Use 'createProfVectors.py' to create vectors")
        # This is a failure, so report it through the exit status too.
        sys.exit(1)

    tokenVects = vp.process_token_vectors(tokenVects, sys.argv)

    if '-d' in sys.argv:
        ratings = read.difficulty_rating_vect(vectorFileName)
    else:
        ratings = read.overall_rating_vect(vectorFileName)

    # Create Training and validation sets
    pidVect = read.pid_vect(vectorFileName)

    nonSingleSmallIdxs = ffnn.non_single_small_idxs(pidVect)
    singleIdxs = vp.pids_to_idxs(pidVect, read.pids_file(fn.PidsSingleRevFile))
    smallIdxs = vp.pids_to_idxs(pidVect,
                                read.pids_file(fn.PidsSmallRevLenFile))
    # Sorted, de-duplicated union of both index sets.
    singleSmallIdxs = np.array(sorted(set(singleIdxs) | set(smallIdxs)))

    trainingVects = tokenVects[nonSingleSmallIdxs, :]
    trainingRatings = ratings[nonSingleSmallIdxs]

    validVects = tokenVects[singleSmallIdxs, :]
    validRatings = ratings[singleSmallIdxs]

    print(trainingVects.shape, trainingRatings.shape, validVects.shape,
          validRatings.shape)

    # Select and train model
    if '-deep' in sys.argv:
        model = ffnn.deep_model(tokenVects.shape[1])
    else:
        model = ffnn.shallow_model(tokenVects.shape[1])

    history = model.fit(trainingVects,
                        trainingRatings,
                        epochs=10,
                        batch_size=5,
                        validation_data=(validVects, validRatings))

    plotTitle = plot.ffnn_error_title(sys.argv)
    outfile = None
    if '-save' in sys.argv:
        outfile = fn.create_ffnn_plot_name(sys.argv)

    plot.ffnn_error(history, title=plotTitle, filename=outfile)