Example #1
def main(argv):
    # Check that the right number of arguments was passed

    if len(argv) != 1:
        print("Usage: python3 main.py folder_with_datasets")
        exit(-1)

    print("Opening Datasets:")
    data = []

    # Read and convert training data
    try:
        for x in os.listdir(argv[0]):
            print(x)
            # os.path.join handles a trailing slash on the folder argument
            data.append([x, loadData(os.path.join(argv[0], x))])

    except FileNotFoundError:
        print("That is not a valid directory")
        exit(-1)

    # Train SVM
    print("\nTraining SVM")
    trained = tools_svm.train_svm(data)

    # Pickle it for later
    with open("trained.obj", "wb") as pickle_file:
        pickle.dump(trained, pickle_file)

    print("\nSVM Stored")
Example #2
def processFile(file):
    file_name = file.split('/')[-1].split('.')[0]

    df = loader.loadData(file)

    df = preProcessing.targetToNum(df)
    df = preProcessing.createLabel(df)
    df_text = df[['AwardedAmountToDate', 'Abstract']]
    df_num = df.drop('Abstract', axis=1)

    df_text = preProcessing.htmlTagRemover(df_text)
    df_text = preProcessing.characterRemover(df_text)
    df_text = preProcessing.tokenizer(df_text)
    df_text = preProcessing.stemAndLemma(df_text)
    df_text = preProcessing.stopwordsRemover(df_text)
    #    df_text_untagged = df_text.drop('AwardedAmountToDate', axis=1)

    df_num = preProcessing.nonPredictiveFeatureRemover(df_num)
    df_num = preProcessing.processDateFeatures(df_num)
    df_num = preProcessing.processCategoricalFeatures(df_num)
    #    df_num_untagged = df_num.drop('AwardedAmountToDate', axis=1)

    text_file_name = file_name + '_text.pkl'
    num_file_name = file_name + '_num.pkl'
    with open(addresses['processed'] + text_file_name, 'wb') as f:
        pickle.dump(df_text, f)
    with open(addresses['processed'] + num_file_name, 'wb') as f:
        pickle.dump(df_num, f)
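For later modelling steps, the two frames written by processFile() can be read back the same way they were dumped. A minimal sketch with illustrative paths (the real paths are addresses['processed'] plus the file names built above):

import pickle

def load_processed(path):
    # Reload a DataFrame that processFile() pickled to disk.
    with open(path, 'rb') as f:
        return pickle.load(f)

df_text = load_processed('processed/example_text.pkl')  # illustrative path
df_num = load_processed('processed/example_num.pkl')     # illustrative path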
Example #3
def analyzeBestFitUser():
    """
    Take a closer look at the user with the best predictions on the test set.
    """
    movies, movieTagMat, userRankMat, testCases = loadData()
    user2userPredictor = user2user(userRankMat, topK=105)
    item2itemPredictor = item2item(userRankMat, movieTagMat, topK=20)

    # do test
    # _, results = predictTest(user2userPredictor, testCases, "")
    # _, results = predictTest(item2itemPredictor, testCases, "")
    # userAvgSSE = defaultdict(float)
    # for res in results:
    #     userAvgSSE[res[0]] += (res[2] - res[1]) ** 2
    # sse = list(userAvgSSE.items())
    # sse.sort(key=lambda x: x[1])
    # # best-fit user
    # uid, minSSE = sse[0]
    # print("(uid, smallest SSE): ({}, {})".format(uid, minSSE))
    uid = 480
    # do recommend
    # Compare results from different recommenders
    # recommender = Recommender(movieTagMat, userRankMat, movies, user2userPredictor)
    recommender = Recommender(movieTagMat, userRankMat, movies,
                              item2itemPredictor)
    recommendMovies = recommender.doRecommend(uid, 50)["recommended_movies"]
    print("recommended movies:")
    recommendedCategory = defaultdict(int)
    for m, r in recommendMovies.items():
        for genre in movies[r[0]].genres:
            recommendedCategory[genre] += 1
    for k, v in sorted(recommendedCategory.items(),
                       key=lambda d: d[0],
                       reverse=True):
        print("{}: {}".format(k, v))
    print("")
    # compare
    print("His or her favorite movies:")
    userRank = userRankMat[uid]
    idx = np.argsort(-userRank)[:50]
    userLikeCategory = defaultdict(int)
    for i in idx:
        for genre in movies[i].genres:
            userLikeCategory[genre] += 1
    for k, v in sorted(userLikeCategory.items(),
                       key=lambda d: d[0],
                       reverse=True):
        print("{}: {}".format(k, v))

    print("")
    for k, v in recommendedCategory.items():
        if k in userLikeCategory:
            print("{},{},{}".format(k, v, userLikeCategory[k]))
        else:
            print("{},{},0".format(k, v))
    for k, v in userLikeCategory.items():
        if k not in recommendedCategory:
            print("{},0,{}".format(k, v))
Example #4
def drawTopK_u2u() -> None:
    _, _, userRankMat, testSet = loadData()
    topKLst = list(range(1, 335))
    sseLst = []
    for topK in topKLst:
        sse, _ = predictTest(user2user(userRankMat, topK), testSet, "")
        sseLst.append(sse)
    draw(topKLst, sseLst, "Top K", "SSE", "Line chart of Top K vs. SSE (1,335)", 1, 1)
    draw(topKLst[50:], sseLst[50:], "Top K", "SSE",
         "Line chart of Top K vs. SSE (50,335)", 2, 1)
Example #5
def drawTopK_i2i() -> None:
    _, movieTagMat, userRankMat, testSet = loadData()
    # topKLst = list(range(1, 101))
    topKLst = list(range(1, 1301, 20))
    sseLst = []
    for topK in topKLst:
        sse, _ = predictTest(item2item(userRankMat, movieTagMat, topK=topK),
                             testSet, "")
        print(sse)
        sseLst.append(sse)
    draw(topKLst, sseLst, "Top K", "SSE", "Line chart of Top K vs. SSE (1,1300)", 1, 1)
Example #6
def drawHashNumber_i2i() -> None:
    _, movieTagMat, userRankMat, testSet = loadData()
    sseLst = []
    for hashFuncNum in range(1, 21):
        sse, _ = predictTest(
            item2item(userRankMat,
                      movieTagMat,
                      topK=20000,
                      minHashParas=(hashFuncNum, 0, 2**32 - 1, 4294967311)),
            testSet, "")
        print(sse)
        sseLst.append(sse)
    draw(list(range(1, 21)), sseLst, "Number of hash functions", "SSE",
         "Line chart of the number of hash functions vs. SSE", 1, 10)
Example #7
def drawHashNumber_u2u() -> None:
    _, _, userRankMat, testSet = loadData()
    threshold = 2.5
    hashFuncNumber = range(100, 2001, 50)
    sseLst = []
    for hashFuncNum in hashFuncNumber:
        sse, _ = predictTest(
            user2user(userRankMat,
                      topK=105,
                      threshold=threshold,
                      minHashParas=(hashFuncNum, 0, 2**32 - 1, 4294967311)),
            testSet, "")
        print(sse)
        sseLst.append(sse)
    draw(hashFuncNumber, sseLst, "Number of hash functions", "SSE",
         "Line chart of the number of hash functions vs. SSE", 1, 10)
Example #8
def doRecommender() -> None:
    """
    Generate recommendations for every user and save the results as JSON files in the recommend folder.
    """
    begin = time()
    movies, movieTagMat, userRankMat, _ = loadData()
    predictor = item2item(userRankMat, movieTagMat, topK=100)
    recommender = Recommender(movieTagMat, userRankMat, movies, predictor)
    resFilePrefix: str = "./recommend/user"
    for i in range(userRankMat.shape[0]):
        with open(resFilePrefix + str(i + 1) + ".json", "w") as f:
            f.write(
                json.dumps(recommender.doRecommend(i, 50),
                           indent=4,
                           separators=(',', ':')))
    end = time()
    print("total time usage: {}".format(end - begin))
Example #9
import loader
#from collections import Counter
#import model
#import pregex as pre

data, group_idxs, test_data = loader.loadData("./data/csv.p",
                                              n_examples=1000,
                                              n_tasks=50,
                                              max_length=15)
#M = loader.load('./models/task38.pt')
#net = M['net']
#trace = M['trace']
#concepts = trace.baseConcepts

#r = pre.create("(NA)|(NA)")
#print(trace.model.scoreregex(r, trace))
# for concept in concepts:
# 	print(str(concept))
# 	# c = Counter(concept.sample(trace) for _ in range(1000))
# 	# samples = sorted(c, key=c.get, reverse=True)
# 	# print(samples)
# 	# print()

for i in range(len(test_data)):
    print(i, list(set(test_data[i]))[:5])

print(len(data), "train +", len(test_data), "test =",
      len(data) + len(test_data), "total")
Example #10
import argparse

import loader

parser = argparse.ArgumentParser()
parser.add_argument('--data_file', type=str, default="./data/csv.p")
parser.add_argument('--n_tasks', type=int, default=40) #Per max_length
parser.add_argument('--n_examples', type=int, default=500)
parser.add_argument('--max_length', type=int, default=15) #maximum length of inputs or targets
args = parser.parse_args()
print("Loading data...")
data, group_idxs, test_data = loader.loadData(args.data_file, args.n_examples, args.n_tasks, args.max_length)

print("\nTraining Data:")
for X in data:
	print(X[:5])

print("\nTest Data:")
for X in test_data:
	print(X[:5])
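For reference, a script like this would typically be run as python <script>.py --data_file ./data/csv.p --n_examples 500 --n_tasks 40 --max_length 15; each flag falls back to the default given in the corresponding add_argument call.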
Example #11
import loader
from propose import Proposal, evalProposal, getProposals, networkCache
import util

import torch

import os
import math
import argparse
import pregex as pre
from trace import RegexWrapper

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=max(('results/%s'%x for x in os.listdir('results') if x[-3:]==".pt"), key=os.path.getmtime)) #Most recent model
args = parser.parse_args()

print("Loading", args.model)
M = loader.load(args.model)

if 'net' in M and M['net'] is not None:
	if torch.cuda.is_available(): M['net'].cuda()
	net = M['net']

data, group_idxs, test_data = loader.loadData(M['args'].data_file, M['args'].n_examples, M['args'].n_tasks, M['args'].max_length)

trace = M['trace']
model = trace.model
Example #12
def load_data():
    matrix, y_vector = loader.loadData("../smartphone.txt")
    matrix = loader.tune_matrix(matrix)
    y_vector = np.transpose(y_vector)
    return matrix, y_vector
Example #13
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 24 10:59:50 2019

@author: xinning.w
"""
import loader
import preProcessing

PATH_TO_DATA = '/Users/mengjie/Documents/Courses/SpringII/6450NLP/Awards_data/'
df = loader.loadData(PATH_TO_DATA)

df = preProcessing.htmlTagRemover(df)
df = preProcessing.characterRemover(df)
df = preProcessing.tokenizer(df)
df = preProcessing.stemAndLemma(df)
df = preProcessing.stopwordsRemover(df)
df = preProcessing.extractVectorMatrix(df)
df = preProcessing.nonPredictiveFeatureRemover(df)
df = preProcessing.processDateFeatures(df)
df = preProcessing.processCategoricalFeatures(df)

df.to_csv('processed.csv', index=False)

Example #14
# IO
name = param['name'] if 'name' in param else '.'.join(sys.argv[1].split(os.sep)[-1].split('.')[:-1])
err_output_folder = param['errOutputFolder']
model_saved_folder = param['modelSavedFolder']
gradient_saved_folder = param.get('gradientSavedFolder')
out_file = param['outFile']

dictionary=loader.loadDict(dict_file)
dictionary[-1]=np.random.randn(vector_dim)*0.5
if not os.path.exists(model_saved_folder):
    os.makedirs(model_saved_folder)
if gradient_saved_folder is not None:
    if not os.path.exists(gradient_saved_folder):
        os.makedirs(gradient_saved_folder)

train_index=loader.loadData(trainXFile)
train_label=loader.loadData(trainYFile)
if not train_only:
    test_index=loader.loadData(testXFile)
    test_label=loader.loadData(testYFile)

rnn = RNNs.RNNs(neurons=neurons, nonlinearity=nonlinearity)
if model2load is not None:
    print('load weights from file: %s' % model2load)
    rnn.load(model2load, testOnly=True)

results = '''mode=%s,U_lr=%s,W_lr=%s,V_lr=%s,s_lr=%s,config file=%s\n''' % (
    mode, str(learn_rate['U']), str(learn_rate['W']), str(learn_rate['V']),
    str(learn_rate['s']), sys.argv[1])
print(results, end='')

# LOSS AND PARAMETER UPDATES
loss = lasagne.objectives.aggregate(
    lasagne.objectives.categorical_crossentropy(
        lasagne.layers.get_output(output, x), y), mode='mean')
updates = lasagne.updates.adagrad(loss, dcnnParams, learning_rate=0.1)

# ACCURACY FOR PREDICTIONS
prediction = T.argmax(lasagne.layers.get_output(output, x, deterministic=True), axis=1)
score = T.eq(prediction, y).mean()

# SYMBOLIC FUNCTIONS
trainDCNN = theano.function([x,y], outputs = loss, updates = updates)
validateDCNN = theano.function([x,y], outputs = score)
testDCNN = theano.function([x,y], outputs = score)

# LOAD THE DATA
trainingSentences = loader.loadData('myDataset/train.txt')
trainingLabels = loader.loadData('myDataset/train_label.txt')
validationSentences = loader.loadData('myDataset/dev.txt')
validationLabels = loader.loadData('myDataset/dev_label.txt')
testSentences = loader.loadData('myDataset/test.txt')
testLabels = loader.loadData('myDataset/test_label.txt')

# TRAIN THE MODEL
print('...training the DCNN')
for epoch in range(NUMOFEPOCHS):
    for i in range(len(trainingSentences)):
        trainDCNN(np.asarray(trainingSentences[i:i+1], dtype=np.int32),
                  np.asarray(trainingLabels[i], dtype=np.int32))
        print('Sentence ', i, ' complete.')

# SAVE THE TRAINED MODEL