def prediction(fileName=None):
    """Score a question file with averaged word2vec vectors.

    The tab-separated file ``dataDirectory + fileName`` is read and its
    ``question`` / ``answerA`` .. ``answerD`` columns are handed to
    ``vectorAveraging``.

    * If the table has exactly 7 columns it is treated as labelled (it must
      carry a ``correctAnswer`` column): the accuracy is printed and the
      results array is saved to ``validationArray.csv``.
    * Otherwise the highest-scoring answer of every row is chosen and written
      to ``submissionDirectory + "test.csv"`` through the sample submission
      frame.

    Args:
        fileName: Name of the file inside ``dataDirectory``. When ``None`` the
            base name of ``sys.argv[1]`` is used.

    Returns:
        The object returned by ``vectorAveraging`` (not NaN-cleaned).
    """
    # fileName = "training_set.tsv"
    # fileName = "validation_set.tsv"
    if fileName is None:
        fileName = os.path.basename(sys.argv[1])

    # Read data
    predictionFile = pd.read_table(dataDirectory + fileName, sep='\t')
    sampleSubmission = pd.read_table(dataDirectory + "sample_submission.csv", sep=',')

    # Files compartmentalized
    questions = predictionFile["question"]
    answersA = predictionFile["answerA"]
    answersB = predictionFile["answerB"]
    answersC = predictionFile["answerC"]
    answersD = predictionFile["answerD"]

    # A 7-column table is the labelled layout (it includes "correctAnswer").
    hasCorrectAnswers = predictionFile.shape[1] == 7
    if hasCorrectAnswers:
        # Validation answers
        correctAnswers = predictionFile["correctAnswer"]
        # Pick, row by row, the sentence stored under that row's correct
        # letter. The previous expression built one whole column per row and
        # kept only the first, i.e. the full "answer<X>" column of the first
        # question's letter, which is wrong for every row with another letter.
        correctAnswersSentences = pd.Series(
            [predictionFile["answer" + str(letter)].iloc[rowIdx]
             for rowIdx, letter in enumerate(correctAnswers)],
            index=predictionFile.index)

    # Stop words, minus the ones that must be kept ("necessary" stop words).
    # A set difference is used so a word missing from the installed stop-word
    # lists does not raise ValueError the way list.remove() would.
    necesaryStopwords = ["itself", "all"]
    stopWords = list(
        set(get_stop_words('english') + stopwords.words('english') + [""])
        - set(necesaryStopwords))

    # Whoosh search (disabled alternative)
    # whooshIndex = open_dir("index2")
    # searchResults = questionsWhooshSearch(ix=whooshIndex, questions=questions, answersA=answersA,
    #                                       answersB=answersB, answersC=answersC, answersD=answersD,
    #                                       stopWordsList=stopWords)

    ## Whoosh Query
    # ix = open_dir("index2")
    # with ix.searcher() as searcher:
    #     query = QueryParser("content", ix.schema).parse(u'drinking water ver good nice')
    #     results = searcher.search(query)
    #     print results[0]

    # Load pickled files corpus
    # file = open("trainingQuestionsNumeric.pkl", 'r')
    # questions = pickle.load(file)
    # file.close()

    # Load the word2vec model
    wikiWord2Vec = Word2Vec.load_word2vec_format("word2Vec.model.bin", binary=True)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("word2Vec model loaded")

    # Corpus vocabulary (word2vec); converted to a set for speed.
    index2word_set = set(wikiWord2Vec.index2word)

    trainingQuestionsNumeric = questions2NumericArray(questionsVector=questions,
                                                      embeddingModel=wikiWord2Vec,
                                                      modelVocab=index2word_set,
                                                      stopWords4Fun=stopWords)

    # Dot product of the word vectors taken from
    # (https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors)
    if hasCorrectAnswers:
        results = vectorAveraging(word2VecModel=wikiWord2Vec,
                                  questionsArray=trainingQuestionsNumeric,
                                  answersA=answersA, answersB=answersB,
                                  answersC=answersC, answersD=answersD,
                                  modelsWordSet=index2word_set,
                                  correctLetter=correctAnswers,
                                  correctAnswersSentences=correctAnswersSentences,
                                  transformationFun=characterList2Numvec,
                                  stopWordsList=stopWords)

        # Accuracy on training set: column 5 of the results is summed and
        # divided by the number of questions (presumably a per-question
        # "answered correctly" flag -- TODO confirm against vectorAveraging).
        print("accuracy on training set is " +
              str(float(sum(results[:, 5])) / float(trainingQuestionsNumeric.shape[0])))

        # Save .csv for meta-analysis
        np.savetxt("validationArray.csv", results, delimiter=",")
    else:
        results = vectorAveraging(word2VecModel=wikiWord2Vec,
                                  questionsArray=trainingQuestionsNumeric,
                                  answersA=answersA, answersB=answersB,
                                  answersC=answersC, answersD=answersD,
                                  modelsWordSet=index2word_set,
                                  stopWordsList=stopWords,
                                  transformationFun=characterList2Numvec)

        # Remove NaN from arrays so they cannot win or break the comparison.
        valResults = np.nan_to_num(results)

        # For each row take the first position holding the row maximum and
        # map it to its answer letter (np.argmax returns the first maximum).
        possibleAnswers = ["A", "B", "C", "D"]
        predictedAnswerVal = [possibleAnswers[int(np.argmax(rowScores))]
                              for rowScores in valResults]

        # Write a submission file
        sampleSubmission["correctAnswer"] = predictedAnswerVal
        sampleSubmission.to_csv(submissionDirectory + "test.csv", sep=",", index=False)

    # Return results for further analysis
    return results
# wikiDoc2Vec.train(wikipedia8thGradeTaggedDocument) # wikiDoc2Vec.alpha -= 0.002 # decrease the learning rate # wikiDoc2Vec.min_alpha = wikiDoc2Vec.alpha # fix the learning rate, no decay # print "epoch number " + str(epoch + 1) + " finished" #print "training DONE!" #Corpus vocabulary Convert it to a set, for speed. word2vec index2word_set = set(wikiWord2Vec.index2word) #Corpus vocabulary Convert it to a set, for speed, Doc2Vec #index2Doc2VecWord_set = set(wikiDoc2Vec.index2word) #Dot product of the word vectors taken from (https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-3-more-fun-with-word-vectors) # Index2word is a list that contains the names of the words in the model's vocabulary. Convert it to a set, for speed trainingResults = vectorAveraging(word2VecModel=wikiWord2Vec, questions=trainQuestions, answersA=answersA, answersB=answersB, answersC=answersC, answersD=answersD, modelsWordSet=index2word_set, correctLetter=correctAnswers, correctAnswersSentences=correctAnswersSentences, transformationFun=characterList2Numvec, stopWordsList=stopWords) #Accuracy on training set print "accuracy on training set is " + str(float(sum(trainingResults[:, 5])) / float(len(trainQuestions))) #Doc2Vec native distance # Index2word is a list that contains the names of the words in the model's vocabulary. Convert it to a set, for speed #trainingResultsDoc2Vec = vectorAveraging(word2VecModel=wikiDoc2Vec, questions=trainQuestions, answersA=answersA, # answersB=answersB, answersC=answersC, answersD=answersD, # modelsWordSet=index2Doc2VecWord_set, correctLetter=correctAnswers, # correctAnswersSentences=correctAnswersSentences, transformationFun=list2List, # stopWordsList=stopWords) #Accuracy on training set