Example #1
def generateTask3QuestionData(hashList):
    # Fetch the questions for the given hashes, generate their tokens in
    # place, and return the enriched question records.
    questions = getQuestions(hashList)
    generateTokens(questions)
    return questions
Example #2
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from six import iteritems
from random import shuffle
import logging
import os
import sys

sys.path.insert(0, os.path.abspath('..'))
from utils.QuestionFileCreator import CreateFilePath, getQuestions, QuestionCleaner, initializeLog
from utils.sourceFiles import thisList

initializeLog()

new_dest = CreateFilePath('LsiModel')

stops = set(stopwords.words('english'))

questions = QuestionCleaner(getQuestions(thisList))

# Build a token -> id mapping over the lowercased, whitespace-tokenized
# question text.
dictionary = corpora.Dictionary(line['question'].lower().split()
                                for line in questions)
# remove stopwords
stop_ids = [
    dictionary.token2id[stopword] for stopword in stops
    if stopword in dictionary.token2id
]
# remove words only appearing once
once_ids = [
    tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1
]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
dictionary.save(new_dest + '.dict')
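
# The output prefix above is named 'LsiModel', so a natural follow-up (not part
# of the original snippet, only a sketch under that assumption) is to build a
# bag-of-words corpus from the same questions and train an LSI model on it;
# num_topics=100 is an assumed value.
from gensim.models import LsiModel

corpus = [
    dictionary.doc2bow(line['question'].lower().split())
    for line in questions
]
lsi = LsiModel(corpus, id2word=dictionary, num_topics=100)
lsi.save(new_dest + '.lsi')
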

def prepLabeledSentList(questions=(), withStops=False):
    mod_questions = []
    for q in questions:
        # Tokenize the question text (optionally keeping stopwords) and wrap
        # it as a TaggedDocument; the tags argument must be a list, so the
        # question id is wrapped in one.
        tokens = q['question'].lower().split()
        if not withStops:
            tokens = [t for t in tokens if t not in stops]
        mod_questions.append(TaggedDocument(tokens, [q['id']]))
    return mod_questions


def altDoc2Vec(questions):
    mod_questions = prepLabeledSentList(questions)
    # Doc2Vec parameters follow the gensim 2.x/3.x API (size= rather than
    # vector_size=).
    model = Doc2Vec(min_count=1,
                    window=10,
                    size=100,
                    sample=1e-4,
                    negative=5,
                    workers=8)
    model.build_vocab(mod_questions)
    shuffle(mod_questions)
    # Several training passes over the shuffled documents; gensim 2.x and
    # later require total_examples and epochs to be passed explicitly.
    for epoch in range(10):
        model.train(mod_questions,
                    total_examples=model.corpus_count,
                    epochs=1)
    return model
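

# Hedged usage sketch (not in the original snippet): once the model is trained,
# a new question can be embedded with infer_vector and compared against the
# tagged training questions; docvecs.most_similar is the gensim 2.x/3.x name,
# and mostSimilarQuestions is a hypothetical helper.
def mostSimilarQuestions(model, question_text, topn=5):
    tokens = [t for t in question_text.lower().split() if t not in stops]
    vec = model.infer_vector(tokens)
    return model.docvecs.most_similar([vec], topn=topn)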


questions = getQuestions(thisList)
model = altDoc2Vec(questions)

createPredictionFile(origQfilePath, model)