def POSTag(data):
    """ Preprocess (POS-tag) every question and comment and cache the tokens by id """
    # data : zip(questions, commentsL) ... see 'constructData'
    for q, cl in data:
        q_w = preprocessor(q[1])
        addToCache(q[0], q_w)
        for c in cl:
            c_w = preprocessor(c[1])
            addToCache(c[0], c_w)
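# 'addToCache' is defined elsewhere in the repo; a minimal stand-in so the
# snippet above runs on its own (hypothetical -- the real cache may be
# persisted to disk rather than kept in memory):
posCache = {}

def addToCache(key, words):
    """ Remember the preprocessed tokens for a question/comment id. """
    posCache[key] = words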
def predict(doc2vec, data, output, mlp=None):
    """ Answer Reranking with rank ~ cosine(q_i, a_i)^(-1) """
    # data : zip(questions, commentsL) ... see 'constructData'
    out = open(output, 'w')
    for q, cl in data:
        scores = []
        q_w = preprocessor(q[1])
        q_v = doc2vec.infer_vector(q_w)
        ac_v = getAverageCV(doc2vec, cl)
        for j, c in enumerate(cl):
            c_w = preprocessor(c[1])
            c_v = doc2vec.infer_vector(c_w)
            f_v = getFeatures(doc2vec, q_w, c_w,
                              {'qid': q[0], 'cid': c[0], 'rank': j})
            f_v.extend([cosine(q_v, c_v),
                        cosine(q_v, ac_v),
                        cosine(c_v, ac_v)])
            score, pred = predictAux(q_v, c_v, ac_v, f_v, mlp)
            scores.append([score, j, 0, pred])
        # rank comments by score (descending), then restore original order
        scores = sorted(scores, key=lambda score: score[0], reverse=True)
        for i in range(len(scores)):
            scores[i][2] = i + 1
        scores = sorted(scores, key=lambda score: score[1])
        # one line per comment: qid, cid, rank, score, prediction
        for score in scores:
            out.write('\t'.join([q[0], cl[score[1]][0],
                                 str(score[2]), str(score[0]), score[3]]))
            out.write('\n')
    out.close()
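# 'predictAux' is implemented elsewhere; below is a plausible, hypothetical
# sketch consistent with the feature layout built in 'trainNN' further down:
# score with the MLP when one is given, otherwise fall back to plain cosine
# similarity. The 'true'/'false' string matches the prediction field written
# by 'predict' above.
import numpy as np

def predictAux(q_v, c_v, ac_v, f_v, mlp):
    if mlp is None:
        return cosine(q_v, c_v), 'true'
    x = np.append(np.append(q_v, c_v), np.append(ac_v, f_v))
    p = mlp.predict_proba([x])[0][1]   # probability of the positive class
    return p, 'true' if p >= 0.5 else 'false'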
def to_array(self):
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    LabeledSentence(preprocessor(line),
                                    [prefix + '_%s' % item_no]))
    return self.sentences
def trainNN(doc2vec, data):
    """ Train MLP """
    mlp = MLPClassifier(solver=param['solver'],
                        hidden_layer_sizes=param['hidden'],
                        activation=param['activation'],
                        learning_rate='adaptive',
                        early_stopping=False,
                        random_state=1,
                        max_iter=1000,
                        verbose=True)
    X = []
    Y = []
    if data is not None:
        for q, cl in data:
            q_w = preprocessor(q[1])
            q_v = doc2vec.infer_vector(q_w)
            q_v /= norm(q_v)
            ac_v = getAverageCV(doc2vec, cl)
            for j, c in enumerate(cl):
                c_w = preprocessor(c[1])
                c_v = doc2vec.infer_vector(c_w)
                c_v /= norm(c_v)
                f_v = getFeatures(doc2vec, q_w, c_w,
                                  {'qid': q[0], 'cid': c[0], 'rank': j})
                f_v.extend([cosine(q_v, c_v),
                            cosine(q_v, ac_v),
                            cosine(c_v, ac_v)])
                # training example: [question vec | comment vec | avg comment vec | features]
                X.append(np.append(np.append(q_v, c_v), np.append(ac_v, f_v)))
                Y.append(transformLabel(c[2]))
        np.savez('out/trainNN.npz', x=X, y=Y)
    else:
        # reuse the features cached by a previous run
        npzfile = np.load('out/trainNN.npz')
        X = npzfile['x']
        Y = npzfile['y']
    mlp.fit(X, Y)
    return mlp
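# 'transformLabel' is not shown here; a plausible, hypothetical mapping for
# the SemEval cQA gold labels ('Good' / 'PotentiallyUseful' / 'Bad'):
def transformLabel(label):
    """ Binary relevance target: 1 for 'Good' comments, else 0. """
    return 1 if label == 'Good' else 0

# End-to-end usage sketch (the .d2v path is hypothetical; the config keys
# mirror the LDA script below):
import json
from gensim.models import Doc2Vec
from myutils import constructData

config = json.load(open('config.json', 'r'))
data = constructData(config['TRAIN_NN']['path'], config['TRAIN_NN']['files'])
doc2vec = Doc2Vec.load('models/dm/100d/semeval-train-lc-ns-5w-10e.d2v')
mlp = trainNN(doc2vec, data)
predict(doc2vec, data, 'out/predictions.tsv', mlp)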
def getAverageCV(doc2vec, cl):
    """ get (norm-ed) average comment vector """
    ac_v = None
    for c in cl:
        c_w = preprocessor(c[1])
        c_v = doc2vec.infer_vector(c_w)
        c_v /= norm(c_v)
        if ac_v is None:
            ac_v = np.zeros(len(c_v))
        ac_v += c_v
    if ac_v is None:
        return []
    # average the unit comment vectors, then L2-normalize the result
    ac_v /= len(cl)
    ac_v /= norm(ac_v)
    return ac_v
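# 'cosine' above is assumed to be plain cosine similarity; a minimal sketch:
import numpy as np
from numpy.linalg import norm

def cosine(u, v):
    """ Cosine similarity between two vectors. """
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    return float(np.dot(u, v) / (norm(u) * norm(v)))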
import json

from gensim import corpora
from gensim.models.ldamodel import LdaModel

# pre-processing utilities
from myutils import preprocessor, constructData, debug

config = json.load(open('config.json', 'r'))
dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
data = constructData(dataPath, fileList)

debug('====== CONSTRUCTING DOCS AND TEXTS ======')
docs = []
for q, cl in data:
    docs.append(q[1])
    for c in cl:
        docs.append(c[1])
texts = [preprocessor(d) for d in docs]

debug('====== CONSTRUCTING DICTIONARY ======')
dictionary = corpora.Dictionary(texts)
dictionary.save('models/lda/semeval.dict')

debug('====== CONSTRUCTING CORPUS ======')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('models/lda/semeval.mm', corpus)

debug('====== CONSTRUCTING LDA MODEL ======')
lda = LdaModel(corpus, num_topics=100)
lda.save('models/lda/semeval.lda')
debug('====== FINISHED ======')
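# The saved artifacts can later be reloaded and used to get a topic
# distribution for unseen text -- a minimal sketch (the query string is
# illustrative; the paths match the script above):
from gensim import corpora
from gensim.models.ldamodel import LdaModel

from myutils import preprocessor

dictionary = corpora.Dictionary.load('models/lda/semeval.dict')
lda = LdaModel.load('models/lda/semeval.lda')
bow = dictionary.doc2bow(preprocessor('how do I renew my residence visa?'))
topics = lda[bow]   # list of (topic_id, probability) pairs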
""" Loading trained Doc2Vec model """ windowsize = int(sys.argv[1]) dimension = int(sys.argv[4]) nepoch = int(sys.argv[2]) mode = sys.argv[3] name_tuple = ( data_prefix.strip('DATA').lower(), windowsize, nepoch ) model = Doc2Vec.load('./models/' + mode + '/' + str(dimension) + 'd' + '/semeval-%s-lc-ns-%dw-%de.d2v' % name_tuple) nsamp = 0 sqerr = 0.0 nsqerr = 0.0 sentences = [] with utils.smart_open(data_path) as fin: for item_no, line in enumerate(fin): sentences.append(line) words = preprocessor(line) model_v = model.docvecs[ data_prefix + '_%s' % item_no ] infer_v = model.infer_vector(words) sim = dot(model_v, infer_v) sqerr += ( ( 1 - sim ) * ( 1 - sim ) ) model_v /= norm(model_v) infer_v /= norm(infer_v) sim = dot(model_v, infer_v) nsqerr += ( ( 1 - sim ) * ( 1 - sim ) ) nsamp += 1 rsqerr = 0.0 rnsqerr = 0.0 shuffle(sentences) for item_no in range(nsamp): words = preprocessor(sentences[item_no])
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(preprocessor(line),
                                      [prefix + '_%s' % item_no])
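# Sketch of wiring a corpus class with 'to_array' / '__iter__' like the above
# into gensim Doc2Vec. The class name 'LabeledLineSentence', the source file,
# and the hyperparameters are illustrative; this targets gensim < 4.0, which
# still has LabeledSentence and the 'size' argument:
from gensim.models import Doc2Vec

corpus = LabeledLineSentence({'train.txt': 'TRAIN_DATA'})
model = Doc2Vec(min_count=1, window=5, size=100, negative=5, workers=4)
model.build_vocab(corpus.to_array())
model.train(corpus.sentences, total_examples=model.corpus_count, epochs=10)
model.save('models/example.d2v')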