Example #1
def POSTag(data):
    """ preprocess every question and comment; cache token lists by id """
    for q, cl in data:
        q_w = preprocessor(q[1])
        addToCache(q[0], q_w)
        for c in cl:
            c_w = preprocessor(c[1])
            addToCache(c[0], c_w)
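
`preprocessor` and `addToCache` are project helpers defined elsewhere (`preprocessor` comes from `myutils`, per Example #6). A minimal sketch of plausible implementations, purely for context; the tokenization rule and the module-level cache are assumptions, not the project's actual code:

import re

_cache = {}   # hypothetical module-level cache: id -> token list

def preprocessor(text):
    # assumed behavior: lowercase and split into simple word tokens
    return re.findall(r"[a-z0-9']+", text.lower())

def addToCache(key, words):
    # store the preprocessed token list under its question/comment id
    _cache[key] = words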
Example #2
def predict(doc2vec, data, output, mlp=None):
    """ Answer Reranking with rank ~ cosine(q_i, a_i)^(-1) """
    # data : zip(questions, commentsL) ... see 'constructData'
    out = open(output, 'w')
    for q, cl in data:
        scores = []
        q_w = preprocessor(q[1])
        q_v = doc2vec.infer_vector(q_w)
        ac_v = getAverageCV(doc2vec, cl)
        for j, c in enumerate(cl):
            c_w = preprocessor(c[1])
            c_v = doc2vec.infer_vector(c_w)
            f_v = getFeatures(doc2vec, q_w, c_w,
                { 'qid' : q[0], 'cid' : c[0], 'rank' : j })
            f_v.extend(
                [cosine(q_v, c_v),
                 cosine(q_v, ac_v),
                 cosine(c_v, ac_v)])
            score, pred = predictAux(q_v, c_v, ac_v, f_v, mlp)
            scores.append([score, j, 0, pred])
        # rank by descending score, then restore the original comment order
        scores = sorted(scores, key=lambda score: score[0], reverse=True)
        for i in range(len(scores)):
            scores[i][2] = i + 1
        scores = sorted(scores, key=lambda score: score[1])
        for score in scores:
            out.write('\t'.join([
                q[0], cl[score[1]][0],
                str(score[2]),
                str(score[0]), score[3]
            ]))
            out.write('\n')
    out.close()
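
A usage sketch for `predict`; the model path and output file names are assumptions, while `constructData` and `trainNN` are the project functions shown in the other examples:

from gensim.models import Doc2Vec

# hypothetical driver: write one tab-separated line per comment:
# qid, cid, rank, score, prediction
doc2vec = Doc2Vec.load('models/d2v/semeval.d2v')    # path is an assumption
data = constructData(dataPath, fileList)            # see Example #6
predict(doc2vec, data, 'out/scores.pred')           # cosine-only scoring
mlp = trainNN(doc2vec, data)
predict(doc2vec, data, 'out/scores.mlp.pred', mlp)  # MLP-assisted scoring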
Example #3
    def to_array(self):
        """ materialize all tagged sentences into a list """
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(
                        LabeledSentence(preprocessor(line),
                                        [prefix + '_%s' % item_no]))
        return self.sentences
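
`to_array` expects `self.sources` to map a file path to a label prefix, tagging each line of each file. A sketch of how the class might be driven; the class name `LabeledLineSentence` and the file names are assumptions:

sources = {
    'data/train-questions.txt': 'TRAIN_QUESTION',   # file names assumed
    'data/train-comments.txt':  'TRAIN_COMMENT',
}
corpus = LabeledLineSentence(sources)   # class name assumed from context
sentences = corpus.to_array()           # yields tags like 'TRAIN_COMMENT_0'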
Example #4
import numpy as np
from numpy.linalg import norm
from sklearn.neural_network import MLPClassifier
# preprocessor, getFeatures, cosine, transformLabel are project helpers

def trainNN(doc2vec, data):
    """ Train MLP; 'param' is the project-level hyperparameter dict """
    mlp = MLPClassifier(solver=param['solver'],
                        hidden_layer_sizes=param['hidden'],
                        activation=param['activation'],
                        learning_rate='adaptive',
                        early_stopping=False,
                        random_state=1,
                        max_iter=1000,
                        verbose=True)
    X = []
    Y = []
    if data is not None:
        for q, cl in data:
            q_w = preprocessor(q[1])
            q_v = doc2vec.infer_vector(q_w)
            q_v /= norm(q_v)
            ac_v = getAverageCV(doc2vec, cl)
            for j, c in enumerate(cl):
                c_w = preprocessor(c[1])
                c_v = doc2vec.infer_vector(c_w)
                c_v /= norm(c_v)
                f_v = getFeatures(doc2vec, q_w, c_w,
                    { 'qid' : q[0], 'cid' : c[0], 'rank' : j })
                f_v.extend(
                    [cosine(q_v, c_v),
                     cosine(q_v, ac_v),
                     cosine(c_v, ac_v)])
                # training row = [q_v | c_v | ac_v | hand-crafted features]
                X.append(np.append(np.append(q_v, c_v), np.append(ac_v, f_v)))
                Y.append(transformLabel(c[2]))
        np.savez('out/trainNN.npz', x=X, y=Y)
    else:
        npzfile = np.load('out/trainNN.npz')
        X = npzfile['x']
        Y = npzfile['y']
    mlp.fit(X, Y)
    return mlp
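
`transformLabel` converts the gold label stored in `c[2]` into a class index for the MLP. The label set and the mapping below are assumptions, shown only to make the training loop concrete:

def transformLabel(label):
    # assumed binary mapping; non-'Good' labels collapse to class 0
    return 1 if label == 'Good' else 0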
Example #5
from numpy.linalg import norm

def getAverageCV(doc2vec, cl):
    """ get (norm-ed) average comment vector """
    ac_v = None
    for c in cl:
        c_w = preprocessor(c[1])
        c_v = doc2vec.infer_vector(c_w)
        c_v /= norm(c_v)                    # unit-normalize each comment
        ac_v = c_v.copy() if ac_v is None else ac_v + c_v
    if ac_v is None:                        # no comments to average
        return []
    ac_v /= len(cl)                         # mean of the unit vectors
    ac_v /= norm(ac_v)                      # unit-normalize the mean
    return ac_v
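
For reference, a vectorized sketch equivalent to the loop above (unit-normalize each comment vector, take the mean, then unit-normalize the mean); `getAverageCV_np` is a hypothetical name:

import numpy as np
from numpy.linalg import norm

def getAverageCV_np(doc2vec, cl):
    if not cl:
        return []
    vecs = np.array([doc2vec.infer_vector(preprocessor(c[1])) for c in cl])
    vecs /= norm(vecs, axis=1, keepdims=True)   # unit-normalize each row
    ac_v = vecs.mean(axis=0)
    return ac_v / norm(ac_v)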
Example #6
import json

from gensim import corpora
from gensim.models.ldamodel import LdaModel

# pre-processing utilities
from myutils import preprocessor, constructData, debug

config = json.load(open('config.json', 'r'))

dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
data = constructData(dataPath, fileList)

debug('====== CONSTRUCTING DOCS AND TEXTS ======')
docs = []
for q, cl in data:
    docs.append(q[1])
    for c in cl:
        docs.append(c[1])
texts = [preprocessor(d) for d in docs]

debug('====== CONSTRUCTING DICTIONARY ======')
dictionary = corpora.Dictionary(texts)
dictionary.save('models/lda/semeval.dict')

debug('====== CONSTRUCTING CORPUS ======')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('models/lda/semeval.mm', corpus)

debug('====== CONSTRUCTING LDA MODEL ======')
lda = LdaModel(corpus, num_topics=100)
lda.save('models/lda/semeval.lda')

debug('====== FINISHED ======')
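
The saved dictionary, corpus, and model can be reloaded later with standard gensim calls; the query string below is just an illustration:

from gensim import corpora
from gensim.models.ldamodel import LdaModel

dictionary = corpora.Dictionary.load('models/lda/semeval.dict')
corpus = corpora.MmCorpus('models/lda/semeval.mm')
lda = LdaModel.load('models/lda/semeval.lda')
print(lda.show_topics(num_topics=5))               # peek at a few topics
bow = dictionary.doc2bow(preprocessor('where can I buy a laptop'))
print(lda[bow])                                    # topic mix of a new doc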
Example #7
""" Loading trained Doc2Vec model """
windowsize = int(sys.argv[1])
dimension = int(sys.argv[4])
nepoch = int(sys.argv[2])
mode = sys.argv[3]
name_tuple = ( data_prefix.strip('DATA').lower(), windowsize, nepoch )
model = Doc2Vec.load('./models/' + mode + '/' + str(dimension) + 'd' + '/semeval-%s-lc-ns-%dw-%de.d2v' % name_tuple)

nsamp = 0
sqerr = 0.0
nsqerr = 0.0
sentences = []
with utils.smart_open(data_path) as fin:
    for item_no, line in enumerate(fin):
        sentences.append(line)
        words = preprocessor(line)
        model_v = model.docvecs[ data_prefix + '_%s' % item_no ]
        infer_v = model.infer_vector(words)
        sim = dot(model_v, infer_v)
        sqerr += ( ( 1 - sim ) * ( 1 - sim ) )
        model_v /= norm(model_v)
        infer_v /= norm(infer_v)
        sim = dot(model_v, infer_v)
        nsqerr += ( ( 1 - sim ) * ( 1 - sim ) )
        nsamp += 1

rsqerr = 0.0
rnsqerr = 0.0
shuffle(sentences)
for item_no in range(nsamp):
    # assumed continuation: mirror the measurement above, but pair each
    # stored vector with a shuffled sentence to get a chance baseline
    words = preprocessor(sentences[item_no])
    model_v = model.docvecs[ data_prefix + '_%s' % item_no ]
    infer_v = model.infer_vector(words)
    sim = dot(model_v, infer_v)
    rsqerr += ((1 - sim) * (1 - sim))
    model_v /= norm(model_v)
    infer_v /= norm(infer_v)
    sim = dot(model_v, infer_v)
    rnsqerr += ((1 - sim) * (1 - sim))
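
A short reporting sketch for the four accumulators; if the model has learned anything, the shuffled-baseline errors should be markedly higher:

print('inferred vs stored, raw:    %.4f' % (sqerr / nsamp))
print('inferred vs stored, cosine: %.4f' % (nsqerr / nsamp))
print('shuffled baseline, raw:     %.4f' % (rsqerr / nsamp))
print('shuffled baseline, cosine:  %.4f' % (rnsqerr / nsamp))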
Example #8
    def __iter__(self):
        """ stream tagged sentences one at a time (memory-friendly) """
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(preprocessor(line),
                                          [prefix + '_%s' % item_no])
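
Because the class is iterable, it can be streamed straight into Doc2Vec training without materializing the corpus. A sketch; the hyperparameters are assumptions, and recent gensim spells the dimension `vector_size` (older releases used `size`):

corpus = LabeledLineSentence(sources)     # class name assumed, as above
model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4)
model.build_vocab(corpus)                 # first streaming pass
model.train(corpus, total_examples=model.corpus_count, epochs=20)
model.save('models/d2v/semeval.d2v')      # path is an assumption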