예제 #1
0
def prepare_data():
    """Fetch and preprocess every post that has no annotation.

    Selects ``Headline`` and ``Body`` of all rows in ``Posts`` whose
    ``ID_Post`` does not occur in ``Annotations`` and maps ``preprocess``
    over the result rows in a pool of worker processes.

    Returns:
        list: one ``preprocess`` result per selected row.
    """
    logger.debug('Fetching unlabeled posts from database')
    sql = '''
        SELECT Headline, Body
        FROM Posts
        WHERE ID_Post NOT IN (
            SELECT DISTINCT ID_Post
            FROM Annotations
        )
    '''
    con = sqlite3.connect(conf.CORPUSDB)
    try:
        rows = con.execute(sql)
        pool = multiprocessing.Pool()
        try:
            posts = pool.map(preprocess, rows)
        finally:
            # Always release the worker processes, even if preprocessing
            # fails; otherwise they linger until interpreter exit.
            pool.close()
            pool.join()
    finally:
        con.close()
    return posts
예제 #2
0
def get_post_documents():
    """Yield a ``TaggedDocument`` for every post that has no annotation.

    The headline and body of each post are concatenated in SQL (NULLs are
    replaced by empty strings), then normalized and tokenized in a pool of
    worker processes. Rows are read in chunks of 100000 so that the whole
    table never has to be held in memory at once.

    Yields:
        TaggedDocument: the token list of one post, tagged with its
        ``ID_Post``.
    """
    logger.debug('Fetching unlabeled posts from database')
    sql = '''
        SELECT ID_Post, COALESCE(Headline, '') || ' ' || COALESCE(Body, '')
        FROM Posts
        WHERE ID_Post NOT IN (
            SELECT DISTINCT ID_Post
            FROM Annotations
        )
    '''
    con = sqlite3.connect(conf.CORPUSDB)
    try:
        cursor = con.execute(sql)
        pool = multiprocessing.Pool()
        try:
            while True:
                rows = cursor.fetchmany(100000)
                if not rows:
                    break
                logger.debug('Normalizing and tokenizing')
                texts = [row[1] for row in rows]
                wordlists = pool.map(micro_tokenize,
                                     pool.map(normalize, texts))
                for row, words in zip(rows, wordlists):
                    yield TaggedDocument(words, [row[0]])
        finally:
            # Runs on normal exhaustion, on errors, and when the consumer
            # abandons the generator early, so workers are never leaked.
            pool.close()
            pool.join()
    finally:
        con.close()
    logger.debug('End of generator')
from gensim.models.word2vec import Word2Vec
import numpy
from sklearn.cluster import KMeans

from customlogging import logger

import conf

if __name__ == '__main__':
    # Script entry point: cluster the word2vec word vectors with k-means.
    # NOTE(review): `os` and `sys` are used below but are not among the
    # imports visible next to this block -- confirm they are imported.

    # The trained model is expected at <conf.W2V_DIR>/model; without it there
    # is nothing to cluster, so exit with status 1.
    w2vmodelfile = os.path.join(conf.W2V_DIR, 'model')
    if not os.path.exists(w2vmodelfile):
        print('Word2vec model file "%s" not found.' % w2vmodelfile)
        print('Did you run train_word2vec.py?')
        sys.exit(1)

    logger.debug('Loading word embedding')
    emb = Word2Vec.load(w2vmodelfile)
    # NOTE(review): the word list is read from the model object itself while
    # the vectors are read from `emb.wv` -- confirm both refer to the same
    # vocabulary ordering in the installed gensim version.
    vocab = emb.index2word
    wordvecs = emb.wv.syn0

    # add UNK word at origin of embedding space
    # The append is done in place on the list obtained from the model
    # (presumably the model's own list, so the loaded model may be modified
    # too -- verify). `wordvecs` is rebound to a new, stacked array with one
    # extra all-zero row.
    vocab.append('UNK')
    wordvecs = numpy.vstack((wordvecs, numpy.zeros(wordvecs.shape[1])))

    # Cluster count, iteration limit and random seed all come from conf.
    clusterer = KMeans(n_clusters=conf.BOCID_NCLUSTERS,
                       random_state=conf.SEED,
                       max_iter=conf.BOCID_CLUSTITER,
                       n_jobs=-1)
    logger.debug('Starting clustering')
    # VC: result of fit_predict on the stacked vectors (including the UNK row).
    VC = clusterer.fit_predict(wordvecs)
    logger.debug('Matching words to cluster IDs')
예제 #4
0
             int(y_pred[i])])
    return resultrows


if __name__ == '__main__':
    if not os.path.exists(conf.RESULTDB):
        con_results = sqlite3.connect(conf.RESULTDB)
        con_results.execute(conf.RESULTDB_SETUP)
    else:
        con_results = sqlite3.connect(conf.RESULTDB)

    cats = get_categories()
    folds = get_folds()
    sql = 'INSERT INTO Results VALUES(?, ?, ?, ?, ?, ?)'
    for method in methodmodules.keys():
        logger.debug('-' * 40)
        logger.debug('Method %s', method)
        logger.debug('Computing results for %d categories and %d folds...' %
                     (len(cats), len(folds)))

        jobs = []
        for c in cats:
            for fold in folds:
                jobs.append([method, c, fold])

        # LSTM runs on GPU, where all memory is needed for a single job. Hence,
        # we need to run each job sequentially.
        if method == 'LSTM':
            results = list(itertools.starmap(evaluate, jobs))
        # For all other methods, we can spawn parallel processes.
        else:
예제 #5
0
            SELECT DISTINCT ID_Post
            FROM Annotations
        )
    '''
    r = con.execute(sql)
    pool = multiprocessing.Pool()
    posts = pool.map(preprocess, r)
    return posts


if __name__ == '__main__':
    # Script entry point: train a word2vec model on all unlabeled posts and
    # store it as <conf.W2V_DIR>/model.

    # exist_ok avoids the check-then-create race of exists() + mkdir() and
    # also creates missing parent directories.
    os.makedirs(conf.W2V_DIR, exist_ok=True)
    sentences = prepare_data()

    logger.debug('word2vec training...')
    # Route gensim's own progress messages (INFO level) to the root logger.
    logging.basicConfig(format='%(asctime)s [word2vec]: %(message)s',
                        level=logging.INFO)
    # Fixed seed and a single worker; dimensions and epoch count come from
    # conf.
    model = word2vec.Word2Vec(sentences,
                              size=conf.W2V_DIMS,
                              window=5,
                              min_count=5,
                              seed=conf.SEED,
                              workers=1,
                              iter=conf.W2V_EPOCHS)
    model.delete_temporary_training_data(
        replace_word_vectors_with_normalized=True)
    outfile = os.path.join(conf.W2V_DIR, 'model')
    logger.debug('Storing word2vec object to "%s"', outfile)
    model.save(fname_or_handle=outfile, separately=None, pickle_protocol=3)
    logger.debug('Finished.')
예제 #6
0
def _encode_wordlists(wordlists, emb, n_rows):
    """Turn tokenized posts into a zero-padded matrix of embedding indices.

    Args:
        wordlists: iterable of token sequences, one per post.
        emb: loaded word2vec model; ``w in emb`` and ``emb.vocab[w].index``
            are used for the lookup.
        n_rows: number of rows to allocate in the result.

    Returns:
        tuple: ``(ids, lengths)``. ``ids`` is an int32 array of shape
        ``(n_rows, conf.LSTM_MAXPOSTLEN)``; posts longer than that are
        truncated, shorter ones keep trailing zeros. ``lengths`` holds the
        untruncated token count of every post.
    """
    # Words missing from the vocabulary get the index just past the last
    # vocabulary entry.
    unk_index = len(emb.vocab)
    ids = numpy.zeros((n_rows, conf.LSTM_MAXPOSTLEN), dtype=numpy.int32)
    lengths = numpy.zeros((n_rows))
    for i, words in enumerate(wordlists):
        # NOTE(review): the length is not capped at conf.LSTM_MAXPOSTLEN even
        # though the ids are truncated -- confirm the model tolerates this.
        lengths[i] = len(words)
        for j, w in enumerate(words):
            if j >= conf.LSTM_MAXPOSTLEN:
                break
            ids[i, j] = emb.vocab[w].index if w in emb else unk_index
    return ids, lengths


def _predict_in_batches(sess, model, data, lengths):
    """Return the predicted class index for every row of ``data``.

    The rows are fed in slices of ``conf.LSTM_BATCHSIZE``. A short final
    slice is padded with zero rows up to that size and the predictions for
    the padding rows are discarded. Both dropout placeholders are fed 1.0.
    """
    batch_size = conf.LSTM_BATCHSIZE
    predictions = []
    for start in range(0, len(data), batch_size):
        inp = data[start:start + batch_size]
        leng = lengths[start:start + batch_size]
        extra = batch_size - len(inp)
        if extra > 0:
            inp = numpy.vstack((inp, numpy.zeros((extra, inp.shape[1]))))
            leng = numpy.concatenate((leng, numpy.zeros(extra)))
        pred = sess.run(model.prediction,
            {
                model.data: inp,
                model.lengths: leng,
                model.dropout_lstm: 1.0,
                model.dropout_fully: 1.0,
            }
        )
        pred = list(numpy.argmax(pred, axis=1))
        if extra > 0:
            pred = pred[:-extra]
        predictions.extend(pred)
    return predictions


def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    """Train the LSTM classifier on one fold and predict the test posts.

    The training data is split 80/20 (stratified) into a training and a
    validation part. After every epoch the model is scored on the training,
    validation and test data and the learning curves are plotted. The test
    predictions of the epoch with the best validation F1 are kept; training
    always runs for the full ``conf.LSTM_EPOCHS`` epochs.

    Args:
        cat: category identifier; only used in the plot file name.
        fold: fold number; only used in the plot file name.
        txt_train: raw texts of the training posts.
        txt_test: raw texts of the test posts.
        y_train: numpy array with the labels of the training posts.
        y_test: numpy array with the labels of the test posts.

    Returns:
        list: predicted class index (column of the one-hot encoding) for
        every test post, taken from the epoch with the best validation F1.
    """
    pool = multiprocessing.Pool()
    try:
        wordlists_train = pool.map(preprocess, txt_train)
        wordlists_test = pool.map(preprocess, txt_test)
    finally:
        # Release the workers even if preprocessing fails.
        pool.close()
        pool.join()

    emb = Word2Vec.load(os.path.join(conf.W2V_DIR, 'model'))
    # add point at origin for unknown words
    emb.wv.syn0 = numpy.vstack((emb.wv.syn0,
        numpy.zeros(emb.wv.syn0.shape[1], dtype=numpy.float32)))

    # replace words with embedding IDs, zero-padding and truncation
    X, X_lengths = _encode_wordlists(wordlists_train, emb, len(y_train))
    test_X, test_lengths = _encode_wordlists(wordlists_test, emb,
        len(y_test))

    # one-hot encode y
    enc = OneHotEncoder()
    y = enc.fit_transform(y_train.reshape(-1,1)).todense()
    test_y = enc.transform(y_test.reshape(-1,1)).todense()

    # split training data 80/20 into training and validation data for early
    # stopping
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
        random_state=conf.SEED)
    train_i, vali_i = next(splitter.split(X, y_train))
    X_vali = X[vali_i,:]
    y_vali = y[vali_i,:]
    vali_lengths = X_lengths[vali_i]
    X = X[train_i,:]
    y = y[train_i,:]
    X_lengths = X_lengths[train_i]

    # The reference labels of the validation and test data never change, so
    # decode them from the one-hot form once instead of once per epoch.
    vali_true = numpy.argmax(y_vali, axis=1)
    test_true = numpy.argmax(test_y, axis=1)

    plotfile = os.path.join(conf.LSTM_PLOTDIR,
        'plot_%s_%d.png' % (cat, fold))

    numpy.random.seed(conf.SEED)
    tf.set_random_seed(conf.SEED)
    model = LSTMModel(emb, y.shape[1])

    # Creating the session as
    #   tf.Session(config=tf.ConfigProto(
    #       inter_op_parallelism_threads=1,
    #       intra_op_parallelism_threads=1))
    # in combination with
    #   export CUDA_VISIBLE_DEVICES=""
    # in the shell disables all parallelism, which leads to reproducible
    # results but takes a very long time to complete.
    sess = tf.Session()
    try:
        sess.run(model.init_op)
        losses = []
        f1s_train = []
        precisions_vali = []
        recalls_vali = []
        f1s_vali = []
        precisions_test = []
        recalls_test = []
        f1s_test = []
        best_vali_f1 = -1.0
        best_y_pred = []
        for i in range(conf.LSTM_EPOCHS):
            # one training pass over all batches
            totalloss = 0.0
            predictions = []
            true = []
            batch_gen = stratified_batch_generator(X, y, X_lengths,
                conf.LSTM_BATCHSIZE)
            for inp, out, leng in batch_gen:
                # pad a short batch with zero rows up to the batch size
                extra = conf.LSTM_BATCHSIZE - len(inp)
                if extra > 0:
                    inp = numpy.vstack(
                        (inp, numpy.zeros((extra, inp.shape[1]))))
                    out = numpy.vstack(
                        (out, numpy.zeros((extra, out.shape[1]))))
                    leng = numpy.concatenate((leng, numpy.zeros(extra)))
                _, loss, pred = sess.run(
                    [
                        model.minimize,
                        model.cross_entropy,
                        model.prediction
                    ],
                    {
                        model.data: inp,
                        model.target: out,
                        model.lengths: leng,
                        model.dropout_lstm: conf.LSTM_DROPOUT_LSTM,
                        model.dropout_fully: conf.LSTM_DROPOUT_FULLY,
                    }
                )
                pred = list(numpy.argmax(pred, axis=1))
                true.extend(out)
                if extra > 0:
                    # drop the entries that belong to the padding rows
                    pred = pred[:-extra]
                    true = true[:-extra]
                predictions.extend(pred)
                totalloss += loss
            losses.append(totalloss)
            true = numpy.argmax(true, axis=1)
            # scikit-learn metrics take (y_true, y_pred); passing them the
            # other way round swaps precision and recall.
            with warnings.catch_warnings():
                warnings.simplefilter('ignore',
                    category=UndefinedMetricWarning)
                f1s_train.append(f1_score(true, predictions))

            # validation set scores
            predictions = _predict_in_batches(sess, model, X_vali,
                vali_lengths)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore',
                    category=UndefinedMetricWarning)
                precisions_vali.append(
                    precision_score(vali_true, predictions))
                recalls_vali.append(recall_score(vali_true, predictions))
                f1s_vali.append(f1_score(vali_true, predictions))

            # test set scores
            predictions = _predict_in_batches(sess, model, test_X,
                test_lengths)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore',
                    category=UndefinedMetricWarning)
                precisions_test.append(
                    precision_score(test_true, predictions))
                recalls_test.append(recall_score(test_true, predictions))
                f1s_test.append(f1_score(test_true, predictions))

            # "early stopping" (not really stopping): remember the test
            # predictions of the epoch with the best validation F1
            if f1s_vali[-1] > best_vali_f1:
                best_y_pred = predictions
                best_vali_f1 = f1s_vali[-1]
                logger.debug('New best Validation F1: %f', best_vali_f1)

            logger.debug('Epoch %3d of %3d, total loss = %.4f, '
                'F1_train = %.4f, F1_test = %.4f',
                i + 1, conf.LSTM_EPOCHS, totalloss, f1s_train[-1],
                f1s_test[-1])
            # the plot is redrawn after every epoch
            if not os.path.exists(conf.LSTM_PLOTDIR):
                os.mkdir(conf.LSTM_PLOTDIR)
            plot_losses_f1s(
                losses, f1s_train,
                precisions_vali, recalls_vali, f1s_vali,
                precisions_test, recalls_test, f1s_test,
                plotfile
            )
    finally:
        # Free the session and graph even when training fails, so that the
        # next job starts from a clean state.
        sess.close()
        del model
        tf.reset_default_graph()

    return best_y_pred
예제 #7
0
    logger.debug('End of generator')


if __name__ == '__main__':
    # Script entry point: train a doc2vec model on all unlabeled posts with a
    # manually controlled learning-rate schedule.

    # Route gensim's own progress messages (INFO level) to the root logger.
    logging.basicConfig(format='%(asctime)s [doc2vec] : %(message)s',
                        level=logging.INFO)

    # iter=1: the number of passes over the data is driven by the explicit
    # epoch loop below rather than by the model itself.
    d2v = Doc2Vec(dm=1,
                  size=conf.D2V_DIMS,
                  negative=5,
                  iter=1,
                  alpha=conf.D2V_ALPHA,
                  seed=conf.SEED,
                  workers=1)

    logger.debug('Building doc2vec vocabulary...')
    # get_post_documents() is a generator, so a fresh one is created for the
    # vocabulary scan and for every training epoch.
    d2v.build_vocab(get_post_documents())

    logger.debug('doc2vec training...')
    # Linear schedule: start at conf.D2V_ALPHA and lower the rate by the same
    # amount after every epoch. The last epoch therefore trains with
    # conf.D2V_MINALPHA + alpha_delta.
    alpha = conf.D2V_ALPHA
    alpha_delta = (conf.D2V_ALPHA - conf.D2V_MINALPHA) / conf.D2V_EPOCHS
    for i in range(conf.D2V_EPOCHS):
        logger.debug('Epoch %d of %d (alpha = %f)', i + 1, conf.D2V_EPOCHS,
                     alpha)
        # NOTE(review): only `alpha` is set here; `min_alpha` is left
        # untouched -- confirm whether gensim additionally decays the rate
        # inside each train() call.
        d2v.alpha = alpha
        d2v.train(get_post_documents(), report_delay=10.0)
        alpha -= alpha_delta

    # Create the output directory on first use.
    if not os.path.exists(conf.D2V_DIR):
        os.mkdir(conf.D2V_DIR)
    outfile = os.path.join(conf.D2V_DIR, 'model')