Example #1
from xml.dom import minidom

from myutils import debug

# module-level caches populated by constructMetaData
meta_cache = {}
unique_cats = []


def constructMetaData(dataPath, fileList):
    for xmlFile in fileList:
        debug(dataPath + xmlFile)
        doc = minidom.parse(dataPath + xmlFile)
        threads = doc.getElementsByTagName("Thread")
        for thread in threads:
            relQ = thread.getElementsByTagName('RelQuestion')[0]
            Qid = relQ.getAttribute('RELQ_ID')
            meta_cache[Qid] = {
                'author': relQ.getAttribute('RELQ_USERID'),
                'category': relQ.getAttribute('RELQ_CATEGORY'),
                'time': relQ.getAttribute('RELQ_DATE')
            }
            if meta_cache[Qid]['category'] not in unique_cats:
                unique_cats.append(meta_cache[Qid]['category'])
            user_tracker = {}
            for relC in thread.getElementsByTagName('RelComment'):
                Cid = relC.getAttribute('RELC_ID')
                meta_cache[Cid] = {
                    'author': relC.getAttribute('RELC_USERID'),
                    'time': relC.getAttribute('RELC_DATE')
                }
                author = meta_cache[Cid]['author']
                user_tracker[author] = user_tracker.get(author, 0) + 1
                # running position of this comment among the author's comments
                meta_cache[Cid]['comment#'] = user_tracker[author]
            # second pass: record each author's final comment count in the thread
            for relC in thread.getElementsByTagName('RelComment'):
                Cid = relC.getAttribute('RELC_ID')
                meta_cache[Cid]['#comment'] = user_tracker[meta_cache[Cid]['author']]
Example #2
import json

import numpy as np

from myutils import debug

# gensim modules
from gensim.models import Doc2Vec

# KMeans clustering
from sklearn.cluster import KMeans

# Model persistence (sklearn.externals.joblib was removed from newer
# scikit-learn; import joblib directly)
import joblib

config = json.load(open('config.json', 'r'))

cluster_cache = {}

debug('====== IMPORTING DOC2VEC MODEL ======')
modelPath = config['DOC2VEC']['full']['path']
modelName = config['DOC2VEC']['full']['name']
doc2vec = Doc2Vec.load(modelPath + modelName)

debug('====== CONSTRUCTING DATA POINTS ======')
vocab = list(doc2vec.vocab.keys())
# build the matrix in float64 directly; reassigning X.dtype would reinterpret
# the raw float32 buffer instead of converting the values
X = np.array([doc2vec[w] for w in vocab], dtype=np.float64)

debug('====== RUNNING KMEANS ======')
kmeans = KMeans(n_clusters=1000).fit(X)
joblib.dump(kmeans, 'models/cluster/kmeans.pkl')

debug('====== SAVING RESULTS ======')
for i, w in enumerate(vocab):
    # labels_ is aligned with the rows of X, which follow vocab order
    cluster_cache[w] = int(kmeans.labels_[i])
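A later script could reload the persisted clusterer and look up the cluster of any vocabulary word; a minimal sketch, assuming the model files written above exist and 'word' stands in for a real vocabulary entry:

import joblib
import numpy as np
from gensim.models import Doc2Vec

kmeans = joblib.load('models/cluster/kmeans.pkl')
doc2vec = Doc2Vec.load(modelPath + modelName)
vec = np.asarray(doc2vec['word'], dtype=np.float64).reshape(1, -1)
print(kmeans.predict(vec)[0])  # cluster id assigned to 'word'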
Example #3
def addToCache(id, wl):
    global tagger, tagger_cache
    if tagger_cache.get(id) is not None:
        return
    tags = tagger.tag(wl)
    findUniqueTags(tags)
    tagger_cache[id] = tagsToString(tags)

def POSTag(data):
    for q, cl in data:
        q_w = preprocessor(q[1])
        addToCache(q[0], q_w)
        for c in cl:
            c_w = preprocessor(c[1])
            addToCache(c[0], c_w)

debug('======= TRAIN DATA =======')
dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
data = constructData(dataPath, fileList)
POSTag(data)

debug('======= TEST DATA \'16 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2016']['files']
data = constructData(dataPath, fileList)
POSTag(data)

debug('======= TEST DATA \'17 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2017']['files']
data = constructData(dataPath, fileList)
POSTag(data)
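The fragment above assumes a module-level tagger plus two helpers whose bodies are not shown; a minimal sketch of what they might look like, using an NLTK perceptron tagger as a stand-in for the author's actual tagger:

from nltk.tag import PerceptronTagger

tagger = PerceptronTagger()
tagger_cache = {}
unique_tags = set()

def tagsToString(tags):
    # flatten [(word, tag), ...] into 'word/TAG word/TAG ...'
    return ' '.join('%s/%s' % (w, t) for w, t in tags)

def findUniqueTags(tags):
    # track every distinct POS tag seen so far
    for _, t in tags:
        unique_tags.add(t)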
Example #4
import json

# gensim corpus lib
from gensim import corpora

# LDA topic modeling lib
from gensim.models.ldamodel import LdaModel

# pre-processing utilities
from myutils import preprocessor, constructData, debug

config = json.load(open('config.json', 'r'))

dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
data = constructData(dataPath, fileList)

debug('====== CONSTRUCTING DOCS AND TEXTS ======')
docs = []
for q, cl in data:
    docs.append(q[1])
    for c in cl:
        docs.append(c[1])
texts = [preprocessor(d) for d in docs]

debug('====== CONSTRUCTING DICTIONARY ======')
dictionary = corpora.Dictionary(texts)
dictionary.save('models/lda/semeval.dict')

debug('====== CONSTRUCTING CORPUS ======')
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('models/lda/semeval.mm', corpus)
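The LdaModel import goes unused in this fragment; training and saving the model from the dictionary and corpus built above might look like this (the topic count and output name are assumptions):

lda = LdaModel(corpus, id2word=dictionary, num_topics=100)  # num_topics assumed
lda.save('models/lda/semeval.lda')  # output name assumed
debug(lda.print_topics(5))  # spot-check a few topics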
Example #5
# (repeats the tail of constructMetaData; see Example #1 for the full function)


debug('======= TRAIN DATA =======')
dataPath = config['TRAIN_NN']['path']
fileList = config['TRAIN_NN']['files']
constructMetaData(dataPath, fileList)

debug('======= TEST DATA \'16 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2016']['files']
constructMetaData(dataPath, fileList)

debug('======= TEST DATA \'17 =======')
dataPath = config['TEST_NN']['path']
fileList = config['TEST_NN']['2017']['files']
constructMetaData(dataPath, fileList)

json.dump(meta_cache, open('meta_cache.json', 'w'))
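Downstream scripts can then reload the dump, e.g.:

meta = json.load(open('meta_cache.json', 'r'))
# for a hypothetical comment id cid: meta[cid]['comment#'] is the author's
# running index at that comment, meta[cid]['#comment'] the author's thread total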
Example #6
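# (fragment begins mid-function: most likely the tail of predict(), called below)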
        for i in range(len(scores)):
            scores[i][2] = i + 1
        scores = sorted(scores, key=lambda score: score[1])
        for score in scores:
            out.write('\t'.join([
                q[0], cl[score[1]][0],
                str(score[2]),
                str(score[0]), score[3]
            ]))
            out.write('\n')
    out.close()


if __name__ == '__main__':
    populateParam()
    debug('== IMPORT DOC2VEC MODEL ==')
    doc2vec = loadDoc2Vec('full')
    """ TRAIN MODE """
    debug('======= TRAIN MODE =======')
    dataPath = config['TRAIN_NN']['path']
    fileList = config['TRAIN_NN']['files']
    data = constructData(dataPath, fileList) \
        if not os.path.isfile('out/trainNN.npz') else None
    mlp = trainNN(doc2vec, data)
    """ VALIDATION MODE """
    debug('======= VALIDATION =======')
    dataPath = config['VALIDATION']['path']
    fileList = config['VALIDATION']['files']
    data = constructData(dataPath, fileList)
    output = dataPath + config['VALIDATION']['predictions']
    predict(doc2vec, data, output, mlp)
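Each line of the predictions file is tab-separated (question id, comment id, rank, score, label), which appears to match the SemEval CQA scorer's expected input; a quick sanity check might be:

with open(output) as f:
    for line in list(f)[:3]:  # peek at the first few predictions
        qid, cid, rank, score, label = line.rstrip('\n').split('\t')
        print(qid, cid, rank, score, label)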