Example #1
import datetime
import json
import pprint
import copy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import random
import gensim
import jieba
import jieba.posseg as pseg
from re import compile as _Re
from gensim.models.word2vec import Word2Vec
from gensim.summarization import keywords

from Segmentor import *  # project-local module providing Segmentor, POSTagger, Tokenizer
segmenter = Segmentor()
tagger = POSTagger()
import re

from elasticsearch import Elasticsearch
es = Elasticsearch([{'host': '192.168.2.10', 'port': 9200}])


def content_segmentor(article):
    # Build the segmented text in a separate variable; the original code
    # reset the 'article' parameter to "" and so discarded its own input.
    result = ""
    sentences = Tokenizer.ToSents(article)  # split the article into sentences
    for sent in sentences:
        # Segment the sentence into words
        words = segmenter.segment(sent)
        if words:
            result += ' '.join(words) + ' '
    return result.strip()
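
The es client above is created but never used in this excerpt. Below is a minimal sketch of how the segmented output might be indexed, assuming an articles index, sample input text, and a 7.x-era elasticsearch-py client; none of these names come from the original code.

# Hypothetical usage (not in the original snippet): segment a raw article
# and index the result into the Elasticsearch cluster configured above.
raw_article = "今天天氣很好。我們去公園散步。"  # sample input text (assumption)
doc = {
    'content': content_segmentor(raw_article),        # space-joined tokens
    'indexed_at': datetime.datetime.utcnow().isoformat(),
}
es.index(index='articles', body=doc)  # 7.x-style client call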
Example #2
import numpy as np

VALIDATION_SIZE = 1000  # samples held out for validation
TRAIN_SIZE = 5172       # total number of labelled training samples

# 'data' is assumed to be loaded elsewhere as a mapping with
# 'training_data', 'training_labels', and 'test_data' arrays.
X = data['training_data']
Y = data['training_labels'].T.ravel()
# A full-length draw without replacement is a random permutation of the indices
randomIndex = np.random.choice(TRAIN_SIZE, TRAIN_SIZE, replace=False)

# Split Data
xTrain = X[randomIndex[:-VALIDATION_SIZE]]
yTrain = Y[randomIndex[:-VALIDATION_SIZE]]
xValidate = X[randomIndex[-VALIDATION_SIZE:]]
yValidate = Y[randomIndex[-VALIDATION_SIZE:]]
xTest = data['test_data']
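
A quick sanity check, not in the original snippet, confirms the split sizes line up before training:

# Sketch: verify the train/validation split sizes (assumption, added here)
assert xTrain.shape[0] == TRAIN_SIZE - VALIDATION_SIZE
assert xValidate.shape[0] == VALIDATION_SIZE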

# Here Segmentor is presumably the project's split-point finder for the
# decision tree, unrelated to the text segmenter in Example #1.
segmentor = Segmentor()

print "============= Decision Tree =========="
tree = DTree(Impurity.impurity, segmentor, depth=20)
tree.train(xTrain, yTrain)
labels = tree.predict(xValidate)

counts = np.bincount(tree.predict(xTrain) == yTrain)
error = 1.0 - (counts[True] / float(counts[True] + counts[False]))
print "Training Error: %f" % (error)

counts = np.bincount(labels == yValidate)
error = 1.0 - (counts[True] / float(counts[True] + counts[False]))
print "Validation Error: %f" % (error)

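
A quick way to sanity-check the custom DTree is to fit scikit-learn's DecisionTreeClassifier on the same split. This is a sketch under the assumption that scikit-learn is available; it mirrors the depth setting, and the 0/1 error matches the metric printed above.

# Baseline sanity check (sketch): compare the custom tree against
# scikit-learn's DecisionTreeClassifier on the identical split.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=20)
clf.fit(xTrain, yTrain)
print("sklearn Validation Error: %f" % (1.0 - clf.score(xValidate, yValidate)))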