Example #1
    def predict_batch(self, batch, top_n=1):
        """Predict a batch of document - question pairs."""
        documents, questions, candidates = [], [], []
        for b in batch:
            documents.append(b[0])
            questions.append(b[1])
            candidates.append(b[2] if len(b) == 3 else None)
        # Only keep candidates if at least one pair supplied them.
        candidates = candidates if any(candidates) else None

        # Tokenize the inputs, possibly with multiprocessing workers.
        if self.workers:
            q_tokens = self.workers.map_async(tokenize, questions)
            c_tokens = self.workers.map_async(tokenize, documents)
            q_tokens = list(q_tokens.get())
            c_tokens = list(c_tokens.get())
        else:
            q_tokens = list(map(self.tokenizer.tokenize, questions))
            c_tokens = list(map(self.tokenizer.tokenize, documents))

        # Annotate each pair with word, character, lemma, POS, and NER features.
        examples = []
        for i in range(len(questions)):
            examples.append({
                'id': i,
                'question': q_tokens[i].words(),
                'question_char': q_tokens[i].chars(),
                'qlemma': q_tokens[i].lemmas(),
                'qpos': q_tokens[i].pos(),
                'qner': q_tokens[i].entities(),
                'document': c_tokens[i].words(),
                'document_char': c_tokens[i].chars(),
                'clemma': c_tokens[i].lemmas(),
                'cpos': c_tokens[i].pos(),
                'cner': c_tokens[i].entities(),
            })

        # Stick document tokens in candidates for decoding
        if candidates:
            candidates = [{
                'input': c_tokens[i],
                'cands': candidates[i]
            } for i in range(len(candidates))]

        # Build the batch and run it through the model
        batch_exs = batchify([vectorize(e, self.model) for e in examples])
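        # s and e hold per-example start/end token indices; score holds the
        # corresponding span scores.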
        s, e, score = self.model.predict(batch_exs, candidates, top_n)

        # Retrieve the predicted spans
        results = []
        for i in range(len(s)):
            predictions = []
            for j in range(len(s[i])):
                span = c_tokens[i].slice(s[i][j], e[i][j] + 1).untokenize()
                predictions.append((span, score[i][j]))
            results.append(predictions)
        return results
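
For context, here is a minimal sketch of how predict_batch might be invoked; the Predictor class and its constructor arguments are illustrative assumptions, not taken from the code above.

# Hypothetical usage sketch: Predictor and its arguments are assumptions.
predictor = Predictor(model_file='reader.mdl', num_workers=2)

# Each entry is a (document, question) pair; candidates may be added as a
# third element.
batch = [
    ('Paris is the capital of France.', 'What is the capital of France?'),
    ('The fox jumps over the lazy dog.', 'What does the fox jump over?'),
]

# predict_batch returns one list of (span, score) tuples per input pair.
for predictions in predictor.predict_batch(batch, top_n=2):
    for span, score in predictions:
        print(span, score)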
Example #2
    def __getitem__(self, index):
        # Vectorize the example on access so raw data stays lightweight.
        return vectorize(self.examples[index], self.model)
Example #3
    def __getitem__(self, index):
        # Same pattern, additionally passing the single_answer flag through.
        return vectorize(self.examples[index], self.model, self.single_answer)
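
Both __getitem__ variants follow the same pattern: a PyTorch-style Dataset that vectorizes each example lazily on access. Below is a minimal sketch of the surrounding class, assuming a DrQA-style vectorize helper is in scope; the class name and constructor are illustrative.

# Minimal sketch; the class name and constructor are illustrative assumptions.
from torch.utils.data import Dataset

class ReaderDataset(Dataset):
    def __init__(self, examples, model, single_answer=False):
        self.examples = examples
        self.model = model
        self.single_answer = single_answer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        # Defer vectorization to access time, as in the snippets above.
        return vectorize(self.examples[index], self.model, self.single_answer)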
Example #4
import pandas as pd
import vector as v
import preprocessing as p
import cluster2 as c
import classifier as r
# Load raw tweets, then run the clean -> vectorize -> cluster -> classify pipeline.
a = pd.read_csv("Z:/TermPaper/twitter_cred-master/data.csv")
print("cleaning....")
doc, id1 = p.clean(a)
print("vectorizing....")
dvec, global_vector = v.vectorize(doc)
print("clustering....")
g, t = c.cluster(dvec, global_vector, id1)
print(len(t))
print("credibility calculating")
r.classifier(g)

# ---- Sentiment classifier training script ----
from vector import get_word_features, vectorize, get_words, naive_bayes_vector
import pickle
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from maxent import Maxent
import numpy
from naive_bayes import NaiveBayesClassifier
from svm_classifier import SVM

# get_data is assumed to come from a local helper module; it is not
# imported above in the original script.
data = get_data('shortdatabase.csv')
word_features = get_word_features(data['tweet'])
word_features = sorted(word_features)
word_vector = vectorize(word_features, data['tweet'], data['sentiment'])

# Each entry in word_vector pairs a feature row with its sentiment label.
vector = []
labels = []
for example in word_vector:
    vector.append(example[0])
    labels.append(example[1])
print("Stage 1: Word Polarity")
print("training naive Bayes classifier")

words = get_words("features.txt")
bayes_vector = naive_bayes_vector(words, data['tweet'], data['sentiment'])
# print(bayes_vector)
NaiveBayesClassifier.train(bayes_vector)

# gnb = BernoulliNB()
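
The commented-out line above hints at a scikit-learn baseline. As a sketch, the vector and labels lists built earlier could feed a BernoulliNB model; the train/test split below is an illustrative assumption, not part of the original script.

# Illustrative continuation: assumes `vector` holds binary feature rows and
# `labels` the sentiment targets collected in the loop above.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    numpy.array(vector), numpy.array(labels), test_size=0.2, random_state=0)

gnb = BernoulliNB()
gnb.fit(X_train, y_train)
print("BernoulliNB accuracy:", gnb.score(X_test, y_test))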