def analyse(factory):
    """Fit a predictor on the corpus and report which words drive each class.

    Parameters
    ----------
    factory : callable
        Zero-argument callable returning an unfitted predictor.  The
        predictor must provide ``fit(data)``, a ``vocabulary`` attribute and
        a ``classifier`` whose ``analyse(X, vocabulary)`` returns the tuple
        ``(v1, v2, score, words)``.

    Returns
    -------
    None
        The results are passed to ``visualise`` (once for ``v1``, once for
        ``v2``) and the per-sentiment word scores are printed.
    """
    # The corpus is traversed three times below (fit, transform and rating
    # extraction).  Materialising it keeps the later passes from seeing an
    # exhausted iterator; it is harmless if iter_corpus already returns a list.
    data = list(iter_corpus())

    predictor = factory()
    predictor.fit(data)
    vocabulary = predictor.vocabulary

    texts = ExtractText().transform(data)
    encoder = EncodingText(vocabulary)
    encoder.fit(texts)
    X = encoder.transform(texts)
    y = [datapoint.rating for datapoint in data]

    v1, v2, score, words = predictor.classifier.analyse(X, vocabulary)
    labels = ["neg", "pos", "mix", "other"]

    # counter[rating - 1, w] counts how often vocabulary index w occurs in
    # phrases carrying that rating.
    n_classes = len(set(y))
    counter = np.zeros((n_classes, len(vocabulary)))
    for i in range(np.size(X, 0)):
        class_row = counter[y[i] - 1]
        for w in X[i]:
            class_row[w] += 1.0

    # Exactly one class index per vocabulary entry: the class in which the
    # word occurs most often.  argmax keeps the first maximum only, so a tie
    # can no longer add extra entries and shift cl out of step with the
    # vocabulary.
    cl = counter.argmax(axis=0).tolist()

    visualise(v1, vocabulary, cl)
    visualise(v2, vocabulary, cl)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    for i, class_scores in enumerate(score):
        print("sentiment - " + str(labels[i]))
        for j, value in enumerate(class_scores):
            print(words[i][j] + " : " + str(value))
def analyse(factory):
    """Fit a predictor on the corpus and report which words drive each class.

    Parameters
    ----------
    factory : callable
        Zero-argument callable returning an unfitted predictor.  The
        predictor must provide ``fit(data)``, a ``vocabulary`` attribute and
        a ``classifier`` whose ``analyse(X, vocabulary)`` returns the tuple
        ``(v1, v2, score, words)``.

    Returns
    -------
    None
        The results are passed to ``visualise`` (once for ``v1``, once for
        ``v2``) and the per-sentiment word scores are printed.
    """
    # The corpus is read by fit, by transform and by the rating extraction.
    # A list survives all three passes, whereas a one-shot iterator would be
    # empty after the first; wrapping an existing list is harmless.
    data = list(iter_corpus())

    predictor = factory()
    predictor.fit(data)
    vocabulary = predictor.vocabulary

    extractor = ExtractText(True)
    texts = extractor.transform(data)
    encoder = EncodingText(vocabulary)
    encoder.fit(texts)
    X = encoder.transform(texts)
    y = [datapoint.rating for datapoint in data]

    v1, v2, score, words = predictor.classifier.analyse(X, vocabulary)
    labels = ['neg', 'pos', 'mix', 'other']

    # Row index is (rating - 1); column index is the vocabulary index.
    n_classes = len(set(y))
    counter = np.zeros((n_classes, len(vocabulary)))
    for i in range(np.size(X, 0)):
        class_row = counter[y[i] - 1]
        for w in X[i]:
            class_row[w] += 1.0

    # One entry per vocabulary word: the first class with the highest count.
    # Appending every tied class (as a per-class equality scan would) makes
    # cl longer than the vocabulary and misaligns it.
    cl = counter.argmax(axis=0).tolist()

    visualise(v1, vocabulary, cl)
    visualise(v2, vocabulary, cl)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    for i, class_scores in enumerate(score):
        print('sentiment - ' + str(labels[i]))
        for j, value in enumerate(class_scores):
            print(words[i][j] + ' : ' + str(value))
value = float(value) except ValueError: pass new[key] = value return new if __name__ == "__main__": import argparse import json import csv import sys from corpus import iter_corpus, iter_test_corpus from predictor import PhraseSentimentPredictor parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("filename") config = parser.parse_args() config = json.load(open(config.filename)) start=time.time() predictor = PhraseSentimentPredictor(**config) predictor.fit(list(iter_corpus())) print "fitting takes "+str(time.time()-start) test = list(iter_test_corpus()) #prediction = predictor.predict(test) score = predictor.score(test,'test') print("test score {}%".format(score * 100)) print 'programme finished!'
"""
Created on Mon Sep 7 15:36:55 2015

@author: VAIO
"""
from collections import defaultdict
from sklearn.pipeline import make_pipeline, make_union
from corpus import iter_corpus, iter_test_corpus
from transformations import (ExtractText, ExtractAuthor, ExtractDate,
                             EncodingText)
import csv
from settings import DATA_PATH


def target(phrases):
    """Return the ``rating`` attribute of each datapoint in *phrases*, in order."""
    ratings = []
    for datapoint in phrases:
        ratings.append(datapoint.rating)
    return ratings


# The corpus is materialised once and shared by the steps below.
phrases = list(iter_corpus())

# The vocabulary file is parsed as CSV; only the first column of each row
# is kept.
with open(DATA_PATH + '/vocabulary', encoding='utf-8') as f:
    rd = csv.reader(f)
    vocabulary = [line[0] for line in rd]

# Text extraction followed by encoding against the loaded vocabulary.
pipeline1 = [ExtractText(), EncodingText(vocabulary)]
pipeline = make_pipeline(*pipeline1)

y = target(phrases)
self.last = new
if __name__ == "__main__":
    # Script entry point.  The part visible here creates the vocabulary file
    # when it is missing and starts building the command-line parser.
    import argparse
    import json
    from evaluation import analyse
    from predictor import PhraseSentimentPredictor
    # get vocabulary
    from corpus import iter_corpus
    import csv,os
    from transformations import ExtractText
    # Only build the vocabulary when the file does not exist yet.
    if not os.path.exists('./data/vocabulary'):
        datapoints=list(iter_corpus())
        vocabulary=set()
        et=ExtractText()
        X=et.transform(datapoints)
        # Collect every whitespace-separated token of every extracted text;
        # the set removes duplicates.
        for datap in X:
            for w in datap.split():
                vocabulary.add(w)
        # Sort so the file content is in a stable order.
        vocabulary=list(vocabulary)
        vocabulary.sort()
        # One word per CSV row.
        # NOTE(review): binary mode matches the Python 2 csv module; on
        # Python 3 csv.writer needs a text-mode file opened with newline=''
        # -- confirm the target interpreter.
        with open('./data/vocabulary','wb') as f:
            wr=csv.writer(f)
            for voc in vocabulary:
                wr.writerow([voc])
    # Single positional argument "filename"; its use is not visible here.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")
self.last = new
if __name__ == "__main__":
    # Script entry point.  The part visible here creates the vocabulary file
    # when it is missing and starts building the command-line parser.
    import argparse
    import json
    from evaluation import analyse
    from predictor import PhraseSentimentPredictor
    # get vocabulary
    from corpus import iter_corpus
    import csv, os
    from transformations import ExtractText
    # Only build the vocabulary when the file does not exist yet.
    if not os.path.exists('./data/vocabulary'):
        datapoints = list(iter_corpus())
        vocabulary = set()
        et = ExtractText()
        X = et.transform(datapoints)
        # Collect every whitespace-separated token of every extracted text,
        # lower-cased so that case variants collapse into one entry.
        for datap in X:
            for w in datap.split():
                vocabulary.add(w.lower())
        # Sort so the file content is in a stable order.
        vocabulary = list(vocabulary)
        vocabulary.sort()
        # One word per CSV row.
        # NOTE(review): binary mode matches the Python 2 csv module; on
        # Python 3 csv.writer needs a text-mode file opened with newline=''
        # -- confirm the target interpreter.
        with open('./data/vocabulary', 'wb') as f:
            wr = csv.writer(f)
            for voc in vocabulary:
                wr.writerow([voc])
    # Single positional argument "filename"; its use is not visible here.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("filename")