def test_json2(self):
    """Run the feature pipeline over every document in ``test/test.json``.

    Sets ``self.jsonpath`` and ``self.json`` as a side effect, logs the
    features extracted from each document at DEBUG level, and then raises
    ``Exception`` unconditionally.
    """
    self.jsonpath = "test/test.json"
    # Context manager so the file handle is closed as soon as parsing ends
    # instead of being left for the garbage collector.
    with open(self.jsonpath, "r") as fp:
        self.json = json.load(fp)

    for d in self.json:
        fe = DocumentFeatures(d, parse=False)
        fe.pipeline()
        logging.debug(pformat(fe.features))

    # NOTE(review): this unconditional raise makes the test always fail --
    # presumably so the test runner shows the captured debug log. Confirm
    # before removing.
    raise Exception
def test_json(self):
    """Run the parsing feature pipeline over the first five documents.

    Reads the JSON file at ``self.jsonpath`` (which must already be set on
    the instance), stores the decoded content in ``self.json``, logs the
    features extracted from each of the first five documents at DEBUG
    level, and then raises ``Exception`` unconditionally.
    """
    # Context manager so the file handle is closed as soon as parsing ends
    # instead of being left for the garbage collector.
    with open(self.jsonpath, "r") as fp:
        self.json = json.load(fp)

    fv = []
    for d in self.json[0:5]:
        fe = DocumentFeatures(d, parse=True)
        fe.pipeline()
        logging.debug(pformat(fe.features))
        fv.append(fe.features)

    # vec = DictVectorizer(sparse=True)
    # array_f = vec.fit_transform(fv).toarray()
    # logging.debug(pformat(array_f))

    # NOTE(review): this unconditional raise makes the test always fail --
    # presumably so the test runner shows the captured debug log. Confirm
    # before removing.
    raise Exception
def process_each_cat(cat=None, docs=None):
    """
    Extract features for every document of one category and pair each
    feature vector with that category's fixed score.

    NOTE(review): ``process_each_cat`` is defined a second time later in
    this file; the later definition is the one that takes effect.

    Parameters
    ----------
    cat: integer
        low=0, mid=1, high=2 (index into the score table 20/50/90)
    docs: list
        [[`doc1`],...]; defaults to an empty list

    Returns
    -------
    _f: list of dictionaries
        feature vectors (``fe.features`` of each document)
    _s: list of floats
        scores, one per document: 0.2, 0.5 or 0.9 depending on ``cat``
    """
    # A ``None`` sentinel replaces the shared mutable ``[]`` default.
    if docs is None:
        docs = []

    assignment = [20, 50, 90]
    widgets = [
        'Extracting features... for category {}: done for '.format(cat),
        progressbar.Counter(),
        ' doc(s), (',
        progressbar.Timer(),
        ')',
    ]
    pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(docs)).start()

    features = []
    scores = []
    for done, d in enumerate(docs, 1):
        fe = DocumentFeatures(d, parse=False)
        fe.pipeline()
        features.append(fe.features)
        # Looked up per document so that an empty ``docs`` never indexes
        # the table (keeps the no-document call from raising on cat=None).
        # s = randrange(b-10, b+10)/100.0
        scores.append(assignment[cat] / 100.0)
        pbar.update(done)
    pbar.finish()

    # The parsed documents (``fe.doc``) are deliberately not collected: they
    # were only needed for an ad-hoc JSON dump and would otherwise all stay
    # in memory until the function returns.
    return features, scores
def get_score(doc=None, modelpath=None):
    """
    Score one document with a saved model.

    Parameters
    ----------
    doc: a list of lists
        document as a list (currently assuming this is not yet parsed)
        e.g.:
        [
         ["The", "cat", "sat", "on", "the", "mat", "."],
         ["Colorless", "green", "ideas", "sleep", "furiously", "."]
        ]
        Defaults to an empty document.
    modelpath:
        handed unchanged to ``SklearnClassifier.load_model`` and
        ``SklearnClassifier.load_fmap`` (presumably a filesystem
        location -- confirm against that class).

    Returns
    -------
    score: int
        the model's prediction multiplied by 100 and converted with
        ``int()``; 0 to 100 provided the prediction lies in 0 to 1
    """
    # A ``None`` sentinel replaces the shared mutable ``[]`` default.
    if doc is None:
        doc = []

    fe = DocumentFeatures(doc)
    features = fe.pipeline()

    model = SklearnClassifier()
    model.load_model(modelpath)
    model.load_fmap(modelpath)

    vector = model.transform(features)
    score = model.predict(vector)
    return int(score * 100)
def process_each_cat(cat=None, docs=None):
    """
    Extract features for every document of one category and pair each
    feature vector with that category's fixed score.

    NOTE(review): an earlier definition of ``process_each_cat`` with the
    same logic exists in this file; this later one replaces it when the
    module is loaded. Consider keeping only one.

    Parameters
    ----------
    cat: integer
        low=0, mid=1, high=2 (index into the score table 20/50/90)
    docs: list
        [[`doc1`],...]; defaults to an empty list

    Returns
    -------
    _f: list of dictionaries
        feature vectors (``fe.features`` of each document)
    _s: list of floats
        scores, one per document: 0.2, 0.5 or 0.9 depending on ``cat``
    """
    # A ``None`` sentinel replaces the shared mutable ``[]`` default.
    if docs is None:
        docs = []

    assignment = [20, 50, 90]
    widgets = ['Extracting features... for category {}: done for '.format(cat),
               progressbar.Counter(), ' doc(s), (', progressbar.Timer(), ')']
    pbar = progressbar.ProgressBar(widgets=widgets, maxval=len(docs)).start()

    feature_vectors = []
    category_scores = []
    for position, d in enumerate(docs, 1):
        fe = DocumentFeatures(d, parse=False)
        fe.pipeline()
        feature_vectors.append(fe.features)
        # Looked up per document so that an empty ``docs`` never indexes
        # the table (keeps the no-document call from raising on cat=None).
        # s = randrange(b-10, b+10)/100.0
        category_scores.append(assignment[cat] / 100.0)
        pbar.update(position)
    pbar.finish()

    # The parsed documents (``fe.doc``) are deliberately not collected: they
    # were only needed for an ad-hoc JSON dump and would otherwise all stay
    # in memory until the function returns.
    return feature_vectors, category_scores