content = '\n'.join(content) setx.append(nf.featurize(cid, name=name, title=title, content=content, link=link, name_update=False)) sety.append(0) except Exception, e: print 'nid', nid, e print 'failed', nf.failed print 'size', len(setx), len(sety) setx = vec.fit_transform(setx).toarray() clf = LogisticRegression() print 'cross', cross_validation.cross_val_score(clf, setx, sety, cv=5) clf.fit(setx, sety) print clf._get_param_names() print clf.decision_function(setx) joblib.dump(clf, os.path.join(os.path.split(os.path.realpath(__file__))[0], 'dumps/news.score.0824.lrmodel')) joblib.dump(vec, os.path.join(os.path.split(os.path.realpath(__file__))[0], 'dumps/news.0824.featurizer')) def test(): nf = NewsFeatures() db = dbcon.connect_torndb() mongo = dbcon.connect_mongo() vec = DictVectorizer() random_source = [item[0] for item in nf.source.items() if item[1] <= 20] setx, sety = [], [] # print 'init'