Exemplo n.º 1
0
                    content = '\n'.join(content)
                    setx.append(nf.featurize(cid, name=name, title=title, content=content, link=link, name_update=False))
                    sety.append(0)
                except Exception, e:
                    print 'nid', nid, e

    print 'failed', nf.failed
    print 'size', len(setx), len(sety)

    setx = vec.fit_transform(setx).toarray()
    clf = LogisticRegression()

    print 'cross', cross_validation.cross_val_score(clf, setx, sety, cv=5)

    clf.fit(setx, sety)
    print clf._get_param_names()
    print clf.decision_function(setx)
    joblib.dump(clf, os.path.join(os.path.split(os.path.realpath(__file__))[0], 'dumps/news.score.0824.lrmodel'))
    joblib.dump(vec, os.path.join(os.path.split(os.path.realpath(__file__))[0], 'dumps/news.0824.featurizer'))


def test():

    nf = NewsFeatures()
    db = dbcon.connect_torndb()
    mongo = dbcon.connect_mongo()
    vec = DictVectorizer()
    random_source = [item[0] for item in nf.source.items() if item[1] <= 20]
    setx, sety = [], []

    # print 'init'