Example #1
File: load.py  Project: rmax/yatiri
def load_jsonlines(stream):
    # write one document per JSON line into the corpus in a single batch
    db = datastore.corpus_db()
    n = 0  # guard: an empty stream would otherwise leave n undefined
    with db.write_batch() as wb:
        for n, line in enumerate(stream, 1):
            doc = preprocess(json.loads(line))
            key = get_key(doc)
            wb[key] = doc
        return n
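A minimal usage sketch (the file name is hypothetical, and this assumes a yatiri environment where datastore, preprocess and get_key are importable):

# hypothetical caller: stream one JSON document per line into the corpus
with open('docs.jsonl') as fp:
    count = load_jsonlines(fp)
print "loaded {} documents".format(count)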
Example #2
    def do_view(self, line):
        """View the current document, truncating the body to an optional limit."""
        if line:
            try:
                limit = int(line)
            except ValueError:
                print "*** Invalid integer"
                return
        else:
            limit = 500  # default body preview length

        # show the key and metadata first
        print get_key(self.current_doc)
        print self.current_doc['headline']
        print self.current_doc['url']
        print

        # truncate long bodies to the requested limit
        if len(self.current_doc['body']) > limit:
            print self.current_doc['body'][:limit] + '...'
        else:
            print self.current_doc['body']
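The do_view signature follows the convention of Python's cmd.Cmd framework, where typing "view 200" at the prompt dispatches to do_view('200'). A minimal hosting shell, sketched under that assumption (the class name and sample document are hypothetical):

import cmd

class DocShell(cmd.Cmd):
    """Hypothetical shell hosting the do_view handler shown above."""
    prompt = '(yatiri) '

    def __init__(self, doc):
        cmd.Cmd.__init__(self)
        self.current_doc = doc

    # do_view as defined above would live here

# DocShell({'headline': 'h', 'url': 'u', 'body': 'x' * 1000}).cmdloop()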
Example #3
File: load.py  Project: rmax/yatiri
def load_csv(stream):
    # read the first row as the field names
    fields = csv.reader(stream).next()
    if any(f not in fields for f in REQUIRED_FIELDS):
        raise ValueError(
            "Required fields: {}".format(','.join(REQUIRED_FIELDS))
        )
    reader = csv.DictReader(stream, fields)
    # write all rows into the corpus in a single batch
    db = datastore.corpus_db()
    n = 0  # guard: an empty file would otherwise leave n undefined
    with db.write_batch() as wb:
        for n, doc in enumerate(reader, 1):
            # decode every value to unicode before preprocessing
            doc = preprocess(dict(
                (k, v.decode('utf-8')) for k, v in doc.iteritems()
            ))
            key = get_key(doc)
            wb[key] = doc
        return n
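A matching usage sketch (the CSV path is hypothetical; REQUIRED_FIELDS is the module-level list checked above). Python 2's csv module expects files opened in binary mode:

# hypothetical caller: bulk-load a UTF-8 encoded CSV into the corpus
with open('docs.csv', 'rb') as fp:
    count = load_csv(fp)
print "loaded {} documents".format(count)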
Example #4
def main(args):
    dataset = load_docs(args.train_path)

    if args.train_size:
        if args.train_size <= 1:
            # fractional value: treat it as a proportion of the dataset
            train_size = int(args.train_size * len(dataset.data))
        else:
            train_size = int(args.train_size)
    else:
        # default: half of the dataset (Python 2 integer division)
        train_size = len(dataset.data) / 2

    categories = dataset.target_names

    print "{} categories".format(len(categories))

    if args.best_parameters:
        cv = ShuffleSplit(len(dataset.data), n_iterations=10, test_size=.2)
        for name, model in MODELS:
            print "GridSearchCV", name
            grid = GridSearchCV(model, cv=cv, param_grid=GRID_PARAMETERS[name])
            # n_jobs=1, score_func=metrics.auc_score,
            # verbose=4)
            with WriteRuntime("best parameters time: {elapsed:.3f}\n",
                              sys.stdout):
                grid.fit(dataset.data, dataset.target)

            print "Best Scores:"
            print grid.best_score_
            print grid.best_params_
            return

    train_data, test_data = split_list(dataset.data, train_size)
    train_target, test_target = split_list(dataset.target, train_size)
    print "{} documents (training set)".format(len(train_data))

    if args.classify_keys:
        # override test data from given keys
        data = load_keys(args.classify_keys, args.classify_skip,
                         args.classify_limit)
        print "{} documents (classify set)".format(len(data))
        if not data:
            return
        # report results for each model per document
        print 'Models: ' + repr([name for name, _ in MODELS])

        results = []
        for name, model in MODELS:
            # train
            model.fit(train_data, train_target)
            # classify
            pred = model.predict(data)
            results.append(pred)

        labels = []
        for i, doc in enumerate(data):
            _by_model = []
            for j, _ in enumerate(MODELS):
                _by_model.append(categories[results[j][i]])
            labels.append(_by_model)

        data_labels = zip(data, labels)
        if args.report_short:
            for doc, doc_labels in data_labels:
                print '{}\t{!r}'.format(get_key(doc), doc_labels)
        else:
            # interactive label validation, currently disabled:
            #LabelValidation(args.train_path, data_labels).cmdloop()
            pass

    else:
        print "{} documents (testing set)".format(len(test_data))
        results = []
        params = (categories, train_data, train_target, test_data, test_target)
        for name, model in MODELS:
            print(80 * '=')
            print name
            model.set_params(**PARAMETERS[name])
            results.append(benchmark(model, *params))
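split_list is not shown in this example; given how it is called with train_size above, a plausible implementation (an assumption, not necessarily the project's actual helper) is:

def split_list(items, size):
    # assumed behavior: first `size` items for training, the rest for testing
    return items[:size], items[size:]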