def load_jsonlines(stream):
    """Load JSON-lines documents from *stream* into the corpus datastore."""
    n = 0  # an empty stream returns 0 instead of raising UnboundLocalError
    # do batch write
    db = datastore.corpus_db()
    with db.write_batch() as wb:
        for n, line in enumerate(stream, 1):
            doc = preprocess(json.loads(line))
            key = get_key(doc)
            wb[key] = doc
    return n

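# A minimal usage sketch (not part of the original source): load_jsonlines
# accepts any iterable of JSON lines, so a plain file object works. The
# filename and the wrapper function below are hypothetical.
def example_load_jsonlines(path='corpus.jsonl'):
    with open(path) as stream:
        count = load_jsonlines(stream)
    print "{} documents loaded".format(count)
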
def do_view(self, line):
    """View document"""
    if line:
        try:
            limit = int(line)
        except ValueError:
            print "*** Invalid integer"
            return
    else:
        limit = 500
    print get_key(self.current_doc)
    print self.current_doc['headline']
    print self.current_doc['url']
    print
    if len(self.current_doc['body']) > limit:
        print self.current_doc['body'][:limit] + '...'
    else:
        print self.current_doc['body']

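# Context sketch (an assumption; the enclosing class is not shown in the
# source): do_view reads like a command handler on a cmd.Cmd interactive
# shell, where typing "view 200" at the prompt calls do_view("200"). This is
# consistent with the commented-out LabelValidation(...).cmdloop() call in
# main() below. A minimal shell holding the current_doc attribute the handler
# relies on could look like this; the class name and prompt are hypothetical.
import cmd

class DocShell(cmd.Cmd):
    prompt = '(docs) '

    def __init__(self, doc):
        cmd.Cmd.__init__(self)
        self.current_doc = doc  # document dict with headline/url/body keys

    def do_quit(self, line):
        """Exit the shell"""
        return True
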
def load_csv(stream):
    """Load CSV documents from *stream* into the corpus datastore."""
    # read first row as fields
    fields = csv.reader(stream).next()
    if any(f not in fields for f in REQUIRED_FIELDS):
        raise ValueError(
            "Required fields: {}".format(','.join(REQUIRED_FIELDS))
        )
    reader = csv.DictReader(stream, fields)
    n = 0  # an empty file returns 0 instead of raising UnboundLocalError
    # do batch write
    db = datastore.corpus_db()
    with db.write_batch() as wb:
        for n, doc in enumerate(reader, 1):
            # CSV values arrive as bytes; decode them before preprocessing
            doc = preprocess(dict(
                (k, v.decode('utf-8')) for k, v in doc.iteritems()
            ))
            key = get_key(doc)
            wb[key] = doc
    return n

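# Both loaders rely on preprocess() and get_key() helpers defined elsewhere in
# the project; they are not shown in the source. For illustration only, the
# contract they appear to follow is "normalize the document dict, then derive
# a stable datastore key from it". A hypothetical pair might look like the
# following (the URL-hash key scheme is an assumption, not the original
# implementation):
import hashlib

def example_preprocess(doc):
    # strip surrounding whitespace from string fields, pass others through
    return dict((k, v.strip() if isinstance(v, basestring) else v)
                for k, v in doc.iteritems())

def example_get_key(doc):
    # derive a stable key from the document URL
    return hashlib.sha1(doc['url'].encode('utf-8')).hexdigest()
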
def main(args):
    dataset = load_docs(args.train_path)
    if args.train_size:
        if args.train_size <= 1:
            train_size = int(args.train_size * len(dataset.data))
        else:
            train_size = int(args.train_size)
    else:
        train_size = len(dataset.data) / 2
    categories = dataset.target_names
    print "{} categories".format(len(categories))

    if args.best_parameters:
        cv = ShuffleSplit(len(dataset.data), n_iterations=10, test_size=.2)
        for name, model in MODELS:
            print "GridSearchCV", name
            grid = GridSearchCV(model, cv=cv,
                                param_grid=GRID_PARAMETERS[name])
                                #n_jobs=1, score_func=metrics.auc_score,
                                #verbose=4)
            with WriteRuntime("best parameters time: {elapsed:.3f}\n",
                              sys.stdout):
                grid.fit(dataset.data, dataset.target)
            print "Best Scores:"
            print grid.best_score_
            print grid.best_params_
        return

    train_data, test_data = split_list(dataset.data, train_size)
    train_target, test_target = split_list(dataset.target, train_size)
    print "{} documents (training set)".format(len(train_data))

    if args.classify_keys:
        # override test data from given keys
        data = load_keys(args.classify_keys, args.classify_skip,
                         args.classify_limit)
        print "{} documents (classify set)".format(len(data))
        if not data:
            return
        # report results for each model per document
        print 'Models: ' + repr([name for name, _ in MODELS])
        results = []
        for name, model in MODELS:
            # train
            model.fit(train_data, train_target)
            # classify
            pred = model.predict(data)
            results.append(pred)
        labels = []
        for i, doc in enumerate(data):
            _by_model = []
            for j, _ in enumerate(MODELS):
                _by_model.append(categories[results[j][i]])
            labels.append(_by_model)
        data_labels = zip(data, labels)
        if args.report_short:
            for doc, doc_labels in data_labels:
                print '{}\t{!r}'.format(get_key(doc), doc_labels)
        else:
            #LabelValidation(args.train_path, data_labels).cmdloop()
            pass
    else:
        print "{} documents (testing set)".format(len(test_data))
        results = []
        params = (categories, train_data, train_target,
                  test_data, test_target)
        for name, model in MODELS:
            print(80 * '=')
            print name
            model.set_params(**PARAMETERS[name])
            results.append(benchmark(model, *params))

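# main() iterates over MODELS, PARAMETERS, and GRID_PARAMETERS, which are not
# shown in the source. Since the estimators are fit directly on raw documents
# (dataset.data), each model is presumably a scikit-learn Pipeline that
# vectorizes text before classifying. A hedged sketch of what those module-
# level structures could look like; the estimator choices and parameter values
# are assumptions, not the original configuration.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

MODELS = [
    ('nb', Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB())])),
    ('svc', Pipeline([('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])),
]

# fixed parameters applied via model.set_params(**PARAMETERS[name])
PARAMETERS = {
    'nb': {'clf__alpha': 0.01},
    'svc': {'clf__C': 1.0},
}

# search space used by GridSearchCV when --best-parameters is given
GRID_PARAMETERS = {
    'nb': {'clf__alpha': (0.01, 0.1, 1.0)},
    'svc': {'clf__C': (0.1, 1.0, 10.0)},
}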