def run(args):
    """Vectorize input sentences and persist train/test splits.

    Reads JSON-lines records from every file in ``args.input``, vectorizes
    them, splits into train/test sets using the ``train`` section of CONFIG,
    and pickles both partitions (train first, then test) to ``args.output``.
    """
    records = chain.from_iterable(read_json_lines(path) for path in args.input)
    features, labels = zip(*list(vectorize_sentences(records)))
    train_cfg = CONFIG['train']
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=train_cfg['test_size'],
        random_state=train_cfg['random_state'])
    # Train partition is written first, test second -- consumers must
    # unpickle in the same order.
    pickle.dump((X_train, y_train), args.output)
    pickle.dump((X_test, y_test), args.output)
def run(args):
    """Vectorize input sentences with an enumerator and pickle 80/20 splits.

    Reads JSON-lines records from every file in ``args.input``, vectorizes
    them through a fresh enumerator, splits 80/20 with a fixed seed, and
    pickles the train then test partitions to ``args.output``.
    """
    enum = enumerator()
    records = chain.from_iterable(read_json_lines(path) for path in args.input)
    features, labels = zip(*list(vectorize_sentences(enum, records)))
    # Fixed split ratio and seed keep runs reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=0)
    pickle.dump((X_train, y_train), args.output)
    pickle.dump((X_test, y_test), args.output)
def do_reducer(args):
    """Collect NDJSON records into a DataFrame, write a CSV summary, and plot."""
    import pandas as pd

    frame = pd.DataFrame.from_dict(ndjson2col(read_json_lines(args.input)))
    summary_path = os.path.join(args.output, "summary.csv")
    logging.info("Writing brief summary to %s", summary_path)
    frame.to_csv(summary_path)
    create_plots(args, frame)
def sample_by_y(args):
    """Load labeled sentences, optionally downsampling per label value.

    Reads JSON-lines records from ``args.sentences``; when the ``train``
    config declares ``sample_labeled`` counts, caps how many records are
    kept per distinct "Y" value via reservoir sampling.

    Returns:
        Tuple ``(sentences, y_labels)`` where ``y_labels`` is a float
        numpy array.
    """
    records = chain.from_iterable(read_json_lines(path)
                                  for path in args.sentences)
    train_cfg = CONFIG['train']
    per_label_caps = train_cfg.get('sample_labeled')
    if per_label_caps:
        # Bound the number of records retained for each label value.
        records = reservoir_dict(records, "Y", per_label_caps,
                                 random_state=train_cfg['random_state'])
    pairs = [(record['X'], record['Y']) for record in records]
    sentences, raw_labels = zip(*pairs)
    return sentences, np.array(raw_labels, dtype=float)
def do_reducer(args):
    """Summarize NDJSON metrics: subset the columns, write CSV, and plot."""
    import pandas as pd

    frame = pd.DataFrame.from_dict(ndjson2col(read_json_lines(args.input)))
    # Restrict to the grouping/axis/trial columns plus requested metrics.
    wanted_columns = [args.group_by, args.x_axis, args.trial] + args.metrics
    subset = get_df_subset(frame, wanted_columns)
    summary_path = os.path.join(args.output, "summary.csv")
    logging.info("Writing brief summary to %s", summary_path)
    subset.to_csv(summary_path)
    create_plots(args, subset, args.metrics)
def run(args):
    """Vectorize enumerated input sentences and pickle an 80/20 split.

    All JSON-lines files named in ``args.input`` are chained together,
    vectorized with a new enumerator, split deterministically, and the
    (train, test) partitions are pickled in that order to ``args.output``.
    """
    enum = enumerator()
    source = chain.from_iterable(read_json_lines(name) for name in args.input)
    pairs = list(vectorize_sentences(enum, source))
    X, y = zip(*pairs)
    # Deterministic split: fixed 20% holdout, fixed random seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    pickle.dump((X_train, y_train), args.output)
    pickle.dump((X_test, y_test), args.output)
def get_data(args):
    """Assemble the feature matrix and sentiment labels for training.

    Depending on ``CONFIG['train']['features']``, features are bag-of-words,
    word2vec/doc2vec embedding vectors, or a mix of both.

    Returns:
        Tuple ``(is_mixed, (X, y))`` -- ``is_mixed`` is True only when both
        bow and embedding features are combined.

    Raises:
        RuntimeError: if an embedding is required but ``--embedding`` was not
            supplied, or the configuration names an unknown pretraining
            algorithm or feature set.
    """
    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['word2vec', 'doc2vec']) \
            and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get Y labels
    training_set = read_tsv(args.train)
    y_labels = training_set["sentiment"]

    sentences = [obj['review'] for obj in read_json_lines(args.sentences)]
    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- have a sparse matrix here
        return False, (get_bow_features(sentences), y_labels)

    # load embedding
    algorithm = CONFIG['pretrain']['algorithm']
    if algorithm == 'word2vec':
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif algorithm == 'glove':
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})
    else:
        # Previously fell through with `embedding` unbound, producing a
        # confusing NameError below; fail fast with a clear message instead.
        raise RuntimeError("Invalid config setting pretrain:algorithm=%s"
                           % algorithm)

    # get feature vectors
    if 'doc2vec' in feature_set_names:
        embedding_vectors = get_doc2vec_features(sentences, embedding)
    elif 'word2vec' in feature_set_names:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s"
                           % feature_set_names)

    if 'bow' in feature_set_names:
        return True, get_mixed_features(sentences, embedding_vectors, y_labels)
    else:
        # matrix is dense -- drop NaNs
        return False, drop_nans(embedding_vectors, y_labels)
def get_data(args):
    """Build (features, labels) for training from reviews and config.

    Selects bag-of-words, embedding-based (word2vec/doc2vec), or mixed
    features according to ``CONFIG['train']['features']``.

    Returns:
        Tuple ``(is_mixed, (X, y))``; ``is_mixed`` is True only for the
        combined bow + embedding case.

    Raises:
        RuntimeError: when ``--embedding`` is missing but required, or the
            config specifies an unrecognized algorithm or feature set.
    """
    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['word2vec', 'doc2vec']) \
            and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get Y labels
    training_set = read_tsv(args.train)
    y_labels = training_set["sentiment"]

    sentences = [obj['review'] for obj in read_json_lines(args.sentences)]
    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- have a sparse matrix here
        return False, (get_bow_features(sentences), y_labels)

    # load embedding
    algorithm = CONFIG['pretrain']['algorithm']
    if algorithm == 'word2vec':
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif algorithm == 'glove':
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})
    else:
        # Bug fix: an unknown algorithm previously left `embedding` unbound,
        # surfacing later as a NameError; raise a clear error here instead.
        raise RuntimeError("Invalid config setting pretrain:algorithm=%s"
                           % algorithm)

    # get feature vectors
    if 'doc2vec' in feature_set_names:
        embedding_vectors = get_doc2vec_features(sentences, embedding)
    elif 'word2vec' in feature_set_names:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s"
                           % feature_set_names)

    if 'bow' in feature_set_names:
        return True, get_mixed_features(sentences, embedding_vectors, y_labels)
    else:
        # matrix is dense -- drop NaNs
        return False, drop_nans(embedding_vectors, y_labels)
def json_field_iter(files, field=None):
    """Iterate over JSON-lines documents from *files*.

    Yields each whole document when *field* is None; otherwise yields the
    value stored under *field* in each document.
    """
    if field is None:
        for fname in files:
            yield from read_json_lines(fname)
    else:
        for fname in files:
            for doc in read_json_lines(fname):
                yield doc[field]
def doc_iter(args):
    """Yield the ``args.field`` value of every JSON-lines record in ``args.input``."""
    key = args.field
    for path in args.input:
        yield from (record[key] for record in read_json_lines(path))