Exemplo n.º 1
0
def run(args):
    """Vectorize the input records, split them, and pickle both partitions.

    Every element of ``args.input`` is handed to ``read_json_lines``; the
    resulting streams are chained together and passed, with a fresh
    ``enumerator()``, to ``vectorize_sentences``. The vectorized items are
    expected to be ``(X, y)`` pairs.

    The training partition and the test partition are written to
    ``args.output`` as two consecutive pickles, so a reader has to call
    ``pickle.load`` twice on the same stream to get both back.

    NOTE(review): ``train_test_split`` is presumably scikit-learn's; with
    ``test_size=0.2`` and ``random_state=0`` that would be a reproducible
    80/20 split -- confirm against the file's imports.
    """
    vocab = enumerator()
    records = chain(*map(read_json_lines, args.input))
    samples = list(vectorize_sentences(vocab, records))

    # Transpose the list of (features, label) pairs into two parallel tuples.
    X, y = zip(*samples)

    split = train_test_split(X, y, test_size=0.2, random_state=0)
    X_train, X_test, y_train, y_test = split

    # Train part first, test part second: the order readers must rely on.
    for part in ((X_train, y_train), (X_test, y_test)):
        pickle.dump(part, args.output)
Exemplo n.º 2
0
 def fit(self, X, y=None):
     """Look up every label of every row of ``X`` in the vocabulary.

     An existing, non-empty ``self.vocabulary_`` is reused (and mutated in
     place); otherwise a new ``enumerator()`` is created. ``y`` is ignored.

     Afterwards ``self.vocabulary_`` holds the mapping and
     ``self.feature_names_`` holds its ``keys()``.

     Returns ``self`` so that calls can be chained.
     """
     vocab = self.vocabulary_
     if not vocab:
         vocab = enumerator()

     for sample in X:
         for label in sample:
             # The lookup result is discarded; it is done for its side
             # effect only. Presumably the mapping assigns an index to a
             # label on first access -- confirm against ``enumerator``.
             vocab[label]

     self.vocabulary_ = vocab
     self.feature_names_ = vocab.keys()
     return self
Exemplo n.º 3
0
 def fit(self, X, y=None):
     """Populate the vocabulary from the labels contained in ``X``.

     ``X`` is iterated row by row and each label in a row is looked up in
     the vocabulary mapping. The mapping is ``self.vocabulary_`` when that
     attribute is truthy, else a fresh ``enumerator()``. ``y`` is unused.

     Sets ``self.vocabulary_`` to the mapping and ``self.feature_names_``
     to the mapping's ``keys()``, then returns ``self``.
     """
     mapping = self.vocabulary_
     if not mapping:
         mapping = enumerator()

     for labels in X:
         for name in labels:
             # Accessed purely for the side effect of the lookup; the
             # mapping presumably registers unseen keys -- TODO confirm.
             mapping[name]

     self.vocabulary_ = mapping
     self.feature_names_ = mapping.keys()
     return self
Exemplo n.º 4
0
def run(args):
    """Build a train/test dataset from JSON-lines inputs and pickle it.

    Steps:
      1. Call ``read_json_lines`` on each entry of ``args.input`` and chain
         the results into one stream.
      2. Pass that stream, together with a new ``enumerator()``, through
         ``vectorize_sentences`` and materialize the result, which is
         expected to be a sequence of ``(X, y)`` pairs.
      3. Split with ``train_test_split(..., test_size=0.2, random_state=0)``.
      4. Dump ``(X_train, y_train)`` and then ``(X_test, y_test)`` to
         ``args.output`` as two separate, consecutive pickles.

    NOTE(review): ``train_test_split`` looks like scikit-learn's helper --
    confirm against the imports of this file.
    """
    vocab = enumerator()
    sources = [read_json_lines(path) for path in args.input]
    vectorized = list(vectorize_sentences(vocab, chain(*sources)))

    # Turn [(x0, y0), (x1, y1), ...] into (x0, x1, ...) and (y0, y1, ...).
    X, y = zip(*vectorized)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    train_part = (X_train, y_train)
    test_part = (X_test, y_test)

    # Two dumps into the same stream: load twice to read both back.
    pickle.dump(train_part, args.output)
    pickle.dump(test_part, args.output)
Exemplo n.º 5
0
def vectorize_sentences(input_iter):
    """Lazily convert each record of ``input_iter`` into ``(ids, target)``.

    Each record must support ``record['X']`` (an iterable of iterables of
    tokens) and ``record['Y']``. All tokens under ``'X'`` are flattened in
    order and each one is looked up in a single ``enumerator()`` that is
    shared by every record of this call; ``record['Y']`` is passed through
    unchanged.

    NOTE(review): presumably ``enumerator()`` maps each distinct token to
    an integer id on first lookup -- confirm against its definition.

    Yields:
        A tuple ``(list_of_lookup_results, record['Y'])`` per record.
    """
    vocab = enumerator()
    for record in input_iter:
        token_ids = [
            vocab[token]
            for sentence in record['X']
            for token in sentence
        ]
        yield (token_ids, record['Y'])