def load_data_raw(istream):
    """
    Load training data from a stream of input.

    Input is a TSV with fields (id, text, label); each row is converted
    with process_input and the whole set is shuffled in place.

    Returns:
        ids: tuple of instance ids.
        X: tuple of processed inputs.
        y: tuple of labels.
    """
    log("Loading training data...")
    data = list(map(process_input, tqdm(RowObjectFactory.from_stream(csv.reader(istream, delimiter="\t")))))
    # Shuffle once up front so any downstream train/dev split is not order-biased.
    np.random.shuffle(data)
    ids, X, y = zip(*data)
    log("Done. Loaded {} instances", len(data))
    return ids, X, y
def do_run(args):
    """
    Run the neural net to predict on new data.

    Reads TSV rows from args.input (fields include id and text), embeds each
    tokenized tweet with the word-vector model, predicts label scores in
    batches of args.batch_size, and writes a TSV to args.output with a header
    row ('id' plus LABELS) followed by one row per tweet: the id and one
    float score per label.
    """
    # Load the model and weights
    model = load_model(args.model, args.weights)
    wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*')

    data = ((tweet.id, tokenize(to_ascii(tweet.text)))
            for tweet in RowObjectFactory.from_stream(csv.reader(args.input, delimiter="\t")))

    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(['id',] + LABELS)
    # Predict in fixed-size batches to bound memory on large inputs.
    for batch in tqdm(grouper(args.batch_size, data)):
        ids_batch, X_batch = zip(*batch)
        X_batch = wvecs.embed_sentences(X_batch)
        labels = model.predict_on_batch(X_batch)
        # 'row_id' instead of 'id' — avoid shadowing the builtin id().
        for row_id, label in zip(ids_batch, labels):
            writer.writerow([row_id,] + [float(score) for score in label])
def do_command(args):
    """
    Build training matrices from a TSV stream of tweets and save them.

    Reads tab-separated rows from args.input, converts them to training
    arrays via prepare_data, and stores the result in args.output as an
    .npz archive with keys 'X_train' and 'y_train'.
    """
    rows = csv.reader(args.input, delimiter="\t")
    tweets = RowObjectFactory.from_stream(rows)
    X_train, y_train = prepare_data(tweets)
    np.savez(args.output, X_train=X_train, y_train=y_train)