def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') if args.vocabulary is None: vocab = args.vocabulary else: vocab = read_vocabulary_id_file(args.vocabulary) text = list(file_line_generator(args.infile)) ngram_range = map(int, tuple(args.ngram.split(','))) vectorizer = CountVectorizer(token_pattern='[^ ]+', min_df=0.0, vocabulary=vocab, ngram_range=ngram_range, dtype=int) log.info('creating features') bow = vectorizer.fit_transform(text) log.info('storing result') np.savetxt(args.out_feature_file, bow.todense(), fmt='%d') with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file: vocab_file.write(u'\n'.join(vectorizer.get_feature_names())) log.info('finished')
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') if args.vocabulary is None: vocab = args.vocabulary else: vocab = read_vocabulary_id_file(args.vocabulary) text = list(file_line_generator(args.infile)) ngram_range = list(map(int, tuple(args.ngram.split(',')))) vectorizer = CountVectorizer(token_pattern='[^ ]+', min_df=0.0, vocabulary=vocab, ngram_range=ngram_range, dtype=int) log.info('creating features') bow = vectorizer.fit_transform(text) log.info('storing result') np.savetxt(args.out_feature_file, bow.todense(), fmt='%d') with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file: vocab_file.write('\n'.join(vectorizer.get_feature_names())) log.info('finished')
def configure(self, args):
    super(EmbeddingsMiniBatchTrainer, self).configure(args)
    self.vocab = read_vocabulary_id_file(args.vocabulary)
    self.vocab_size = len(self.vocab)
    self.effective_vocab_size = len(self.vocab)
    self.word_embedding_size = args.word_embedding_size
    self.do_dump_vocabulary = args.dump_vocabulary
    self.do_dump_embeddings = args.dump_embeddings
    log.debug('Effective size of the vocabulary %d',
            self.effective_vocab_size)
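# Note (assumption, not stated in the source): read_vocabulary_id_file is
# this repository's helper and appears to return a dict mapping token ->
# integer id, e.g. {u'</S>': 0, u'<UNK>': 1, u'cat': 2}, which is why
# len(self.vocab) gives the vocabulary size.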
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading embeddings')
    vocab = read_vocabulary_id_file(args.vocabulary)
    embs = np.loadtxt(args.embeddings)

    log.info('loading documents')
    features, labels = load_data(args.corpus_dir, vocab, embs)

    log.info('performing cross validation')
    single_predictions, classification_result, weight_vectors = \
            do_cross_validation(features, labels)

    log.info('storing results')
    np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'),
            weight_vectors, '%f', ';', '\n')

    with utf8_file_open(os.path.join(args.output_dir, 'predictions.csv'),
            'w') as pred_file:
        pred_file.write(u'fold_no;doc;true_label;pred_label\n')

        for sp in single_predictions:
            pred_file.write(u';'.join(map(unicode, sp)) + u'\n')

    all_true_labels = [sp[2] for sp in single_predictions]
    all_pred_labels = [sp[3] for sp in single_predictions]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)
    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    # The last row contains the result over the predictions of all folds.
    classification_result[NO_OF_FOLDS, :] = get_classification_result(
            -1, all_true_labels, all_pred_labels)
    header = u'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', u';', u'\n', header=header)
    log.info(classification_result)
    log.info('finished')
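# Hedged sketch (not part of the original script) of the sklearn call used
# above: in the stored confusion matrix, rows are true labels and columns
# are predicted labels.
def _confusion_sketch():
    from sklearn.metrics import confusion_matrix
    return confusion_matrix([0, 1, 1, 0], [0, 0, 1, 0])
    # -> array([[2, 0],
    #           [1, 1]])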
def configure(self, args):
    super(vLblNCEPredictor, self).configure(args)
    self.vocab = read_vocabulary_id_file(args.vocabulary)
    self.vocab_size = len(self.vocab)
    self.effective_vocab_size = len(self.vocab)
    self.perplexity = args.perplexity
    self.save_word = args.save_word
    self.result_file = args.result_file
    self.store_rank = args.store_rank
    self.store_argmax = args.store_argmax
    self.store_softmax = args.store_softmax
    self.normalize_with_root = args.normalize_with_root
    self.information = args.information
    self.predictions = args.predictions

    # This code is taken from SimpleVLblNceTrainer
    if args.pred_vocab:
        # Element i contains the index of the i'th prediction vocabulary
        # token in the original vocabulary.
        self.vocab_mapping_list = list()

        # Mapping from the model vocabulary to the prediction vocabulary
        # indices
        self.vocab_mapping = dict()

        for i, token in enumerate(file_line_generator(args.pred_vocab)):
            if token not in self.vocab:
                raise ValueError(('Token "%s" in prediction vocabulary '
                        'does not exist in model vocabulary.') % token)

            self.vocab_mapping_list.append(self.vocab[token])
            self.vocab_mapping[self.vocab[token]] = i
    else:
        self.vocab_mapping_list = range(len(self.vocab))
        self.vocab_mapping = dict(
                zip(self.vocab_mapping_list, self.vocab_mapping_list))

    if self.perplexity:
        self.example_iterator_type = PaddedWindowExamplesGenerator
        self.example_processor = self._process_example_full_text
        # We need to set this; otherwise PaddedWindowExamplesGenerator will
        # ignore end-of-sentence tags (</S>).
        self.learn_eos = True
        self.disable_padding = False
        self.w_indices = debug_print(T.imatrix('w'), 'w')
        self.inputs.append(self.w_indices)
    else:
        self.example_processor = self._process_example_context_per_line
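# Worked toy example (hypothetical tokens) of the prediction-vocabulary
# mapping built above. With a model vocabulary {u'</S>': 0, u'cat': 1,
# u'dog': 2} and a pred_vocab file containing the lines "dog" and "cat":
#     vocab_mapping_list == [2, 1]        # model index of each pred token
#     vocab_mapping      == {2: 0, 1: 1}  # model index -> pred index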
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') vocab = read_vocabulary_id_file(args.vocabulary, False) _, ext = os.path.splitext(args.feature_file) if ext == 'npy': features = np.load(args.feature_file) else: features = np.loadtxt(args.feature_file) log.info('creating features') with utf8_file_open(args.out_feature_file, 'w') as outfile: for line in file_line_generator(args.infile): toks = line.split() cur_features = np.zeros((len(toks), features.shape[1])) for (i, tok) in enumerate(toks): cur_features[i, :] = features[vocab.get( tok, SpecialTokenID.UNKNOWN.value)] if args.avg: res = ndarray_to_string(np.mean(cur_features, axis=0)) else: res = ndarray_to_string( np.reshape(cur_features, np.prod(cur_features.shape), order='C')) outfile.write(res + u'\n') log.info('finished')
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') vocab = read_vocabulary_id_file(args.vocabulary, False) _, ext = os.path.splitext(args.feature_file) if ext == 'npy': features = np.load(args.feature_file) else: features = np.loadtxt(args.feature_file) log.info('creating features') with utf8_file_open(args.out_feature_file, 'w') as outfile: for line in file_line_generator(args.infile): toks = line.split() cur_features = np.zeros((len(toks), features.shape[1])) for (i, tok) in enumerate(toks): cur_features[i, :] = features[ vocab.get(tok, SpecialTokenID.UNKNOWN.value)] if args.avg: res = ndarray_to_string(np.mean(cur_features, axis=0)) else: res = ndarray_to_string(np.reshape(cur_features, np.prod(cur_features.shape), order='C')) outfile.write(res + u'\n') log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    model = load_object_from_file(args.model_file)

    # Read vocabulary from file.
    vocab = sort_dict_by_label(read_vocabulary_id_file(args.vocabulary))

    # Get matrices from the model.
    r_matrix = model.R.get_value()
    q_matrix = model.Q.get_value()

    # Get input embeddings.
    if args.model_type == 'vlbl':
        in_we = r_matrix
    elif args.model_type == 'vlbl_dist':
        # This will not work with old versions of the models, because of
        # sparsity.
        d_matrix = model.D.get_value().todense()
        in_we = np.dot(d_matrix, r_matrix)
        # Need to convert from numpy.matrix to numpy.ndarray.
        in_we = in_we.view(type=np.ndarray)

    with utf8_file_open(args.result_file + '.in', 'w') as outfile:

        for (word, ind) in vocab:
            outfile.write(unicode(word) + u' ' +
                    u' '.join(map(str, in_we[ind])) + u'\n')

    with utf8_file_open(args.result_file + '.out', 'w') as outfile:

        for (word, ind) in vocab:
            outfile.write(unicode(word) + u' ' +
                    u' '.join(map(str, q_matrix[ind])) + u'\n')

    log.info('finished')
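# Hedged sketch (not part of the original script) of why the
# .view(type=np.ndarray) conversion above is needed: np.dot on a dense
# matrix returned by .todense() yields numpy.matrix, whose rows stay 2-D.
def _matrix_view_sketch():
    import numpy as np
    m = np.matrix([[1., 2.], [3., 4.]])
    as_matrix = np.dot(m, np.eye(2))            # still numpy.matrix
    as_array = as_matrix.view(type=np.ndarray)  # plain ndarray
    # Row indexing: as_matrix[0].shape == (1, 2), as_array[0].shape == (2,),
    # so only the ndarray row works with u' '.join(map(str, row)).
    return as_matrix[0].shape, as_array[0].shape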
def configure(self, args):
    super(LblPredictor, self).configure(args)
    self.vocab = read_vocabulary_id_file(args.vocabulary)
    self.vocab_size = len(self.vocab)
    self.effective_vocab_size = len(self.vocab)
def configure(self, args):
    # The sentiment vocabulary must be set before the superclass is
    # configured.
    self.sent_vocab = set(read_vocabulary_id_file(args.sent_vocab, False))
    super(HingeSentimentMiniBatchTrainer, self).configure(args)