def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    train_labels = np.asarray(
            map(int, list(file_line_generator(args.train_labels))))
    train_features = np.loadtxt(args.train_data)

    if train_features.ndim == 1:
        train_features = train_features.reshape((train_features.shape[0], 1))

    test_labels = np.asarray(
            map(int, list(file_line_generator(args.test_labels))))
    test_features = np.loadtxt(args.test_data)

    if test_features.ndim == 1:
        test_features = test_features.reshape((test_features.shape[0], 1))

    log.info('performing classification')
    single_predictions, classification_result, weight_vectors, model = \
            calc_results(train_features, train_labels, test_features,
            test_labels, args.normalize, args.mode == True)

    log.info('storing results')
    save_object_to_file(model, os.path.join(args.output_dir, 'svm'))
    np.savetxt(os.path.join(args.output_dir, 'weights.csv'), weight_vectors,
            '%f', ';', '\n')

    header = 'instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 1]
    all_pred_labels = single_predictions[:, 2]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)
    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    header = 'accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    train_labels = np.asarray(
            list(map(int, list(file_line_generator(args.train_labels)))))
    train_features = np.loadtxt(args.train_data)

    if train_features.ndim == 1:
        train_features = train_features.reshape((train_features.shape[0], 1))

    test_labels = np.asarray(
            list(map(int, list(file_line_generator(args.test_labels)))))
    test_features = np.loadtxt(args.test_data)

    if test_features.ndim == 1:
        test_features = test_features.reshape((test_features.shape[0], 1))

    log.info('performing classification')
    single_predictions, classification_result, weight_vectors, model = \
            calc_results(train_features, train_labels, test_features,
            test_labels, args.normalize, args.mode == True)

    log.info('storing results')
    save_object_to_file(model, os.path.join(args.output_dir, 'svm'))
    np.savetxt(os.path.join(args.output_dir, 'weights.csv'), weight_vectors,
            '%f', ';', '\n')

    header = 'instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 1]
    all_pred_labels = single_predictions[:, 2]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)
    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    header = 'accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('converting file')

    with utf8_file_open(args.outfile, 'w') as outfile:
        for line in file_line_generator(args.prediction_file):
            if line.startswith(u'#'):
                continue

            (_, true_label, pred_label) = line.split(';')
            true_label = int(true_label)
            pred_label = int(pred_label)

            tp = 1 if true_label == 1 and pred_label == 1 else 0
            model_pos = 1 if pred_label == 1 else 0
            gold_pos = 1 if true_label == 1 else 0
            outfile.write(u'%d %d %d\n' % (tp, model_pos, gold_pos))

    log.info('finished')
def getData(emb_file):
    """Load the data file.

    Parameters
    ----------
    emb_file : str
        name of the data file in which the first tab-separated column
        contains the title and the second column the values of an item

    Returns
    -------
    list(str)
        item titles
    ndarray
        item values, one row per item
    """
    titles = []
    data = []

    for l in file_line_generator(emb_file):
        token, emb = l.split('\t')
        titles.append(token)
        data.append(np.fromstring(emb, sep=' '))

    return titles, np.asarray(data)
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') if args.vocabulary is None: vocab = args.vocabulary else: vocab = read_vocabulary_id_file(args.vocabulary) text = list(file_line_generator(args.infile)) ngram_range = list(map(int, tuple(args.ngram.split(',')))) vectorizer = CountVectorizer(token_pattern='[^ ]+', min_df=0.0, vocabulary=vocab, ngram_range=ngram_range, dtype=int) log.info('creating features') bow = vectorizer.fit_transform(text) log.info('storing result') np.savetxt(args.out_feature_file, bow.todense(), fmt='%d') with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file: vocab_file.write('\n'.join(vectorizer.get_feature_names())) log.info('finished')
def main(argv=None):
    log.info('started application')

    if argv is None:
        argv = sys.argv[1:]

    # Parse the explicitly passed argument list instead of silently falling
    # back to sys.argv, as the other entry points in this project do.
    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('reading index file')
    idx = get_indices(args.indices)
    max_idx = max(idx)

    log.info('filtering file')

    with utf8_file_open(args.outfile, 'w') as outfile:
        for (cur_idx, line) in enumerate(
                file_line_generator(args.infile, False)):

            if not args.inverse:
                if cur_idx in idx:
                    outfile.write(line)

                if cur_idx >= max_idx:
                    break
            else:
                if cur_idx not in idx:
                    outfile.write(line)

    log.info('finished')
def getData(emb_file):
    """Load the data file.

    Parameters
    ----------
    emb_file : str
        name of the data file in which the first tab-separated column
        contains the title and the second column the values of an item

    Returns
    -------
    list(str)
        item titles
    ndarray
        item values, one row per item
    """
    titles = []
    data = []

    for l in file_line_generator(emb_file):
        token, emb = l.split(u'\t')
        titles.append(token)
        data.append(np.fromstring(emb, sep=u' '))

    return titles, np.asarray(data)
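# Hedged usage sketch for getData (not part of the original code): it expects
# a tab-separated file with a title column and a space-separated value vector
# column, read line by line via file_line_generator from the surrounding
# project. The file name 'embeddings.tsv' and its two rows are invented for
# illustration.
def _getData_demo():
    with open('embeddings.tsv', 'w') as f:
        f.write('cat\t0.1 0.2 0.3\n')
        f.write('dog\t0.4 0.5 0.6\n')

    titles, data = getData('embeddings.tsv')
    # titles == ['cat', 'dog'] and data.shape == (2, 3)
    return titles, data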
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    labels = np.asarray(map(int, list(file_line_generator(args.label_file))))

    log.info('performing cross validation')
    single_predictions, classification_result = do_cross_validation(labels)

    log.info('storing results')
    header = 'fold_no;instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 2]
    all_pred_labels = single_predictions[:, 3]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)
    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(-1,
            all_true_labels, all_pred_labels)

    header = 'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') if args.vocabulary is None: vocab = args.vocabulary else: vocab = read_vocabulary_id_file(args.vocabulary) text = list(file_line_generator(args.infile)) ngram_range = map(int, tuple(args.ngram.split(','))) vectorizer = CountVectorizer(token_pattern='[^ ]+', min_df=0.0, vocabulary=vocab, ngram_range=ngram_range, dtype=int) log.info('creating features') bow = vectorizer.fit_transform(text) log.info('storing result') np.savetxt(args.out_feature_file, bow.todense(), fmt='%d') with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file: vocab_file.write(u'\n'.join(vectorizer.get_feature_names())) log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('converting file')

    with utf8_file_open(args.outfile, 'w') as outfile:
        for line in file_line_generator(args.prediction_file):
            if line.startswith('#'):
                continue

            (_, true_label, pred_label) = line.split(';')
            true_label = int(true_label)
            pred_label = int(pred_label)

            tp = 1 if true_label == 1 and pred_label == 1 else 0
            model_pos = 1 if pred_label == 1 else 0
            gold_pos = 1 if true_label == 1 else 0
            outfile.write('%d %d %d\n' % (tp, model_pos, gold_pos))

    log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    items = []

    for line in file_line_generator(args.data_file):
        items.append(tuple(line.split()))

    log.info('compute majority labels')
    cluster_to_label_count = defaultdict(Counter)

    # Count labels per cluster
    for (label, cluster_id) in items:
        cluster_to_label_count[cluster_id][label] += 1

    majority_labels = dict()

    # Get majority label per cluster
    for cluster_id in cluster_to_label_count:
        majority_labels[cluster_id] = \
                cluster_to_label_count[cluster_id].most_common(1)[0][0]

    log.info('assign labels to examples')

    with utf8_file_open(args.predicted_labels, 'w') as pred_file:
        for example_line in file_line_generator(args.data_file):
            pred_file.write(majority_labels[example_line.split()[1]] + '\n')

    if args.cluster_labels:
        with utf8_file_open(args.cluster_labels, 'w') as outfile:
            for (cluster_id, label) in sort_dict_by_key(majority_labels):
                outfile.write('%s %s\n' % (cluster_id, label))

    log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    items = []

    for line in file_line_generator(args.data_file):
        items.append(tuple(line.split()))

    log.info('compute majority labels')
    cluster_to_label_count = defaultdict(Counter)

    # Count labels per cluster
    for (label, cluster_id) in items:
        cluster_to_label_count[cluster_id][label] += 1

    majority_labels = dict()

    # Get majority label per cluster
    for cluster_id in cluster_to_label_count:
        majority_labels[cluster_id] = \
                cluster_to_label_count[cluster_id].most_common(1)[0][0]

    log.info('assign labels to examples')

    with utf8_file_open(args.predicted_labels, 'w') as pred_file:
        for example_line in file_line_generator(args.data_file):
            pred_file.write(majority_labels[example_line.split()[1]] + u'\n')

    if args.cluster_labels:
        with utf8_file_open(args.cluster_labels, 'w') as outfile:
            for (cluster_id, label) in sort_dict_by_key(majority_labels):
                outfile.write(u'%s %s\n' % (cluster_id, label))

    log.info('finished')
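# Self-contained sketch of the majority-vote step used in the two mains above:
# count labels per cluster with a Counter and pick most_common(1). The toy
# (label, cluster_id) pairs below are invented for illustration only.
from collections import Counter, defaultdict

def _majority_labels_demo():
    items = [('POS', 'c1'), ('NEG', 'c1'), ('POS', 'c1'),
             ('NEG', 'c2'), ('NEG', 'c2')]

    cluster_to_label_count = defaultdict(Counter)
    for label, cluster_id in items:
        cluster_to_label_count[cluster_id][label] += 1

    # most_common(1) returns [(label, count)]; keep only the label.
    return {cid: counts.most_common(1)[0][0]
            for cid, counts in cluster_to_label_count.items()}

# _majority_labels_demo() == {'c1': 'POS', 'c2': 'NEG'}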
def read_unigram_frequencies(filename):
    """Read the unigram frequencies for all vocabulary items from the file.

    1 frequency per line.

    Caution: Don't forget to add the 4 special tokens, e.g., <UNK>. Besides
    <UNK> we don't want to draw them as noise, therefore they should have a
    count of 0.
    """
    unigram_dist = []

    for line in file_line_generator(filename):
        unigram_dist.append(int(line))

    return unigram_dist
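# Hedged sketch (not from the original code) of how such unigram counts might
# be turned into a noise distribution for NCE-style sampling: normalize the
# counts, so special tokens with a count of 0 get probability 0 and are never
# drawn. The optional flattening exponent is a common choice elsewhere, not
# something this code prescribes.
import numpy as np

def counts_to_noise_distribution(unigram_dist, power=1.0):
    counts = np.asarray(unigram_dist, dtype=np.float64) ** power
    return counts / counts.sum()

# Example with invented counts (leading zeros: special tokens):
# counts_to_noise_distribution([0, 0, 0, 5, 3, 2])
# -> array([0. , 0. , 0. , 0.5, 0.3, 0.2])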
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') file2_content = list(file_line_generator(args.file2)) log.info('combining files') with utf8_file_open(args.out_file, 'w') as outfile: for c, line1 in enumerate(file_line_generator(args.file1)): log_iterations(log, c, 1000) for line2 in file2_content: outfile.write(line1 + args.separator + line2 + u'\n') log.info('finished')
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') file2_content = list(file_line_generator(args.file2)) log.info('combining files') with utf8_file_open(args.out_file, 'w') as outfile: for c, line1 in enumerate(file_line_generator(args.file1)): log_iterations(log, c, 1000) for line2 in file2_content: outfile.write(line1 + args.separator + line2 + '\n') log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    true = []
    pred = []

    for line in file_line_generator(args.true_labels):
        true.append(line)

    for line in file_line_generator(args.pred_labels):
        pred.append(line)

    acc = accuracy_score(true, pred)
    log.info('accuracy: %f' % acc)

    if args.precision or args.recall or args.f_measure:
        p, r, f, _ = precision_recall_fscore_support(true, pred, args.beta,
                pos_label=args.pos_label,
                average=None if not args.avg else args.avg)

        if args.precision:
            log.info('precision: %f' % p)

        if args.recall:
            log.info('recall: %f' % r)

        if args.f_measure:
            log.info('f-measure: %f' % f)

    log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    vocab = embeddings.read_vocabulary_file(args.vocabulary, False)
    contexts = list(file_line_generator(args.contexts))
    dists = np.loadtxt(args.distributions)

    log.info('computing results')

    # Add X in the n-grams' centers.
    # Assume we have the same context size left and right.
    x_pos = len(contexts[0].split()) // 2
    contexts = [sp[:x_pos] + ['X'] + sp[x_pos:]
            for sp in [c.split() for c in contexts]]

    # Sorts all words for each context descending.
    sort_words_per_context_value = np.sort(dists, 1)[:, ::-1]
    sort_words_per_context_idx = np.argsort(dists, 1)[:, ::-1]

    # Sorts all contexts according to their probability assigned to "similar".
    sort_context_for_similar_idx = np.argsort(dists[:, 465])[::-1]
    sort_context_for_similar_value = np.sort(dists[:, 465])[::-1]

    log.info('writing data')

    with utf8_file_open(args.out_file, 'w') as likelihood_file:
        # Write results to a file
        for (i, idx) in enumerate(sort_context_for_similar_idx):
            likelihood_file.write(u' '.join(contexts[idx]) + u'\t' +
                    unicode(sort_context_for_similar_value[i]) + u'\n')

            # 10 most likely words for the current context
            for j in xrange(10):
                likelihood_file.write(
                        vocab[sort_words_per_context_idx[idx, j]] + u'\t' +
                        unicode(sort_words_per_context_value[idx, j]) + u'\n')

            likelihood_file.write(u'\n')

    log.info('finished')
def configure(self, args):
    super(vLblNCEPredictor, self).configure(args)
    self.vocab = read_vocabulary_id_file(args.vocabulary)
    self.vocab_size = len(self.vocab.keys())
    self.effective_vocab_size = len(self.vocab.keys())
    self.perplexity = args.perplexity
    self.save_word = args.save_word
    self.result_file = args.result_file
    self.store_rank = args.store_rank
    self.store_argmax = args.store_argmax
    self.store_softmax = args.store_softmax
    self.normalize_with_root = args.normalize_with_root
    self.information = args.information
    self.predictions = args.predictions

    # This code is taken from SimpleVLblNceTrainer
    if args.pred_vocab:
        # Element i contains the index of the i'th prediction vocabulary
        # token in the original vocabulary.
        self.vocab_mapping_list = list()

        # Mapping from the model vocabulary to the prediction vocabulary
        # indices
        self.vocab_mapping = dict()

        for i, token in enumerate(file_line_generator(args.pred_vocab)):
            if not token in self.vocab:
                # Apply "%" to the full (implicitly concatenated) message;
                # with "+" the formatting only applied to the second literal
                # and raised a TypeError instead of this error.
                raise ValueError('Token "%s" in prediction vocabulary '
                        'does not exist in model vocabulary.' % token)

            self.vocab_mapping_list.append(self.vocab[token])
            self.vocab_mapping[self.vocab[token]] = i
    else:
        self.vocab_mapping_list = range(len(self.vocab))
        self.vocab_mapping = dict(
                zip(self.vocab_mapping_list, self.vocab_mapping_list))

    if self.perplexity:
        self.example_iterator_type = PaddedWindowExamplesGenerator
        self.example_processor = self._process_example_full_text
        # We need to set this, because otherwise
        # PaddedWindowExamplesGenerator will ignore end-of-sentence tags
        # (</S>).
        self.learn_eos = True
        self.disable_padding = False
        self.w_indices = debug_print(T.imatrix('w'), 'w')
        self.inputs.append(self.w_indices)
    else:
        self.example_processor = self._process_example_context_per_line
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    vocab = embeddings.read_vocabulary_file(args.vocabulary, False)
    contexts = list(file_line_generator(args.contexts))
    dists = np.loadtxt(args.distributions)

    log.info('computing results')

    # Add X in the n-grams' centers.
    # Assume we have the same context size left and right.
    x_pos = len(contexts[0].split()) // 2
    contexts = [sp[:x_pos] + ['X'] + sp[x_pos:]
            for sp in [c.split() for c in contexts]]

    # Sorts all words for each context descending.
    sort_words_per_context_value = np.sort(dists, 1)[:, ::-1]
    sort_words_per_context_idx = np.argsort(dists, 1)[:, ::-1]

    # Sorts all contexts according to their probability assigned to "similar".
    sort_context_for_similar_idx = np.argsort(dists[:, 465])[::-1]
    sort_context_for_similar_value = np.sort(dists[:, 465])[::-1]

    log.info('writing data')

    with utf8_file_open(args.out_file, 'w') as likelihood_file:
        # Write results to a file
        for (i, idx) in enumerate(sort_context_for_similar_idx):
            likelihood_file.write(u' '.join(contexts[idx]) + u'\t' +
                    unicode(sort_context_for_similar_value[i]) + u'\n')

            # 10 most likely words for the current context
            for j in xrange(10):
                likelihood_file.write(
                        vocab[sort_words_per_context_idx[idx, j]] + u'\t' +
                        unicode(sort_words_per_context_value[idx, j]) + u'\n')

            likelihood_file.write(u'\n')

    log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('preprocessing data')

    if args.amazon is True:
        line_iterator = \
                AmazonProductReviewCorpusReader(args.infile).review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    if args.sentence_splitter:
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:
        for (i, line) in enumerate(line_iterator):
            log_iterations(log, i, 100000)

            if args.replace_digits:
                line = re.sub(r'\d', args.replace_digits, line, 0, REGEX_FLAGS)

            if args.strip_html:
                line = nltk.clean_html(line)

            if args.sentence_splitter:
                line = sent_splitter.tokenize(line)
            else:
                line = [line]

            if args.tokenize:
                line = [tokenize(l) for l in line]

            if not args.tokenize:
                outfile.write('\n'.join(line))
            else:
                outfile.write('\n'.join([' '.join(l) for l in line]))

            outfile.write('\n')

    log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('preprocessing data')

    if args.amazon is True:
        line_iterator = \
                AmazonProductReviewCorpusReader(args.infile).review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    if args.sentence_splitter:
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:
        for (i, line) in enumerate(line_iterator):
            log_iterations(log, i, 100000)

            if args.replace_digits:
                line = re.sub(r'\d', args.replace_digits, line, 0, REGEX_FLAGS)

            if args.strip_html:
                line = nltk.clean_html(line)

            if args.sentence_splitter:
                line = sent_splitter.tokenize(line)
            else:
                line = [line]

            if args.tokenize:
                line = [tokenize(l) for l in line]

            if not args.tokenize:
                outfile.write(u'\n'.join(line))
            else:
                outfile.write(u'\n'.join([u' '.join(l) for l in line]))

            outfile.write(u'\n')

    log.info('finished')
def review_generator(self, remove_meta_cols=True):
    """Iterate over all reviews

    Parameters
    ----------
    remove_meta_cols : bool
        indicates whether or not to remove the first 7 meta data columns
    """
    for line in file_line_generator(self.infile):
        line = line.decode(errors='ignore')

        if remove_meta_cols is True:
            line = self._extract_body(line)

        yield line
    # The generator ends with the loop; raising StopIteration explicitly is
    # unnecessary and becomes a RuntimeError under PEP 479.
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) vocab = pd.Series(file_line_generator(args.vocabulary, comment='##')) with open(args.infile, 'rb') as infile: integers = np.fromfile(infile, np.int32) with utf8_file_open(args.outfile, 'w') as outfile: outfile.write(u'\n'.join(vocab[integers])) outfile.write(u'\n') log.info('finished')
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('transforming data')

    with utf8_file_open(args.outfile, 'w') as outfile:
        for line in file_line_generator(args.infile):
            token, signature = line.split(u'\t')
            outfile.write(u'%s\t%s\n' % (token,
                    prepare_brown_signature(signature, args.max_size,
                    args.right)))

    log.info('finished')
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) vocab = pd.Series(file_line_generator(args.vocabulary, comment='##')) with open(args.infile, 'rb') as infile: integers = np.fromfile(infile, np.int32) with utf8_file_open(args.outfile, 'w') as outfile: outfile.write('\n'.join(vocab[integers])) outfile.write('\n') log.info('finished')
def get_indices(indices): """Generates line indices to keep. Parameters ---------- indices : str either name of a file containing indices one per line or a comma separated string Returns ------- int next index """ if os.path.exists(indices): return set(map(int, file_line_generator(indices, True))) return set((int(i.strip()) for i in indices.split(',')))
def get_indices(indices): """Generates line indices to keep. Parameters ---------- indices : str either name of a file containing indices one per line or a comma separated string Returns ------- int next index """ if os.path.exists(indices): return set(map(int, file_line_generator(indices, True))) return set((int(i.strip()) for i in indices.split(u',')))
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') vocab = read_vocabulary_id_file(args.vocabulary, False) _, ext = os.path.splitext(args.feature_file) if ext == 'npy': features = np.load(args.feature_file) else: features = np.loadtxt(args.feature_file) log.info('creating features') with utf8_file_open(args.out_feature_file, 'w') as outfile: for line in file_line_generator(args.infile): toks = line.split() cur_features = np.zeros((len(toks), features.shape[1])) for (i, tok) in enumerate(toks): cur_features[i, :] = features[vocab.get( tok, SpecialTokenID.UNKNOWN.value)] if args.avg: res = ndarray_to_string(np.mean(cur_features, axis=0)) else: res = ndarray_to_string( np.reshape(cur_features, np.prod(cur_features.shape), order='C')) outfile.write(res + u'\n') log.info('finished')
def main(argv=None): """See argument parser description.""" if argv is None: argv = sys.argv[1:] args = parser.parse_args(argv) log.info('start parameters: ' + str(args)) log.info('loading data') vocab = read_vocabulary_id_file(args.vocabulary, False) _, ext = os.path.splitext(args.feature_file) if ext == 'npy': features = np.load(args.feature_file) else: features = np.loadtxt(args.feature_file) log.info('creating features') with utf8_file_open(args.out_feature_file, 'w') as outfile: for line in file_line_generator(args.infile): toks = line.split() cur_features = np.zeros((len(toks), features.shape[1])) for (i, tok) in enumerate(toks): cur_features[i, :] = features[ vocab.get(tok, SpecialTokenID.UNKNOWN.value)] if args.avg: res = ndarray_to_string(np.mean(cur_features, axis=0)) else: res = ndarray_to_string(np.reshape(cur_features, np.prod(cur_features.shape), order='C')) outfile.write(res + u'\n') log.info('finished')
def read_vocabulary_file(input_file, add_special_tokens=True):
    """Read the textual vocabulary into a list.

    Items that are empty after calling str.strip on them will be mapped to
    u'<EMPTY>'.

    Parameters
    ----------
    input_file : str
        location of the vocabulary
    add_special_tokens : bool
        indicates whether or not to add special tokens to the front of the
        vocabulary, like <UNK> for unknown tokens, etc.

    Returns
    -------
    list(str)
        vocabulary tokens; the position of a token in the list is its
        unique id
    """
    vocab = list(file_line_generator(input_file))

    if add_special_tokens:
        _add_special_tokens(vocab)

    return [v.strip() if v.strip() else '<EMPTY>' for v in vocab]
def read_vocabulary_file(input_file, add_special_tokens=True):
    """Read the textual vocabulary into a list.

    Items that are empty after calling str.strip on them will be mapped to
    u'<EMPTY>'.

    Parameters
    ----------
    input_file : str
        location of the vocabulary
    add_special_tokens : bool
        indicates whether or not to add special tokens to the front of the
        vocabulary, like <UNK> for unknown tokens, etc.

    Returns
    -------
    list(str)
        vocabulary tokens; the position of a token in the list is its
        unique id
    """
    vocab = list(file_line_generator(input_file))

    if add_special_tokens:
        _add_special_tokens(vocab)

    return [v.strip() if v.strip() else u'<EMPTY>' for v in vocab]
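# Minimal, self-contained sketch of the '<EMPTY>' mapping performed above,
# using a plain list instead of a vocabulary file so it does not depend on
# file_line_generator or _add_special_tokens. The example tokens are
# invented.
def _empty_token_demo():
    vocab = ['the', '   ', 'cat']
    cleaned = [v.strip() if v.strip() else '<EMPTY>' for v in vocab]
    return cleaned  # ['the', '<EMPTY>', 'cat']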
def calc_matrix_statistics(matrix_file):
    """Calculates some basic statistics for huge matrix files.

    If a matrix is too big to be imported into a program, use this method to
    calculate the mean, maximum, minimum, and standard deviation of every
    line in the file. It returns a generator.

    Parameters
    ----------
    matrix_file : str
        filename of the matrix file; the file must be a csv file with spaces
        as separator

    Returns
    -------
    generator : (float, float, float, float)
        mean, max, min, std_dev of current line in the matrix file
    """
    for line in file_line_generator(matrix_file):
        a = np.fromstring(line, sep=u' ')
        yield (np.mean(a), np.max(a), np.min(a), np.std(a))
    # The generator terminates on its own once the file is exhausted; an
    # explicit StopIteration is not needed (and is an error under PEP 479).
def calc_matrix_statistics(matrix_file):
    """Calculates some basic statistics for huge matrix files.

    If a matrix is too big to be imported into a program, use this method to
    calculate the mean, maximum, minimum, and standard deviation of every
    line in the file. It returns a generator.

    Parameters
    ----------
    matrix_file : str
        filename of the matrix file; the file must be a csv file with spaces
        as separator

    Returns
    -------
    generator : (float, float, float, float)
        mean, max, min, std_dev of current line in the matrix file
    """
    for line in file_line_generator(matrix_file):
        a = np.fromstring(line, sep=' ')
        yield (np.mean(a), np.max(a), np.min(a), np.std(a))
    # The generator terminates on its own once the file is exhausted; an
    # explicit StopIteration is not needed (and is an error under PEP 479).
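# Sketch of consuming calc_matrix_statistics without ever loading the whole
# matrix: stream the per-line statistics into a small CSV. The file names
# 'matrix.txt' and 'matrix_stats.csv' are invented, and the call assumes
# file_line_generator from the surrounding project is importable.
def write_matrix_statistics(matrix_file='matrix.txt',
        out_file='matrix_stats.csv'):
    with open(out_file, 'w') as out:
        out.write('mean;max;min;std\n')

        for mean, max_, min_, std in calc_matrix_statistics(matrix_file):
            out.write('%f;%f;%f;%f\n' % (mean, max_, min_, std))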
def example_iter(self, filename):
    for example in file_line_generator(filename):
        yield example
    # No explicit StopIteration needed; the generator ends with the loop.
def extract_results_from_logfile(logfile, result='train_error', fmt='new', no_of_val_files=1): """Extract results from a given logfile and returns them as ndarray. Parameters ---------- logfile : str path of the logfile result : str type of the result to be extracted; one of 'train_error', 'val_error', 'val_ppl' format : str 'new' or 'old', new format allows several validation files; old format only allowed 1 validation file. no_of_val_files : int number of validation files used in the logfile; is only matters if result = 'val_error' or 'val_perplexity' Returns ------- ndarray contains all results in an array """ if fmt == 'old': val_method_name = 'validate' else: val_method_name = '_validate_single_file' if result == 'train_error': pattern = re.compile(r'run\tAverage loss on .*? training set is (.*)', re.UNICODE) elif result == 'val_error': pattern = re.compile( r'%s\tAverage loss on .*? validation set is (.*)' % val_method_name, re.UNICODE) elif result == 'val_ppl': pattern = re.compile( r'%s\tPerplexity on .*? validation set is (.*)' % val_method_name, re.UNICODE) else: raise ValueError( 'Unknown result type to be extracted from logfile: %s' % result) values = list() for line in file_line_generator(logfile): match = re.search(pattern, line) if not match: continue values.append(float(match.group(1))) # Converts the 1d list of results into one list per validation file. if (result == 'val_error' or result == 'val_ppl') and no_of_val_files != 1: values = list(grouper_recipes(values, no_of_val_files)) values = zip(*values) return values
def extract_results_from_logfile(logfile, result='train_error', fmt='new', no_of_val_files=1): """Extract results from a given logfile and returns them as ndarray. Parameters ---------- logfile : str path of the logfile result : str type of the result to be extracted; one of 'train_error', 'val_error', 'val_ppl' format : str 'new' or 'old', new format allows several validation files; old format only allowed 1 validation file. no_of_val_files : int number of validation files used in the logfile; is only matters if result = 'val_error' or 'val_perplexity' Returns ------- ndarray contains all results in an array """ if fmt == 'old': val_method_name = 'validate' else: val_method_name = '_validate_single_file' if result == 'train_error': pattern = re.compile(r'run\tAverage loss on .*? training set is (.*)', re.UNICODE) elif result == 'val_error': pattern = re.compile( r'%s\tAverage loss on .*? validation set is (.*)' % val_method_name, re.UNICODE) elif result == 'val_ppl': pattern = re.compile( r'%s\tPerplexity on .*? validation set is (.*)' % val_method_name, re.UNICODE) else: raise ValueError('Unknown result type to be extracted from logfile: %s' % result) values = list() for line in file_line_generator(logfile): match = re.search(pattern, line) if not match: continue values.append(float(match.group(1))) # Converts the 1d list of results into one list per validation file. if (result == 'val_error' or result == 'val_ppl') and no_of_val_files != 1: values = list(grouper_recipes(values, no_of_val_files)) values = zip(*values) return values
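# Hedged usage sketch for extract_results_from_logfile: pull the training
# error and the per-validation-file losses out of one logfile. The file name
# 'train.log' and the use of two validation files are assumptions made for
# this example, not facts about any particular run.
def summarize_log(logfile='train.log'):
    train_error = extract_results_from_logfile(logfile, result='train_error')
    val_error = extract_results_from_logfile(logfile, result='val_error',
            no_of_val_files=2)

    print('epochs logged: %d' % len(train_error))

    for i, errors in enumerate(val_error):
        print('validation file %d: %s' % (i, errors))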