def load_files(src, tgt):
    """Loads the src and tgt files in parallel."""
    with open(src) as a, open(tgt) as b:
        out = [x for x in tqdm(zip(a, b), total=get_len(src), desc="Files")]
    return [x[0] for x in out], [x[1] for x in out]
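# The loaders here (load_files above, load_dataset and load_file below) call
# get_len() and tqdm without defining or importing them. A minimal sketch of
# the assumed helper, reading get_len() as a plain line counter given its use
# as a tqdm total:

from tqdm import tqdm

def get_len(filepath):
    # Count lines so tqdm can show a meaningful total while iterating a file.
    with open(filepath) as f:
        return sum(1 for _ in f)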
def load_dataset(reference_doc, hypothesis_doc=None, lowercase=False):
    """
    Since processing the sequences is often more expensive than loading
    the dataset, we load the whole dataset into memory first.
    """
    sequences = []
    ignored = []
    count = 0
    len_a = get_len(reference_doc)
    load_a = open(reference_doc, "r")
    if hypothesis_doc:
        len_b = get_len(hypothesis_doc)
        if len_a != len_b:
            print("[Warning] The datasets are not equal in length.")
        load_b = open(hypothesis_doc, "r")
        with load_a as a, load_b as b:
            for line in tqdm(zip(a, b), total=len_a):
                count += 1
                # strip surrounding whitespace
                line = [s.strip() for s in line]
                if lowercase:
                    line = [s.lower() for s in line]
                # skip pairs where both sides are empty
                if len(line[0]) < 1 and len(line[1]) < 1:
                    ignored.append(count)
                    continue
                sequences.append(line)
    else:
        with load_a as a:
            for line in tqdm(a, total=len_a):
                count += 1
                # strip surrounding whitespace
                line = line.strip()
                if lowercase:
                    line = line.lower()
                # skip empty lines
                if len(line) < 1:
                    ignored.append(count)
                    continue
                sequences.append(line)
    if len(ignored) > 0:
        print("[Warning] There were", len(ignored), "ignored sequences.")
    return sequences
def load_file(filepath, formatting, case_sensitive=True, max_train_seq=None):
    """Loads text from file, optionally lowercasing and capping the count."""
    # NOTE: `formatting` is currently unused.
    sequences = []
    count = 0
    with open(filepath) as f:
        for line in tqdm(f, total=get_len(filepath)):
            if not case_sensitive:
                line = line.lower()
            sequences.append(line.strip())
            count += 1
            # stop once the cap is reached (the original `>` loaded one extra line)
            if max_train_seq and count >= max_train_seq:
                break
    print('[Info] Loaded {} sequences from {}'.format(len(sequences), filepath))
    return sequences
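# Example usage of the three loaders (hypothetical file paths):
#   src_lines, tgt_lines = load_files("data/train.src", "data/train.tgt")
#   pairs = load_dataset("data/train.src", "data/train.tgt", lowercase=True)
#   lines = load_file("data/train.src", formatting=None, case_sensitive=False,
#                     max_train_seq=100000)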
def get_queryset(self):
    queryset = Song.objects.raw(get_natural_sort_radio_queryset('song', self.request))
    # RawQuerySet has no __len__, which breaks len() and pagination;
    # get_len(queryset) must return a callable suitable as __len__.
    setattr(type(queryset), '__len__', get_len(queryset))
    return queryset
def get_queryset(self):
    queryset = Game.objects.raw(get_natural_sort_artist_queryset(self.request))
    # Same RawQuerySet patch as above: give the class a __len__ so len() works.
    setattr(type(queryset), '__len__', get_len(queryset))
    return queryset
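# For the setattr(type(queryset), '__len__', get_len(queryset)) patch in the
# two views above to work, this get_len() must return a callable, not a
# number (otherwise len() raises TypeError). A hypothetical implementation
# consistent with that usage:

def get_len(queryset):
    # Materialise the raw queryset once; len() calls then reuse the count.
    n = len(list(queryset))
    return lambda self: n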
def main():
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator_cnn(opt)

    srcBatch, tgtBatch = [], []
    tgt_id = 0 if opt.tgt == opt.label0 else 1
    count = 0
    total_correct, total_words, total_loss = 0, 0, 0
    outputs, predictions, sents = [], [], []

    # load bpe encoder.
    bpe_enc = bpe_encoder.from_dict(translator.src_dict)
    bpe_enc.mute()

    max_seq_length = translator.model_opt.sequence_length

    for line in tqdm(addone(codecs.open(opt.src, "r", "utf-8")),
                     total=get_len(opt.src) + 1):
        count += 1
        if line is not None:
            sents.append(line)
            # tokenise, clip to the model's window, and add SOS/EOS markers.
            tokens = [f for f in bpe_enc.transform([line])][0]
            tokens = reclip(line, tokens, bpe_enc, max_seq_length - 2)
            tokens = [SOS] + tokens + [EOS]
            # add padding.
            blanks = [Constants.PAD for _ in range(max_seq_length - len(tokens))]
            tokens = tokens + blanks
            srcBatch.append(tokens)
            tgtBatch += [tgt_id]
            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # at the end of file, check the last batch
            if len(srcBatch) == 0:
                break

        num_correct, batchSize, outs, preds = translator.translate_bpe(srcBatch, tgtBatch)
        total_correct += num_correct.item()
        total_words += batchSize
        outputs += outs.data.tolist()
        predictions += preds.tolist()
        srcBatch, tgtBatch = [], []

    if opt.output:
        with open(opt.output, "w") as outF:
            for i in range(len(sents)):
                outF.write(str(predictions[i]) + "\t" + str(outputs[i]) + "\t" + sents[i])

    print('Accuracy: ', str((total_correct * 100) / total_words))
    print('')
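# main() above iterates addone(...) and treats a None line as the end-of-file
# signal that flushes the last partial batch. addone() is not defined here;
# a minimal sketch under that assumption:

def addone(iterable):
    # Yield every item, then a single trailing None sentinel.
    for item in iterable:
        yield item
    yield None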
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(),
        *[model.max_positions() for model in models]
    )

    start_id = 0
    processed = 0  # renamed from `i`, which shadowed the enumerate index below
    with open("sacrebleu_fr.txt", "w") as writer:
        for inputs in tqdm(buffered_read(args.input, args.buffer_size),
                           total=int(get_len(args.input) / args.buffer_size)):
            results = []
            for batch in make_batches(inputs, args, task, max_positions, encode_fn):
                src_tokens = batch.src_tokens
                src_lengths = batch.src_lengths
                if use_cuda:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()

                sample = {
                    'net_input': {
                        'src_tokens': src_tokens,
                        'src_lengths': src_lengths,
                    },
                }
                translations = task.inference_step(generator, models, sample)
                for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
                    src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
                    results.append((start_id + id, src_tokens_i, hypos))

            # sort output to match input order
            for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]):
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)

                # Process top predictions
                for hypo in hypos[:min(len(hypos), args.nbest)]:
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                            if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    hypo_str = decode_fn(hypo_str)
                    writer.write(hypo_str + "\n")
                    processed += 1
                    if processed % 1000 == 0:
                        torch.cuda.empty_cache()

            # update running id counter
            start_id += len(inputs)
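# buffered_read() and make_batches() in main() above are not defined here;
# both appear in fairseq's interactive.py. A sketch of buffered_read()
# consistent with that script, grouping input lines into lists of up to
# buffer_size for batched translation:

import fileinput

def buffered_read(input, buffer_size):
    buffer = []
    with fileinput.input(files=[input], openhook=fileinput.hook_encoded("utf-8")) as h:
        for src_str in h:
            buffer.append(src_str.strip())
            if len(buffer) >= buffer_size:
                yield buffer
                buffer = []
    # flush the final partial buffer, if any
    if len(buffer) > 0:
        yield buffer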