Example #1
def load_files(src, tgt):
    """
    Loads the src and tgt files into two parallel lists of lines.
    """
    with open(src) as a, open(tgt) as b:
        out = list(tqdm(zip(a, b), total=get_len(src), desc="Files"))
    # split the (src_line, tgt_line) pairs into two lists
    return [x[0] for x in out], [x[1] for x in out]
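Every file-based example on this page passes get_len(path) to tqdm as its total, so get_len is evidently a line-count helper. Its implementation is not shown in the source; a minimal sketch of what it presumably does:

def get_len(filepath):
    # Assumed helper: count the lines in a file so tqdm can show progress.
    with open(filepath, "rb") as f:
        return sum(1 for _ in f)

(Examples #4 and #5 call get_len on a Django queryset instead; see the note there.)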
Example #2
def load_dataset(reference_doc, hypothesis_doc=None, lowercase=False):
    """
    Since processing the sequences is often more expensive than
    loading the dataset, we load the whole dataset into memory first.
    """
    sequences = []
    ignored = []
    count = 0

    len_a = get_len(reference_doc)

    if hypothesis_doc:
        len_b = get_len(hypothesis_doc)
        if len_a != len_b:
            print("[Warning] The datasets are not equal in length.")

        with open(reference_doc, "r") as a, open(hypothesis_doc, "r") as b:
            for line in tqdm(zip(a, b), total=len_a):
                count += 1
                # strip leading/trailing whitespace from both sides
                line = [s.strip() for s in line]
                if lowercase:
                    line = [s.lower() for s in line]
                # skip pairs where both sides are empty
                if len(line[0]) < 1 and len(line[1]) < 1:
                    ignored.append(count)
                    continue
                sequences.append(line)
    else:
        with open(reference_doc, "r") as a:
            for line in tqdm(a, total=len_a):
                count += 1
                # strip leading/trailing whitespace
                line = line.strip()
                if lowercase:
                    line = line.lower()
                if len(line) < 1:
                    ignored.append(count)
                    continue
                sequences.append(line)

    if len(ignored) > 0:
        print("[Warning] There were", len(ignored), "ignored sequences.")
    return sequences
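A brief usage sketch (the file names are hypothetical): with two documents the function returns a list of [reference, hypothesis] pairs, with one document a list of single strings:

# Hypothetical paths, for illustration only.
pairs = load_dataset("ref.txt", "hyp.txt", lowercase=True)  # [[ref, hyp], ...]
lines = load_dataset("ref.txt")                             # [str, ...]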
Example #3
def load_file(filepath, formatting, case_sensitive=True, max_train_seq=None):
    """
    Loads text from a file, one sequence per line.
    """
    sequences = []

    count = 0
    with open(filepath) as f:
        for line in tqdm(f, total=get_len(filepath)):
            if not case_sensitive:
                line = line.lower()
            line = line.strip()
            sequences.append(line)
            count += 1
            # stop once max_train_seq sequences have been loaded
            if max_train_seq and count >= max_train_seq:
                break

    print('[Info] Loaded {} sequences from {}'.format(len(sequences),
                                                      filepath))
    return sequences
Example #4
def get_queryset(self):
    queryset = Song.objects.raw(get_natural_sort_radio_queryset('song',
                                                                self.request))
    setattr(type(queryset), '__len__', get_len(queryset))
    return queryset
Example #5
def get_queryset(self):
    queryset = Game.objects.raw(get_natural_sort_artist_queryset(self.request))
    setattr(type(queryset), '__len__', get_len(queryset))
    return queryset
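In Examples #4 and #5, get_len is used differently: Django's RawQuerySet has no __len__, so the return value of get_len(queryset) is patched onto the class as one, presumably so something like Django's Paginator can call len() on it. For that to work, get_len must return a callable; one way it could plausibly be written (assumed, not shown in the source):

def get_len(queryset):
    # Assumed variant for RawQuerySets: materialise the results once
    # and return a callable usable as __len__ on the class.
    n = len(list(queryset))
    return lambda self: n

Note that setattr on type(queryset) patches the RawQuerySet class itself, not just this instance, so this is a fairly blunt workaround.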
Example #6
def main():
    opt = parser.parse_args()
    opt.cuda = opt.gpu > -1
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)

    translator = onmt.Translator_cnn(opt)

    srcBatch, tgtBatch = [], []

    tgt_id = 0 if opt.tgt == opt.label0 else 1

    count = 0
    total_correct, total_words, total_loss = 0, 0, 0
    outputs, predictions, sents = [], [], []

    # Load the BPE encoder.
    bpe_enc = bpe_encoder.from_dict(translator.src_dict)
    bpe_enc.mute()

    max_seq_length = translator.model_opt.sequence_length

    # addone() appends a trailing None, hence the +1 on the total.
    for line in tqdm(addone(codecs.open(opt.src, "r", "utf-8")), total=get_len(opt.src)+1):
        count += 1
        if line is not None:
            sents.append(line)
            # Tokenise, then clip to leave room for the SOS/EOS markers.
            tokens = [f for f in bpe_enc.transform([line])][0]
            tokens = reclip(line, tokens, bpe_enc, max_seq_length-2)
            tokens = [SOS] + tokens + [EOS]
            # Pad the sequence up to max_seq_length.
            blanks = [Constants.PAD for _ in range(max_seq_length-len(tokens))]
            tokens = tokens + blanks

            srcBatch.append(tokens)
            tgtBatch += [tgt_id]

            if len(srcBatch) < opt.batch_size:
                continue
        else:
            # At the end of the file, check the last (possibly partial) batch.
            if len(srcBatch) == 0:
                break

        num_correct, batchSize, outs, preds = translator.translate_bpe(srcBatch, tgtBatch)

        total_correct += num_correct.item()
        total_words += batchSize
        outputs += outs.data.tolist()
        predictions += preds.tolist()

        srcBatch, tgtBatch = [], []

    if opt.output:
        with open(opt.output, "w") as outF:
            for i in range(len(sents)):
                outF.write(str(predictions[i]) + "\t" + str(outputs[i]) + "\t" + sents[i])

    print('Accuracy: ', str((total_correct*100)/total_words))
    print('')
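addone and reclip are helpers from the surrounding project and are not shown here. addone evidently yields every item of the underlying iterable followed by a single None sentinel, which is why the loop above flushes its final partial batch on None and why the tqdm total is get_len(opt.src)+1. A minimal sketch of that assumed behaviour:

def addone(iterable):
    # Yield every line, then a None sentinel so the caller
    # can detect end-of-file and flush its final partial batch.
    for item in iterable:
        yield item
    yield None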
Example #7
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(':'),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def encode_fn(x):
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models])

    start_id = 0
    n_translated = 0  # running count of emitted hypotheses
    with open("sacrebleu_fr.txt", "w") as writer:
        for inputs in tqdm(buffered_read(args.input, args.buffer_size),
                           total=int(get_len(args.input) / args.buffer_size)):
            results = []
            for batch in make_batches(inputs, args, task, max_positions,
                                      encode_fn):
                src_tokens = batch.src_tokens
                src_lengths = batch.src_lengths
                if use_cuda:
                    src_tokens = src_tokens.cuda()
                    src_lengths = src_lengths.cuda()

                sample = {
                    'net_input': {
                        'src_tokens': src_tokens,
                        'src_lengths': src_lengths,
                    },
                }
                translations = task.inference_step(generator, models, sample)
                for i, (id, hypos) in enumerate(
                        zip(batch.ids.tolist(), translations)):
                    src_tokens_i = utils.strip_pad(src_tokens[i],
                                                   tgt_dict.pad())
                    results.append((start_id + id, src_tokens_i, hypos))

            # sort output to match input order
            for id, src_tokens, hypos in sorted(results, key=lambda x: x[0]):
                if src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.remove_bpe)

                # process top predictions
                for hypo in hypos[:min(len(hypos), args.nbest)]:
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'].int().cpu()
                        if hypo['alignment'] is not None else None,
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.remove_bpe,
                    )
                    hypo_str = decode_fn(hypo_str)
                    writer.write(hypo_str + "\n")
                    n_translated += 1
                    # periodically release cached GPU memory
                    if n_translated % 1000 == 0:
                        torch.cuda.empty_cache()

            # update the running id counter inside the loop so ids stay aligned
            start_id += len(inputs)
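buffered_read is not shown in the source either; fairseq's interactive script has a helper of the same name that yields the input file in chunks of buffer_size lines, which matches how it is consumed above. A minimal sketch, assuming plain line-based buffering:

def buffered_read(input_path, buffer_size):
    # Yield the input file as successive lists of up to buffer_size lines.
    buffer = []
    with open(input_path, "r", encoding="utf-8") as f:
        for line in f:
            buffer.append(line.strip())
            if len(buffer) >= buffer_size:
                yield buffer
                buffer = []
    if buffer:
        yield buffer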