Example #1
 def benchmark(self, mode, batch_size):
     model_path = os.path.join(BROOT,
                               f'models/{self.dataset}-{self.model}-2.0')
     valid_data = os.path.join(BROOT,
                               get_valid_data(self.dataset, self.model))
     inference_bin = os.path.join(BROOT, 'build/inference')
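     # Run the compiled inference binary as a subprocess; every flag value is passed as a string.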
     ret = subprocess.run([
         inference_bin,
         '--logtostderr',
         '--model',
         model_path,
         '--data',
         valid_data,
         '--mode',
         mode,
         '--batch_size',
         str(batch_size),
         '--num_labels',
         str(get_num_labels(self.dataset)),
         '--seq_lens',
         str(self.seq_len),
         '--min_graph',
         str(self.args.min_graph),
         '--ignore_copy',
         str(self.args.ignore_copy),
     ],
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
     if ret.returncode != 0:
         print(ret.stderr.decode('ascii'))
         assert False, 'Prediction failed.'
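     # Parse stdout: the 'Sents/s' line reports throughput, every other line is a predicted label.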
     prediction = list()
     for line in ret.stdout.decode('ascii').splitlines():
         if line.startswith('Sents/s'):
             _, qps = line.split()
         else:
             prediction.append(int(line))
     prediction = np.asarray(prediction)
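     # Score predictions against the gold validation labels with the dataset-specific metric.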
     testcase = os.path.join(BROOT, get_valid_labels(self.dataset))
     labels = read_label(testcase)
     metric = get_metric(self.dataset)
     ret = metric(prediction, labels)
     stat = {'Sents/s': float(qps)}
     stat['metric_value'] = ret
     stat['metric'] = metric.__name__
     stat['batch_size'] = batch_size
     stat['dataset'] = self.dataset
     stat['model'] = self.model + '-2.0'
     stat['mode'] = self.mode
     if self.seq_len == 0:
         stat['seq_len'] = 'dynamic'
     else:
         stat['seq_len'] = self.seq_len
     return stat
Example #2
def main():
    parser = argparse.ArgumentParser(description='Pretraining argument parser')
    parser = load_pretrain_args(parser)
    parser = load_test_args(parser)
    args = parser.parse_args()

    set_seeds(args.seed)

    train_data = get_train_data()
    valid_data = get_valid_data()
    test_data = get_test_data()

    nnet = create_nnet(train_data, args)

    optimizer = Adam(nnet.parameters(), lr=args.lr)
    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()

    action_space = ActionSpace()

    tb = SummaryWriter()

    best_score = 0

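    # Pretraining loop: joint cross-entropy loss on policies and MSE loss on values per minibatch.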
    for epoch in range(1, args.update_epochs + 1):
        print(f'Epoch {epoch}')

        for indice in random_batch(len(train_data), args.train_batch_size):
            batch = train_data[indice]
            input_batch = to_input_batch(batch, torch.device('cuda'))

            policies, values = nnet(input_batch)

            target_policies = get_target_policies(batch, action_space).cuda()
            target_values = get_target_values(batch).cuda()

            policy_loss = ce_loss(policies, target_policies)
            value_loss = mse_loss(values, target_values)
            loss = policy_loss + value_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

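        # Evaluate on the validation split after each epoch and checkpoint the best model so far.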
        accuracy = test(valid_data, nnet, args, tb, epoch)

        if accuracy > best_score:
            best_score = accuracy
            torch.save(nnet.module.state_dict(), 'models/pretrained.pt')

    nnet.module.load_state_dict(torch.load('models/pretrained.pt'))

    test(test_data, nnet, args, tb, args.update_epochs + 1)
Example #3
def main():
    mp.set_start_method('spawn')
    mpp.Pool.istarmap = istarmap  # for tqdm

    parser = argparse.ArgumentParser(description='Training argument parser')
    parser = load_train_args(parser)
    parser = load_test_args(parser)
    args = parser.parse_args()

    set_seeds(args.seed)

    train_data = get_train_data()
    valid_data = get_valid_data()

    nnet = create_nnet(train_data, args)
    nnet.module.load_state_dict(torch.load(f'models/{args.load}'))
    nnets = create_nnets(train_data, args, n_nnets=torch.cuda.device_count())

    optimizer = Adam(nnet.parameters(), lr=args.lr)
    policy_loss_fn = nn.KLDivLoss(reduction='batchmean')
    value_loss_fn = nn.MSELoss()

    action_space = ActionSpace()

    train_examples = deque(maxlen=args.examples_len)

    tb = SummaryWriter()  # tensorboard writer

    epoch = 0
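    # Each iteration syncs nnet into the per-GPU copies, simulates to generate fresh training
    # examples, updates nnet on the accumulated example buffer, and evaluates on validation data.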
    while True:
        for indice in random_batch(len(train_data), args.train_batch_size):
            epoch += 1
            print(f'Epoch {epoch}')

            copy_nnet(nnet, nnets)  # nnet -> nnets

            curr_examples = simulate(train_data[indice], nnets, action_space,
                                     args)
            train_examples.extend(curr_examples)

            update_net(train_examples, nnet, optimizer, policy_loss_fn,
                       value_loss_fn, args, tb, epoch)

            test(valid_data, nnet, args, tb, epoch)
Example #4
        lexicon = random.sample(list(lexicon.difference(source_words)), additional)

        # load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))

    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    print("Translating")  # translates all the elements loaded in the source space
    mapped_source_sp = apply_tm_model(source_sp, tm)

    print("Retrieving translations")
    test_data = get_valid_data(source_sp, target_sp, test_data)

    # turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    score(mapped_source_sp, target_sp, gold, additional)

    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
    np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")


if __name__ == '__main__':
    main(sys.argv)
Example #5
        lexicon = random.sample(list(lexicon.difference(source_words)), additional)

        # load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))

    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    print("Translating")  # translates all the elements loaded in the source space
    mapped_source_sp = apply_tm(source_sp, tm)

    print("Retrieving translations")
    test_data = get_valid_data(source_sp, target_sp, test_data)

    # turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    score(mapped_source_sp, target_sp, gold, additional)

    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
    np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")


if __name__ == '__main__':
    main(sys.argv)
Example #6
def main(sys_argv):
    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:c:l:m:1:2:t:a:v:", [
            "help", "output=", "correction=", "levenshtein=", "matrix=", "1=",
            "2=", "topK=", "alpha=", "verbosity="
        ])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./translated_vecs"
    additional = None
    levcosts = {}
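    # Walk the parsed options; -1/-2 give the source and target space files, -m the translation matrix.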
    for opt, val in opts:
        # print(opt+'='+val)
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-l", "--levenshtein"):
            levcosts = u.readcosts(val)
        elif opt in ("-m", "--matrix"):
            tm_file = val
        elif opt == '-1':
            source_file = val
        elif opt == '-2':
            target_file = val
        elif opt in ("-c", "--correction"):
            try:
                additional = int(val)
            except ValueError:
                print("additional: %s" % val)
                usage(1)
        elif opt in ("-t", "--topK"):
            try:
                u.topK = int(val)
            except ValueError:
                print("topK: %s" % val)
                usage(1)
        elif opt in ("-v", "--verbosity"):
            try:
                u.verbosity = int(val)
            except ValueError:
                print("verbosity: %s" % val)
                usage(1)
        elif opt in ("-a", "--alpha"):
            try:
                u.alpha = float(val)
            except ValueError:
                print("alpha: %s" % val)
                usage(1)
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            print("Unknown option: -%s %s" % (opt, val))
            usage(1)

    if len(argv) == 1:
        test_file = argv[0]
    else:
        print('Unused arguments:')
        print(argv)
        usage(1)

    #if u.verbosity>0: # always log the parameters in the output
    sys.stdout.write(sys_argv[0] + " ")
    for opt, val in opts:
        sys.stdout.write(opt + " " + val + " ")
    print(test_file)

    if u.verbosity > 1:
        print("Loading the translation matrix %s " % tm_file)
    tm = np.loadtxt(tm_file)

    if u.verbosity > 1:
        print("Reading the test data %s " % test_file)
    test_data = u.read_dict(test_file)

    #in the _source_ space, we only need to load vectors for the words in test.
    #semantic spaces may contain additional words, ALL words in the _target_
    #space are used as the search space
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    if u.verbosity > 1:
        print("Reading: %s" % source_file)

    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        #read all the words in the space
        with io.open(source_file, 'r', encoding='utf8') as f:
            lexicon = set([l.split(' ')[0] for l in f])
        # lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str,
        #                             comments=None, usecols=(0,)).flatten())
        #the max number of additional+test elements is bounded by the size
        #of the lexicon
        additional = min(additional, len(lexicon) - len(source_words))
        #we sample additional elements that are not already in source_words
        random.seed(100)
        if additional > 0:
            lexicon = random.sample(list(lexicon.difference(source_words)),
                                    additional)

        #load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))

    source_sp.normalize()

    if u.verbosity > 1:
        print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    if u.verbosity > 1:
        print("Retrieving translations")
    test_data = u.get_valid_data(source_sp, target_sp, test_data)

    # turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    if u.verbosity > 1:
        print("Translating")  # translates all the elements loaded in the source space
    source_sp = u.apply_tm(source_sp, tm)

    u.score(source_sp, target_sp, gold, additional, levcosts)
    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, source_sp.mat)
    #    np.savetxt("%s.wds.txt" % out_file, source_sp.id2row, fmt="%s")  # no utf8
    with open("%s.wds.txt" % out_file, "w") as outf:
        for s in source_sp.id2row:
            print(s, file=outf)
Example #7
def main(sys_argv):

    try:
        opts, argv = getopt.getopt(sys_argv[1:], "ho:c:",
                                   ["help", "output=", "correction="])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(1)

    out_file = "./translated_vecs"
    additional = None
    for opt, val in opts:
        if opt in ("-o", "--output"):
            out_file = val
        elif opt in ("-c", "--correction"):
            try:
                additional = int(val)
            except ValueError:
                usage(1)
        elif opt in ("-h", "--help"):
            usage(0)
        else:
            usage(1)

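    # Exactly four positional arguments are expected: translation matrix, test dictionary,
    # source space file, and target space file.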
    if len(argv) == 4:
        tm_file = argv[0] 
        test_file = argv[1]
        source_file = argv[2]	
        target_file = argv[3]
    else:
#        print(str(err))
        usage(1)

    print("Loading the translation matrix")
    tm = np.loadtxt(tm_file)

    print("Reading the test data")
    test_data = read_dict(test_file)

    #in the _source_ space, we only need to load vectors for the words in test.
    #semantic spaces may contain additional words, ALL words in the _target_ 
    #space are used as the search space
    source_words, _ = zip(*test_data)
    source_words = set(source_words)

    print("Reading: %s" % source_file)
    if not additional:
        source_sp = Space.build(source_file, source_words)
    else:
        #read all the words in the space
        lexicon = set(np.loadtxt(source_file, skiprows=1, dtype=str, 
                                    comments=None, usecols=(0,)).flatten())
        #the max number of additional+test elements is bounded by the size 
        #of the lexicon
        additional = min(additional, len(lexicon) - len(source_words))
        #we sample additional elements that are not already in source_words
        random.seed(100)
        lexicon = random.sample(list(lexicon.difference(source_words)), additional)
        
        #load the source space
        source_sp = Space.build(source_file, source_words.union(set(lexicon)))
    
    source_sp.normalize()

    print("Reading: %s" % target_file)
    target_sp = Space.build(target_file)
    target_sp.normalize()

    print("Translating")  # translates all the elements loaded in the source space
    mapped_source_sp = apply_tm(source_sp, tm)
    
    print("Retrieving translations")
    test_data = get_valid_data(source_sp, target_sp, test_data)

    # turn test data into a dictionary (a word can have multiple translations)
    gold = collections.defaultdict(set)
    for k, v in test_data:
        gold[k].add(v)

    score(mapped_source_sp, target_sp, gold, additional)

    print("Printing mapped vectors: %s" % out_file)
    np.savetxt("%s.vecs.txt" % out_file, mapped_source_sp.mat)
    np.savetxt("%s.wds.txt" % out_file, mapped_source_sp.id2row, fmt="%s")