Exemplo n.º 1
0
    def PCFP(self):
        """Fill every unattested (``None``) cell of ``self.wf_grid`` by prediction.

        PCFP presumably stands for "Paradigm Cell Filling Problem" -- TODO
        confirm. The procedure is:

        1. Dump the initial grid to ``initial_grid.txt`` for debugging.
        2. For every row with at least two attested cells, write one
           ``<src> form <trg>\\tform`` instance per ordered pair of attested
           cells; roughly one in ten instances goes to the dev file, the
           rest to the train file.
        3. Run ``seq2seq_runner`` on that data.
        4. Read ``predictions_dev.txt`` and, for each target column, rank the
           other columns by their (add-one-smoothed) dev accuracy as sources.
        5. For every empty cell in a row that has at least one attested cell,
           write one test instance whose source is either a random attested
           cell (``self.baseline == 'random_src'``) or the best-ranked
           attested one.
        6. Run ``seq2seq_runner`` again from the best checkpoint on the test
           file and copy its ``PRD:`` lines into the empty cells, in order.

        Returns:
            A numpy array built from ``self.wf_grid`` in which the empty cells
            that had a test instance hold the predicted forms.

        Raises:
            IndexError: if the predictions file holds more ``PRD:`` lines than
                test instances were written.
            Exception: if fewer predictions than test instances were read.
        """

        def format_instance(src_col_idx, src_wf, trg_col_idx, target):
            # One MT-style line: tagged, space-separated source characters,
            # the target-cell tag, a tab, and the target side.
            return '<{}> {} <{}>\t{}'.format(
                src_col_idx, ' '.join(src_wf), trg_col_idx, target)

        def write_grid(path, values):
            # Tab-separated dump of `values`; cells that are unattested in
            # the ORIGINAL grid are wrapped in <<...>> so they stand out.
            with open(path, 'w') as grid_file:
                for row_idx in range(self.r):
                    cells = []
                    for col_idx in range(self.c):
                        cell = values[row_idx][col_idx]
                        if self.wf_grid[row_idx][col_idx] is None:
                            cell = '<<{}>>'.format(cell)
                        cells.append(cell)
                    grid_file.write('{}\n'.format('\t'.join(cells)))

        # 1. Open a new sub directory for model-to-be-trained
        # NOTE(review): only the parent directory is created here; the sub
        # directory itself is presumably created by Inflector or the runner
        # -- confirm. The parent must exist for the files written below.
        ext_model_location = os.path.join(self.model_location,
                                          'Extrinsic_grid_completion')
        if not os.path.isdir(self.model_location):
            os.makedirs(self.model_location)

        # 1.5 Initialize the model
        extrinsic_inflection = Inflector(ext_model_location,
                                         data_format=dataloader.DataFormat.MT,
                                         extrinsic=True)
        extrinsic_inflection.train = os.path.join(self.model_location,
                                                  'ext_train.tsv')
        extrinsic_inflection.dev = os.path.join(self.model_location,
                                                'ext_dev.tsv')

        # 1.75. Write out the initial grid for debugging
        write_grid(os.path.join(self.model_location, 'initial_grid.txt'),
                   self.wf_grid)
        stderr.write('Finished writing out initial grid.\n')

        # 2. Write out all train and dev instances
        with open(extrinsic_inflection.train, 'w') as train_file, \
                open(extrinsic_inflection.dev, 'w') as dev_file:
            for row_idx in range(self.r):
                row, _wfs, col_idxs = self.get_row(row_idx)
                # A row needs two attested cells to yield a (source, target)
                # pair.
                if len(col_idxs) <= 1:
                    continue
                for trg_col_idx in col_idxs:
                    trg_side = ' '.join(row[trg_col_idx])
                    for src_col_idx in col_idxs:
                        if src_col_idx == trg_col_idx:
                            continue
                        instance = format_instance(
                            src_col_idx, row[src_col_idx], trg_col_idx,
                            trg_side)
                        # One value out of ten sends the instance to dev.
                        if random.choice(range(10)) == 4:
                            dev_file.write('{}\n'.format(instance))
                        else:
                            train_file.write('{}\n'.format(instance))

        # 3. Run the model from scratch
        extrinsic_inflection.patience = 12
        trained_model = seq2seq_runner.run(extrinsic_inflection)

        # 4. Read in Dev predictions and rank best source cells for each
        #    target cell. Each entry is [correct, total]; total starts at 1,
        #    so unseen pairs score 0 and the division below is always safe.
        pair_counts = {
            trg_col_idx: {
                src_col_idx: [0, 1]
                for src_col_idx in range(self.c)
                if src_col_idx != trg_col_idx
            }
            for trg_col_idx in range(self.c)
        }
        preds = os.path.join(ext_model_location, 'predictions_dev.txt')
        # `error` reflects the line read just BEFORE the current one: a SRC
        # line counts as correct unless the preceding line held '*ERROR*'.
        error = False
        with open(preds) as pred_file:
            for line in pred_file:
                line = line.strip()
                if line.startswith('SRC: '):
                    # First <...> tag is the source cell, last is the target.
                    src_col_idx = int(line.split('<', 1)[1].split('>', 1)[0])
                    trg_col_idx = int(line.split('<')[-1].split('>')[0])
                    counts = pair_counts[trg_col_idx][src_col_idx]
                    counts[1] += 1
                    if not error:
                        counts[0] += 1
                error = '*ERROR*' in line

        trg_2_src_acc = {
            trg_col_idx: {
                src_col_idx: correct / total
                for src_col_idx, (correct, total) in by_src.items()
            }
            for trg_col_idx, by_src in pair_counts.items()
        }

        trg_2_best_srcs = {}
        for trg_col_idx in range(self.c):
            src_acc = trg_2_src_acc[trg_col_idx]
            # Stable sort: ties keep ascending column order.
            ranked = sorted(src_acc, key=src_acc.get, reverse=True)
            trg_2_best_srcs[trg_col_idx] = ranked
            stderr.write('Best Predictors for cell {}:\n'.format(trg_col_idx))
            for best_src in ranked:
                stderr.write('\t{} ({})\n'.format(best_src,
                                                  src_acc[best_src]))

        # 5. Write out test set trying to predict each unattested cell from
        #    its best available predictor
        ext_model_location = os.path.join(self.model_location,
                                          'Extrinsic_grid_completion_final')
        if not os.path.isdir(self.model_location):
            os.makedirs(self.model_location)
        extrinsic_inflection = Inflector(ext_model_location,
                                         data_format=dataloader.DataFormat.MT,
                                         extrinsic=True)
        extrinsic_inflection.train = None
        extrinsic_inflection.dev = None
        extrinsic_inflection.test = os.path.join(self.model_location,
                                                 'ext_test.tsv')
        extrinsic_inflection.checkpoint_to_restore = trained_model.best_checkpoint_path

        # (row, col) of every cell a test instance was written for, in file
        # order, so predictions can be matched back positionally.
        empty_slots = []
        with open(extrinsic_inflection.test, 'w') as test_file:
            for row_idx in range(self.r):
                row, _wfs, col_idxs = self.get_row(row_idx)
                # Skip complete rows and rows with nothing to predict from.
                if not (None in row and len(col_idxs) > 0):
                    continue
                for trg_col_idx in range(self.c):
                    if trg_col_idx in col_idxs:
                        continue
                    # Make one prediction for every empty cell in grid
                    if self.baseline == 'random_src':
                        src_col_idx = random.choice(col_idxs)
                    else:
                        for src_col_idx in trg_2_best_srcs[trg_col_idx]:
                            if src_col_idx in col_idxs:
                                break
                        else:
                            # Fail loudly instead of silently reusing a
                            # stale or unattested source column.
                            raise ValueError(
                                'No attested source cell for row {} '
                                'column {}'.format(row_idx, trg_col_idx))
                    instance = format_instance(
                        src_col_idx, row[src_col_idx], trg_col_idx,
                        'PredictMe')
                    test_file.write('{}\n'.format(instance))
                    empty_slots.append((row_idx, trg_col_idx))

        # 6. Run again with the restored checkpoint to get predictions for
        #    the test file (train and dev are unset for this run).
        seq2seq_runner.run(extrinsic_inflection)

        # 7. Parse the predictions file
        full_grid = np.array(self.wf_grid)
        preds = os.path.join(ext_model_location, 'predictions_test.txt')
        next_slot = 0  # index of the next unfilled entry of empty_slots
        with open(preds) as pred_file:
            for line in pred_file:
                line = line.strip()
                if not line.startswith('PRD:'):
                    continue
                # IndexError here means more predictions than test instances.
                row_idx, col_idx = empty_slots[next_slot]
                next_slot += 1
                # Drop the spaces between characters; '_' encodes a space.
                pred = ''.join(line.split(':', 1)[1].split()).replace('_', ' ')
                assert self.wf_grid[row_idx][col_idx] is None
                full_grid[row_idx][col_idx] = pred
        empty_slots = empty_slots[next_slot:]

        # 8. Write out the completed grid for debugging (before the alignment
        #    check, so the dump exists even when the check fails)
        write_grid(os.path.join(ext_model_location, 'pred_full_grid.txt'),
                   full_grid)

        if len(empty_slots) != 0:
            raise Exception(
                '{}\n\nHow did test instances and predictions get misaligned!?\n\t{}\n\t{}'
                .format(str(full_grid), len(empty_slots),
                        '\n\t'.join(list(str(x) for x in empty_slots))))

        return full_grid
Exemplo n.º 2
0
 if to_train:
     augmentation_params = params.get("augmentation")
     if augmentation_params is not None:
         suffix = int(augmentation_params["n"])
         generation_params = augmentation_params.get("params", dict())
         augment_file = "augmented/{}-{}-{}".format(language, mode, suffix)
         if os.path.exists(augment_file) and generation_params.get("to_load", True):
             auxiliary_data = read_infile(augment_file)
         else:
             gen_params = copy.copy(generation_params)
             gen_params.pop("to_load")
             auxiliary_data = generate_auxiliary(data, dev_data, suffix, augment_lm_file,
                                                 augment_file, **gen_params)
     else:
         auxiliary_data = None
     inflector.train(data, dev_data=dev_data, augmented_data=auxiliary_data, save_file=save_file)
 if use_paradigms:
     paradigm_checker = ParadigmChecker().train(data)
 if to_rerank_with_lm:
     forward_save_file = "language_models/{}-{}.json".format(language, mode)
     forward_lm = load_lm(forward_save_file) if os.path.exists(forward_save_file) else None
     reverse_save_file = "language_models/reverse-{}-{}.json".format(language, mode)
     reverse_lm = load_lm(reverse_save_file) if os.path.exists(reverse_save_file) else None
     lm_ranker = LmRanker(forward_lm, reverse_lm, to_rerank=(to_rerank_with_lm == "rerank"))
 if to_test:
     alignment_data = [elem[:2] for elem in data]
     # inflector.evaluate(test_data[:20], alignment_data=alignment_data)
     # sys.exit()
     answer = inflector.predict(test_data, **params["predict"])
     # if use_paradigms:
     #     data_to_filter = [(elem[0], elem[2]) for elem in test_data]