예제 #1
0
    def test_eval(self):
        """Check the AddPreviousActions output for ``y = x + 1`` (train=False)."""
        source = "y = x + 1"
        environment = Environment({"ground_truth": source}, {"ground_truth"})
        samples = get_samples(ListDataset([environment]), MockParser())
        encoder = ActionSequenceEncoder(samples, 0)
        to_action_sequence = GroundTruthToActionSequence(MockParser())
        actions = to_action_sequence(source)

        add_previous_actions = AddPreviousActions(encoder)
        reference = [Token(None, word, word) for word in ("foo", "bar")]
        encoded = add_previous_actions(
            reference=reference,
            action_sequence=actions,
            train=False,
        )

        # One row per step of the encoded sequence, three columns each.
        expected = [
            [2, -1, -1],
            [3, -1, -1],
            [4, -1, -1],
            [-1, 1, -1],
            [1, -1, -1],
            [5, -1, -1],
            [-1, 2, -1],
            [1, -1, -1],
            [4, -1, -1],
            [-1, 3, -1],
            [1, -1, -1],
            [6, -1, -1],
            [-1, 4, -1],
            [1, -1, -1],
        ]
        assert np.array_equal(expected, encoded.numpy())
예제 #2
0
 def test_simple_case(self):
     """Check the EncodeActionSequence output for ``y = x + 1``."""
     source = "y = x + 1"
     environment = Environment({"ground_truth": source}, {"ground_truth"})
     samples = get_samples(ListDataset([environment]), MockParser())
     encoder = ActionSequenceEncoder(samples, 0)
     to_action_sequence = GroundTruthToActionSequence(MockParser())
     actions = to_action_sequence(ground_truth=source)

     encode = EncodeActionSequence(encoder)
     reference = [Token(None, word, word) for word in ("foo", "bar")]
     encoded = encode(
         action_sequence=actions,
         reference=reference,
     )

     # One row per encoded action, three columns each.
     expected = [
         [3, -1, -1],
         [4, -1, -1],
         [-1, 1, -1],
         [1, -1, -1],
         [5, -1, -1],
         [-1, 2, -1],
         [1, -1, -1],
         [4, -1, -1],
         [-1, 3, -1],
         [1, -1, -1],
         [6, -1, -1],
         [-1, 4, -1],
         [1, -1, -1],
     ]
     assert np.array_equal(expected, encoded.numpy())
예제 #3
0
 def test_eval(self):
     """Check depth and matrix from AddActionSequenceAsTree (train=False)."""
     source = "y = x + 1"
     environment = Environment(
         {"text_query": "ab test", "ground_truth": source},
         {"ground_truth"},
     )
     samples = get_samples(
         ListDataset([environment]), MockParserWithoutVariadicArgs())
     encoder = ActionSequenceEncoder(samples, 0)
     to_action_sequence = GroundTruthToActionSequence(
         MockParserWithoutVariadicArgs())
     actions = to_action_sequence(source)

     as_tree = AddActionSequenceAsTree(encoder)
     reference = [Token(None, word, word) for word in ("ab", "test")]
     matrix, depth = as_tree(
         reference=reference,
         action_sequence=actions,
         train=False,
     )

     expected_depth = [0, 1, 2, 3, 2, 3, 3, 4, 3, 4]
     assert np.array_equal(expected_depth, depth.numpy())

     # 10x10 matrix of 0/1 entries expected from the transform.
     expected_matrix = [
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 1, 0, 1, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
     ]
     assert np.array_equal(expected_matrix, matrix.numpy())
예제 #4
0
 def test_eval(self):
     """Check the AddQueryForTreeGenDecoder output (train=False)."""
     source = "y = x + 1"
     environment = Environment(
         {"text_query": "ab test", "ground_truth": source},
         {"ground_truth"},
     )
     samples = get_samples(
         ListDataset([environment]), MockParserWithoutVariadicArgs())
     encoder = ActionSequenceEncoder(samples, 0)
     to_action_sequence = GroundTruthToActionSequence(
         MockParserWithoutVariadicArgs())
     actions = to_action_sequence(source)

     add_query = AddQueryForTreeGenDecoder(encoder, 3)
     reference = [Token(None, word, word) for word in ("ab", "test")]
     encoded = add_query(
         reference=reference,
         action_sequence=actions,
         train=False,
     )

     # One row per action; three columns each, -1 where nothing is set.
     expected = [
         [-1, -1, -1],
         [2, -1, -1],
         [3, 2, -1],
         [4, 3, 2],
         [3, 2, -1],
         [5, 3, 2],
         [5, 3, 2],
         [4, 5, 3],
         [5, 3, 2],
         [6, 5, 3],
     ]
     assert np.array_equal(expected, encoded.numpy())
예제 #5
0
 def test_n_dependent(self):
     """Check AddPreviousActionRules output when ``n_dependent=3``."""
     source = "y = x + 1"
     environment = Environment(
         {"text_query": "ab test", "ground_truth": source},
         {"ground_truth"},
     )
     samples = get_samples(
         ListDataset([environment]), MockParserWithoutVariadicArgs())
     encoder = ActionSequenceEncoder(samples, 0)
     to_action_sequence = GroundTruthToActionSequence(
         MockParserWithoutVariadicArgs())
     actions = to_action_sequence(source)

     add_rules = AddPreviousActionRules(encoder, 2, n_dependent=3)
     reference = [Token(None, word, word) for word in ("ab", "test")]
     encoded = add_rules(
         reference=reference,
         action_sequence=actions,
         train=False,
     )

     expected = [
         # str -> "y"
         [[-1, -1, -1], [-1, 3, -1], [-1, -1, -1]],
         # Number -> number
         [[8, -1, -1], [9, -1, -1], [-1, -1, -1]],
         [[-1, -1, -1], [-1, 4, -1], [-1, -1, -1]],
     ]
     assert np.array_equal(expected, encoded.numpy())
예제 #6
0
def download(cache_path: str = os.path.join(DEFAULT_CACHE_DIR, "django.pt"),
             base_path: str = BASE_PATH,
             get: Callable[[str], str] = default_get,
             num_train: int = 16000, num_test: int = 1000) \
        -> Dict[str, ListDataset]:
    """Fetch the annotation/code pairs and split them into datasets.

    Args:
        cache_path: Path handed to ``file_cache`` for the fetched raw data
            (presumably an on-disk cache; see ``file_cache``).
        base_path: Prefix to which ``"all.anno"`` and ``"all.code"`` are
            appended to build the two locations passed to ``get``.
        get: Callable that returns the text content of a location.
        num_train: Number of leading samples placed in the "train" split.
        num_test: Number of samples after "train" placed in the "test" split.

    Returns:
        A dict with the keys "train", "test" and "valid". "valid" holds all
        samples remaining after the train and test samples.
    """

    @file_cache(cache_path)
    def _download():
        # Honor the caller-supplied ``base_path`` rather than the
        # module-level default, so alternative mirrors/fixtures work.
        return {
            "annotation": format_annotations(
                get(base_path + "all.anno").split("\n")),
            "code": get(base_path + "all.code").split("\n")
        }

    raw = _download()

    def to_sample(elem: Tuple[str, str]) -> Environment:
        anno, snippet = elem
        return Environment(
            {
                "text_query": anno,
                "ground_truth": snippet
            },
            set(["ground_truth"])
        )

    # zip() stops at the shorter of the two lists.
    samples = [to_sample(pair)
               for pair in zip(raw["annotation"], raw["code"])]

    test_end = num_train + num_test
    splits = {
        "train": samples[:num_train],
        "test": samples[num_train:test_end],
        "valid": samples[test_end:]
    }

    return {key: ListDataset(value) for key, value in splits.items()}
예제 #7
0
    def test_eval(self):
        """Check the AddActions output for ``y = x + 1`` (train=False)."""
        source = "y = x + 1"
        environment = Environment(
            {"text_query": "foo bar", "ground_truth": source},
            {"ground_truth"},
        )
        samples = get_samples(ListDataset([environment]), MockParser())
        encoder = ActionSequenceEncoder(samples, 0)
        to_action_sequence = GroundTruthToActionSequence(MockParser())
        actions = to_action_sequence(source)

        add_actions = AddActions(encoder)
        reference = [Token(None, word, word) for word in ("foo", "bar")]
        encoded = add_actions(
            reference=reference,
            action_sequence=actions,
            train=False,
        )

        # One row per step; the last row is all -1.
        expected = [
            [2, 2, 0],
            [4, 3, 1],
            [6, 4, 2],
            [6, 4, 2],
            [5, 3, 1],
            [6, 5, 5],
            [6, 5, 5],
            [5, 5, 5],
            [6, 4, 8],
            [6, 4, 8],
            [5, 5, 5],
            [9, 6, 11],
            [9, 6, 11],
            [-1, -1, -1],
        ]
        assert np.array_equal(expected, encoded.numpy())
예제 #8
0
def dataset():
    """Return a one-element ListDataset holding a query/ground_truth pair."""
    sample = Environment(
        {"query": "query", "ground_truth": "name0"},
        {"ground_truth"},
    )
    return ListDataset([sample])
예제 #9
0
    def test_multiprocess(self):
        """Run ``self._run`` in two pool workers and compare their results."""
        accuracy = use_environment(
            Accuracy(),
            in_keys=["actual", ["ground_truth", "expected"]],
            value_key="actual",
        )
        dataset = ListDataset([
            Environment(
                {"query": f"query{idx}", "ground_truth": "c0"},
                {"ground_truth"},
            )
            for idx in range(3)
        ])

        with tempfile.TemporaryDirectory() as init_dir, \
                context.Pool(2) as pool:
            pending = [
                pool.apply_async(
                    self._run,
                    args=(init_dir, dataset, {"accuracy": accuracy}, rank),
                )
                for rank in range(2)
            ]
            out = [job.get() for job in pending]
        first, second = out[0], out[1]

        # Both workers must report identical results.
        assert first == second

        results = first
        assert results.metrics == {
            1: {"accuracy": 1.0 / 3},
            3: {"accuracy": 2.0 / 3},
        }
        assert 3 == len(results.results)
        # Zero the timings so the equality checks below are deterministic.
        for entry in results.results:
            entry.time = 0.0
        results.results.sort(key=lambda entry: entry.sample["query"])

        # (query, generated candidates, top-1 accuracy, top-3 accuracy)
        expectations = [
            ("query0", ["c0", "c1", "c2"], 1.0, 1.0),
            ("query1", ["c2", "c3", "c0"], 0.0, 1.0),
            ("query2", ["c2", "c3", "c5"], 0.0, 0.0),
        ]
        for position, (query, candidates, top1, top3) in \
                enumerate(expectations):
            expected = Result(
                {"query": query, "ground_truth": "c0"},
                candidates,
                {1: {"accuracy": top1}, 3: {"accuracy": top3}},
                True, 0.0)
            assert expected == results.results[position]
예제 #10
0
def download(cache_path: str = os.path.join(
    DEFAULT_CACHE_DIR, "nl2bash.pt")) -> Dict[str, ListDataset]:
    """Build the nl2bash data with its own tooling and load the splits.

    A helper bash script is written to a temporary directory and executed.
    It clones the nl2bash repository, installs its requirements into a
    fresh virtualenv and runs its ``make`` targets. The resulting
    ``<split>.nl.filtered`` / ``<split>.cm.filtered`` files are then read
    line by line and paired up.

    Args:
        cache_path: Path handed to ``file_cache`` for the loaded samples
            (presumably an on-disk cache; see ``file_cache``).

    Returns:
        A dict with the keys "train", "test" and "valid", each a
        ``ListDataset`` of ``Environment`` objects holding "text_query" and
        "ground_truth". Lines are stored exactly as read, including their
        trailing newline.
    """
    @file_cache(cache_path)
    def _download():
        logger.info("Download nl2bash dataset")
        with tempfile.TemporaryDirectory() as tmpdir:
            script = os.path.join(tmpdir, "download.bash")
            with open(script, "w") as file:
                file.write("""
#! /bin/env bash
tmpdir=$1

python -m venv $tmpdir/env
source $tmpdir/env/bin/activate

git clone --depth 1 https://github.com/TellinaTool/nl2bash $tmpdir/nl2bash
pip install tensorflow
pip install -r $tmpdir/nl2bash/requirements.txt
make -C $tmpdir/nl2bash
set +u
export PYTHONPATH=$tmpdir/nl2bash:$PYTHONPATH
set -u
make -C $tmpdir/nl2bash/scripts data
""")
            completed = subprocess.run(["bash", script, tmpdir])
            if completed.returncode != 0:
                # Do not fail hard here: the data files may still have been
                # produced. Surface the status so that a later
                # FileNotFoundError is easy to diagnose.
                logger.warning(
                    "nl2bash download script exited with status %d",
                    completed.returncode)

            data_dir = os.path.join(tmpdir, "nl2bash", "data", "bash")

            def read_lines(filename: str):
                with open(os.path.join(data_dir, filename),
                          encoding="utf-8") as file:
                    return file.readlines()

            def load(name: str):
                queries = read_lines(f"{name}.nl.filtered")
                commands = read_lines(f"{name}.cm.filtered")
                return [
                    Environment(
                        {
                            "text_query": query,
                            "ground_truth": command
                        }, set(["ground_truth"]))
                    for query, command in zip(queries, commands)
                ]

            # The "dev" files are the validation split and the "test" files
            # are the test split.
            dataset = {
                "train": load("train"),
                "test": load("test"),
                "valid": load("dev"),
            }
        return dataset

    dataset = _download()
    return {key: ListDataset(value) for key, value in dataset.items()}
예제 #11
0
 def __init__(self,
              dataset: torch.utils.data.Dataset,
              synthesizer: Synthesizer[Environment, Code],
              metrics: Mapping[str, Callable[[Environment, Code], float]],
              top_n: List[int] = [1, 3],
              n_samples: Optional[int] = None):
     """Store the evaluation configuration.

     Args:
         dataset: Dataset to evaluate on; it is indexed with integers when
             ``n_samples`` is given.
         synthesizer: Synthesizer stored on the instance.
         metrics: Mapping from metric name to metric callable.
         top_n: Values of n stored on the instance. A copy is stored, so
             neither the shared default list nor a caller's list is
             aliased by the instance.
         n_samples: When not None, only the first ``n_samples`` items of
             ``dataset`` are kept, wrapped in a ``ListDataset``.
     """
     super().__init__()
     self.dataset = dataset
     if n_samples is not None:
         self.dataset = ListDataset(
             [self.dataset[i] for i in range(n_samples)])
     self.synthesizer = synthesizer
     self.metrics = metrics
     # Copy: the default argument is one list object shared by every call.
     self.top_n = list(top_n)
예제 #12
0
 def test_impossible_case(self):
     """EncodeActionSequence raises RuntimeError with a reduced token list."""
     source = "y = x + 1"
     environment = Environment({"ground_truth": source}, {"ground_truth"})
     samples = get_samples(ListDataset([environment]), MockParser())
     # Overwrite the collected tokens before the encoder is built.
     samples.tokens = [("", "y"), ("", "1")]
     encoder = ActionSequenceEncoder(samples, 0)
     to_action_sequence = GroundTruthToActionSequence(MockParser())
     actions = to_action_sequence(ground_truth=source)

     encode = EncodeActionSequence(encoder)
     with pytest.raises(RuntimeError):
         encode(
             reference=[Token(None, word, word) for word in ("foo", "bar")],
             action_sequence=actions,
         )
예제 #13
0
    def test_simple_case(self):
        """Evaluate three queries and check aggregate and per-sample results."""
        accuracy = use_environment(
            Accuracy(),
            in_keys=["actual", ["ground_truth", "expected"]],
            value_key="actual",
        )
        dataset = ListDataset([
            Environment(
                {"query": f"query{idx}", "ground_truth": "c0"},
                {"ground_truth"},
            )
            for idx in range(3)
        ])
        evaluate = EvaluateSynthesizer(
            dataset, synthesize, metrics={"accuracy": accuracy})
        results = evaluate()

        assert results.metrics == {
            1: {"accuracy": 1.0 / 3.0},
            3: {"accuracy": 2.0 / 3.0},
        }
        assert 3 == len(results.results)
        # Zero the timings so the equality checks below are deterministic.
        for entry in results.results:
            entry.time = 0.0

        # (query, generated candidates, top-1 accuracy, top-3 accuracy)
        expectations = [
            ("query0", ["c0", "c1", "c2"], 1.0, 1.0),
            ("query1", ["c2", "c3", "c0"], 0.0, 1.0),
            ("query2", ["c2", "c3", "c5"], 0.0, 0.0),
        ]
        for position, (query, candidates, top1, top3) in \
                enumerate(expectations):
            expected = Result(
                {"query": query, "ground_truth": "c0"},
                candidates,
                {1: {"accuracy": top1}, 3: {"accuracy": top3}},
                True, 0.0)
            assert expected == results.results[position]
예제 #14
0
 def test_eval(self):
     """Check the AddPreviousActionRules output (train=False)."""
     source = "y = x + 1"
     environment = Environment(
         {"text_query": "ab test", "ground_truth": source},
         {"ground_truth"},
     )
     samples = get_samples(
         ListDataset([environment]), MockParserWithoutVariadicArgs())
     encoder = ActionSequenceEncoder(samples, 0)
     to_action_sequence = GroundTruthToActionSequence(
         MockParserWithoutVariadicArgs())
     actions = to_action_sequence(source)

     add_rules = AddPreviousActionRules(encoder, 2)
     reference = [Token(None, word, word) for word in ("ab", "test")]
     encoded = add_rules(
         reference=reference,
         action_sequence=actions,
         train=False,
     )

     unset = [-1, -1, -1]
     expected = [
         # None -> Root
         [[1, -1, -1], [2, -1, -1], unset],
         # Assign -> Name, expr
         [[3, -1, -1], [4, -1, -1], [5, -1, -1]],
         # Name -> str
         [[4, -1, -1], [6, -1, -1], unset],
         # str -> "x"
         [unset, [-1, 1, -1], unset],
         # Op -> str, expr, expr
         [[7, -1, -1], [6, -1, -1], [5, -1, -1]],
         # str -> "+"
         [unset, [-1, 2, -1], unset],
         # Name -> str
         [[4, -1, -1], [6, -1, -1], unset],
         # str -> "y"
         [unset, [-1, 3, -1], unset],
         # Number -> number
         [[8, -1, -1], [9, -1, -1], unset],
         [unset, [-1, 4, -1], unset],
     ]
     assert np.array_equal(expected, encoded.numpy())
예제 #15
0
def download(cache_path: str = os.path.join(DEFAULT_CACHE_DIR, "deepfix.pt"),
             path: str = BASE_PATH,
             get: Callable[[str, str], None] = default_get) \
        -> ListDataset:
    """Fetch the DeepFix archive and load its ``Code`` table as a dataset.

    The archive is fetched into a temporary directory, the gzipped SQLite
    database inside it is extracted and decompressed, and every row of the
    ``Code`` table becomes one ``Environment``.

    Args:
        cache_path: Path handed to ``file_cache`` for the loaded samples
            (presumably an on-disk cache; see ``file_cache``).
        path: Location of the zip archive, passed to ``get``.
        get: Callable ``get(src, dst)`` that stores the archive at ``dst``.

    Returns:
        A ``ListDataset`` of ``Environment`` objects with the keys "code",
        "error" and "n_error".
    """
    # Zip member names always use "/" regardless of the host OS, so this
    # must not be built with os.path.join.
    member = ("prutor-deepfix-09-12-2017/"
              "prutor-deepfix-09-12-2017.db.gz")

    @file_cache(cache_path)
    def _download():
        logger.info("Download DeepFix dataset")
        logger.debug(f"Dataset path: {path}")
        samples = []
        with tempfile.TemporaryDirectory() as tmpdir:
            dst = os.path.join(tmpdir, "dataset.zip")
            get(path, dst)

            # Step 1: pull the gzipped database out of the zip archive.
            gzipfile = os.path.join(tmpdir, "dataset.gz")
            with zipfile.ZipFile(dst) as z:
                with z.open(member, "r") as file, \
                        open(gzipfile, "wb") as dst_file:
                    copyfileobj(file, dst_file)

            # Step 2: decompress it into a plain SQLite file.
            sqlitefile = os.path.join(tmpdir, "dataset.db")
            with gzip.open(gzipfile, "rb") as src_file, \
                    open(sqlitefile, "wb") as dst_file:
                copyfileobj(src_file, dst_file)

            # Step 3: read all rows. The connection is closed explicitly so
            # the database file is released before the temporary directory
            # is removed.
            conn = sqlite3.connect(sqlitefile)
            try:
                c = conn.cursor()
                for code, error, errorcount in \
                        c.execute("SELECT code, error, errorcount FROM Code"):
                    samples.append(
                        Environment(
                            {
                                "code": code,
                                "error": error,
                                "n_error": errorcount,
                            }, set(["error", "n_error"])))
            finally:
                conn.close()
        return samples

    samples = _download()
    return ListDataset(samples)
예제 #16
0
def dataset():
    """Return a three-element ListDataset of scalar tensor samples.

    Sample ``i`` (0, 1, 2) has "value" and "ground_truth" both equal to
    ``torch.tensor(i)``, each created as a separate tensor.
    """
    samples = [
        Environment(
            {"value": torch.tensor(i), "ground_truth": torch.tensor(i)},
            {"ground_truth"},
        )
        for i in range(3)
    ]
    return ListDataset(samples)
예제 #17
0
 def test_map_style_dataset(self):
     """transform() over a ListDataset supports len(), indexing and slicing."""
     source = ListDataset([0])
     mapped = transform(source, lambda value: value + 1)

     assert 1 == len(mapped)
     first = mapped[0]
     assert 1 == first
     head = mapped[:1]
     assert [1] == head
예제 #18
0
    def pretrain(self, output_dir):
        """Run supervised training on a small fixed set of shapes.

        Builds eight hand-written ``Environment`` samples, attaches test
        cases to them, prepares encoder/model/optimizer through the
        ``self.prepare_*`` helpers and calls ``train_supervised``.

        Args:
            output_dir: Passed to ``train_supervised`` as its second
                positional argument (presumably where outputs are written;
                confirm against ``train_supervised``).

        Returns:
            Tuple ``(encoder, train_dataset)``, where ``train_dataset`` is
            the dataset after the test-case transform was applied.
        """
        # This generated dataset is only used to prepare the encoder below.
        dataset = Dataset(4, 1, 2, 1, 45, seed=0)
        """
        """
        # Fixed training samples; each holds one "ground_truth" program.
        train_dataset = ListDataset([
            Environment(
                {"ground_truth": Circle(1)},
                set(["ground_truth"]),
            ),
            Environment(
                {"ground_truth": Rectangle(1, 2)},
                set(["ground_truth"]),
            ),
            Environment(
                {"ground_truth": Rectangle(1, 1)},
                set(["ground_truth"]),
            ),
            Environment(
                {"ground_truth": Rotation(45, Rectangle(1, 1))},
                set(["ground_truth"]),
            ),
            Environment(
                {"ground_truth": Translation(1, 1, Rectangle(1, 1))},
                set(["ground_truth"]),
            ),
            Environment(
                {"ground_truth": Difference(Circle(1), Circle(1))},
                set(["ground_truth"]),
            ),
            Environment(
                {"ground_truth": Union(Rectangle(1, 2), Circle(1))},
                set(["ground_truth"]),
            ),
            Environment(
                {"ground_truth": Difference(Rectangle(1, 1), Circle(1))},
                set(["ground_truth"]),
            ),
        ])

        # tmpdir is handed to train_supervised as its first argument and is
        # removed when this block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            interpreter = self.interpreter()
            # Derive "test_cases" from each sample's "ground_truth".
            train_dataset = data_transform(
                train_dataset,
                Apply(
                    module=AddTestCases(interpreter),
                    in_keys=["ground_truth"],
                    out_key="test_cases",
                    is_out_supervision=False,
                ))
            encoder = self.prepare_encoder(dataset, Parser())

            # Per-key collate options.
            # NOTE(review): the meaning of the three CollateOptions
            # arguments is not visible here; confirm against its definition.
            collate = Collate(
                test_case_tensor=CollateOptions(False, 0, 0),
                variables_tensor=CollateOptions(True, 0, 0),
                previous_actions=CollateOptions(True, 0, -1),
                hidden_state=CollateOptions(False, 0, 0),
                state=CollateOptions(False, 0, 0),
                ground_truth_actions=CollateOptions(True, 0, -1)
            )
            # Batch pipeline, applied in this order:
            # to_episode -> flatten -> transform -> collate.
            collate_fn = Sequence(OrderedDict([
                ("to_episode", Map(self.to_episode(encoder,
                                                   interpreter))),
                ("flatten", Flatten()),
                ("transform", Map(self.transform(
                    encoder, interpreter, Parser()))),
                ("collate", collate.collate)
            ]))

            model = self.prepare_model(encoder)
            optimizer = self.prepare_optimizer(model)
            # Loss pipeline: summed action-sequence loss, divided by the
            # constant 1, then the "loss" entry is picked as the result.
            train_supervised(
                tmpdir, output_dir,
                train_dataset, model, optimizer,
                torch.nn.Sequential(OrderedDict([
                    ("loss",
                     Apply(
                         module=Loss(
                             reduction="sum",
                         ),
                         in_keys=[
                             "rule_probs",
                             "token_probs",
                             "reference_probs",
                             "ground_truth_actions",
                         ],
                         out_key="action_sequence_loss",
                     )),
                    ("normalize",  # divided by batch_size
                     Apply(
                         [("action_sequence_loss", "lhs")],
                         "loss",
                         mlprogram.nn.Function(Div()),
                         constants={"rhs": 1})),
                    ("pick",
                     mlprogram.nn.Function(
                         Pick("loss")))
                ])),
                None, "score",
                collate_fn,
                # NOTE(review): the leading 1 is presumably the batch size
                # (it matches the "rhs" constant above); confirm against
                # train_supervised.
                1, Epoch(100), evaluation_interval=Epoch(10),
                snapshot_interval=Epoch(100)
            )
        return encoder, train_dataset