Example #1
# Imports assumed by this and the following test snippets: numpy and the
# keras_mixed_sequence package, which provides MixedSequence and VectorSequence.
import numpy as np
from keras_mixed_sequence import MixedSequence, VectorSequence


def test_simple_determinism():
    """Test to check that the extraction of the batches is deterministic."""
    classes = 10
    number = 100000
    epochs = 100
    batch_size = 10000

    x = np.arange(0, number, dtype=np.int64)
    y = np.random.randint(0, classes, size=number)

    ms = MixedSequence(VectorSequence(x, batch_size),
                       VectorSequence(y, batch_size))

    ms2 = MixedSequence(VectorSequence(x, batch_size),
                        VectorSequence(y, batch_size))

    for epoch in range(epochs):
        for step in range(ms.steps_per_epoch):
            xi, yi = ms[step]
            xj, yj = ms2[step]
            if epoch == 0:
                # In the first epoch the two sequences must still be aligned.
                assert (xi == xj).all()
                assert (yi == yj).all()
            else:
                # Afterwards, since on_epoch_end is only called on ms, ms2 is
                # never reshuffled, so the batches should no longer match
                # (or at least a match is very unlikely).
                assert (xi != xj).any()
                assert (yi != yj).any()
            assert (y[xi] == yi).all()

        ms.on_epoch_end()
Example #2
def test_keras_mixed_sequence():
    model = build_model()
    batch_size = 32
    sequence = MixedSequence(
        VectorSequence(np.random.randint(2, size=(100, 10)), batch_size),
        {
            "output1": VectorSequence(np.random.randint(2, size=(100, 10)), batch_size),
            "output2": VectorSequence(np.random.randint(2, size=(100, 20)), batch_size)
        }
    )
    model.fit(
        sequence,
        steps_per_epoch=sequence.steps_per_epoch,
        epochs=2,
        verbose=0,
        shuffle=True
    )

    X, y = sequence.rasterize()

    model.fit(
        X, y,
        epochs=2,
        verbose=0,
        shuffle=True
    )
Example #3
def test_change_batch_size():
    batch_size = 512
    sequence = MixedSequence(
        VectorSequence(np.empty(4096), batch_size=batch_size),
        VectorSequence(np.empty(4096), batch_size=batch_size),
    )
    print(sequence[0])
    assert sequence.batch_size == batch_size
    new_batch_size = 32
    sequence.batch_size = new_batch_size
    assert sequence.batch_size == 32
    sequence[sequence.steps_per_epoch-1]
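Changing batch_size on an existing MixedSequence is expected to update steps_per_epoch as well, which is why the test above can index the last step after shrinking the batch size. A minimal sketch of that arithmetic, assuming steps_per_epoch is simply the number of batches needed to cover all samples:

import numpy as np
from keras_mixed_sequence import MixedSequence, VectorSequence

sequence = MixedSequence(
    VectorSequence(np.empty(4096), batch_size=512),
    VectorSequence(np.empty(4096), batch_size=512),
)
# 4096 samples in batches of 512 -> 8 steps per epoch.
assert sequence.steps_per_epoch == 4096 // 512

# Shrinking the batch size increases the number of steps per epoch.
sequence.batch_size = 32
# 4096 samples in batches of 32 -> 128 steps per epoch.
assert sequence.steps_per_epoch == 4096 // 32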
Example #4
# Imports assumed by this snippet: pandas, numpy and tqdm, plus Genome from the
# ucsc_genomes_downloader package and BedSequence from keras_bed_sequence.
import numpy as np
import pandas as pd
from tqdm.auto import tqdm, trange
from ucsc_genomes_downloader import Genome
from keras_bed_sequence import BedSequence
from keras_mixed_sequence import MixedSequence, VectorSequence


def test_genomic_sequence_determinism():
    batch_size = 32
    epochs = 5
    enhancers = pd.read_csv("tests/enhancers.csv")
    promoters = pd.read_csv("tests/promoters.csv")

    genome = Genome("hg19", chromosomes=["chr1"])
    for region in tqdm((enhancers, promoters), desc="Region types"):
        y = np.arange(0, len(region), dtype=np.int64)
        mixed_sequence = MixedSequence(x=BedSequence(genome, region,
                                                     batch_size),
                                       y=VectorSequence(y, batch_size))
        reference_mixed_sequence = MixedSequence(
            x=BedSequence(genome,
                          region,
                          batch_size=len(region),
                          shuffle=False),
            y=VectorSequence(y, batch_size=len(region), shuffle=False))
        X, _ = reference_mixed_sequence[0]
        for _ in trange(epochs, desc="Epochs", leave=False):
            for step in range(mixed_sequence.steps_per_epoch):
                xi, yi = mixed_sequence[step]
                assert (X[yi.astype(int)] == xi).all()
            mixed_sequence.on_epoch_end()
Example #5
def create_sequence(bed: pd.DataFrame, assembly: Genome,
                    batch_size: int) -> MixedSequence:
    """Return training sequence.

    Parameters
    ----------------------------
    bed: pd.DataFrame,
        Dataframe with bed file structure.
    assembly: Genome,
        Genomic assembly to use.
    batch_size: int,
        Batch size to use.

    Returns
    ----------------------------
    Training sequence for model.
    """
    return MixedSequence(x=BedSequence(assembly=assembly,
                                       bed=bed,
                                       batch_size=batch_size),
                         y=VectorSequence(bed.labels.values.astype(float),
                                          batch_size=batch_size))
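A minimal usage sketch for create_sequence, following the imports from Example #4. The BED-like dataframe below (column names chrom, chromStart, chromEnd and labels) and the hg38 assembly are illustrative assumptions, not part of the original example; both windows are 200 bp long since BedSequence presumably needs equally sized regions.

import pandas as pd
from ucsc_genomes_downloader import Genome

# Hypothetical BED-like dataframe with a binary label column.
bed = pd.DataFrame({
    "chrom": ["chr1", "chr1"],
    "chromStart": [1_000_000, 2_000_000],
    "chromEnd": [1_000_200, 2_000_200],
    "labels": [0, 1],
})

# Download (or reuse a cached copy of) chromosome 1 of the assumed assembly.
assembly = Genome("hg38", chromosomes=["chr1"])

training_sequence = create_sequence(bed, assembly, batch_size=2)
# First batch: encoded nucleotide windows and their labels.
X, y = training_sequence[0]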
Example #6
def test_illegal_parameters_keras_mixed_sequence():
    with pytest.raises(ValueError):
        MixedSequence(
            VectorSequence(np.random.randint(2, size=(100, 10)), 20),
            VectorSequence(np.random.randint(2, size=(100, 10)), 50)
        )

    with pytest.raises(ValueError):
        MixedSequence(
            VectorSequence(np.random.randint(
                2, size=(100, 10)), 50, elapsed_epochs=50),
            VectorSequence(np.random.randint(2, size=(100, 10)), 50)
        )

    with pytest.raises(ValueError):
        MixedSequence(
            VectorSequence(np.random.randint(2, size=(60, 10)), 50),
            VectorSequence(np.random.randint(2, size=(100, 10)), 50)
        )

    with pytest.raises(ValueError):
        VectorSequence(np.random.randint(2, size=(60, 10)), 50)[10000]
Example #7
    def __init__(
        self,
        graph: Graph,
        support: Graph,
        kernel: tf.SparseTensor,
        return_node_types: bool = False,
        return_node_ids: bool = False,
        node_features: Optional[List[np.ndarray]] = None,
        node_type_features: Optional[List[np.ndarray]] = None,
        edge_features: Optional[List[np.ndarray]] = None,
        use_edge_metrics: bool = False,
    ):
        """Create new Open-world assumption GCN training sequence for edge prediction.

        Parameters
        --------------------------------
        graph: Graph,
            The graph from which to sample the edges.
        support: Graph
            The graph to be used for the topological metrics.
        kernel: tf.SparseTensor
            The kernel to be used for the convolutions.
        return_node_types: bool = False
            Whether to return the node types.
        return_node_ids: bool = False
            Whether to return the node IDs.
            These are needed when an embedding layer is used.
        node_features: List[np.ndarray]
            The node features to be used.
        node_type_features: Optional[List[np.ndarray]]
            The node type features to be used.
            For instance, these could be BERT embeddings of the
            description of the node types.
            When the graph has multilabel node types,
            we will average the features.
        edge_features: Optional[List[np.ndarray]]
            The edge features to be used.
        use_edge_metrics: bool = False
            Whether to return the edge metrics.
        """
        super().__init__(
            graph=graph,
            support=support,
            kernel=kernel,
            return_node_types=return_node_types,
            return_edge_types=True,
            return_node_ids=return_node_ids,
            node_features=node_features,
            node_type_features=node_type_features,
            edge_features=edge_features,
            use_edge_metrics=use_edge_metrics,
        )

        self._known_edge_types_mask_sequence = VectorSequence(
            graph.get_known_edge_types_mask().astype(np.float32),
            batch_size=graph.get_number_of_nodes(),
            shuffle=False
        )

        # The edge label is at index 2 of the returned tuple, after the
        # source and destination node tensors (indices 0 and 1).
        self._edge_label_index = 2
    def __init__(
        self,
        graph: Graph,
        support: Graph,
        kernel: tf.SparseTensor,
        return_node_types: bool = False,
        return_edge_types: bool = False,
        return_node_ids: bool = False,
        node_features: Optional[List[np.ndarray]] = None,
        node_type_features: Optional[List[np.ndarray]] = None,
        edge_features: Optional[List[np.ndarray]] = None,
        use_edge_metrics: bool = False,
    ):
        """Create new Open-world assumption GCN training sequence for edge prediction.

        Parameters
        --------------------------------
        graph: Graph,
            The graph from which to sample the edges.
        support: Graph
            The graph to be used for the topological metrics.
        kernel: tf.SparseTensor
            The kernel to be used for the convolutions.
        return_node_types: bool = False
            Whether to return the node types.
        return_edge_types: bool = False
            Whether to return the edge types.
        return_node_ids: bool = False
            Whether to return the node IDs.
            These are needed when an embedding layer is used.
        node_features: List[np.ndarray]
            The node features to be used.
        node_type_features: Optional[List[np.ndarray]]
            The node type features to be used.
            For instance, these could be BERT embeddings of the
            description of the node types.
            When the graph has multilabel node types,
            we will average the features.
        edge_features: Optional[List[np.ndarray]] = None,
            The edge features to be used.
        use_edge_metrics: bool = False
            Whether to return the edge metrics.
        """
        if not graph.has_edges():
            raise ValueError(
                f"An empty instance of graph {graph.get_name()} was provided!"
            )
        self._graph = graph
        self._kernel = kernel
        if node_features is None:
            node_features = []
        self._node_features = node_features
        if return_node_ids:
            self._node_ids = graph.get_node_ids()
        else:
            self._node_ids = None
        
        if return_node_types or node_type_features is not None:
            if graph.has_multilabel_node_types():
                node_types = graph.get_one_hot_encoded_node_types()
            else:
                node_types = graph.get_single_label_node_type_ids()
        
        self._node_types = node_types if return_node_types else None

        if node_type_features is not None:
            if self._graph.has_multilabel_node_types():
                self._node_type_features = []
                # Node type ID 0 marks an unknown type; shift the IDs so they
                # index into the feature matrices and mask the unknown entries.
                minus_node_types = node_types - 1
                node_types_mask = node_types == 0
                for node_type_feature in node_type_features:
                    # Average the masked features over the node-type axis,
                    # excluding unknown node types from the mean.
                    self._node_type_features.append(np.ma.array(
                        node_type_feature[minus_node_types],
                        mask=np.repeat(
                            node_types_mask,
                            node_type_feature.shape[1]
                        ).reshape((
                            *node_types.shape,
                            node_type_feature.shape[1]
                        ))
                    ).mean(axis=-2).data)
            else:
                if self._graph.has_unknown_node_types():
                    self._node_type_features = []
                    minus_node_types = node_types - 1
                    node_types_mask = node_types == 0
                    for node_type_feature in node_type_features:
                        ntf = node_type_feature[minus_node_types]
                        # Mask the unknown values to zero.
                        ntf[node_types_mask] = 0.0
                        self._node_type_features.append(ntf)
                else:
                    self._node_type_features = [
                        node_type_feature[node_types]
                        for node_type_feature in node_type_features
                    ]
        else:
            self._node_type_features = []

        self._sequence = EdgePredictionSequence(
            graph=graph,
            graph_used_in_training=support,
            return_node_types=False,
            return_edge_types=return_edge_types,
            use_edge_metrics=use_edge_metrics,
            batch_size=support.get_number_of_nodes()
        )

        self._edge_features_sequence = None if edge_features is None else VectorSequence(
            edge_features,
            batch_size=graph.get_number_of_nodes(),
            shuffle=False
        )

        self._use_edge_metrics = use_edge_metrics
        self._current_index = 0
        super().__init__(
            sample_number=graph.get_number_of_directed_edges(),
            batch_size=graph.get_number_of_nodes(),
        )