def test_simple_determinism():
    """Check that batches are extracted deterministically across epochs."""
    number_of_classes = 10
    number_of_samples = 100000
    number_of_epochs = 100
    batch_size = 10000

    x = np.arange(0, number_of_samples, dtype=np.int64)
    y = np.random.randint(0, number_of_classes, size=number_of_samples)

    # Two sequences built from the very same data: only `ms` receives the
    # `on_epoch_end` callback, `ms2` is left in its initial state.
    ms = MixedSequence(
        VectorSequence(x, batch_size),
        VectorSequence(y, batch_size)
    )
    ms2 = MixedSequence(
        VectorSequence(x, batch_size),
        VectorSequence(y, batch_size)
    )

    for epoch in range(number_of_epochs):
        is_first_epoch = epoch == 0
        for step in range(ms.steps_per_epoch):
            xi, yi = ms[step]
            xj, yj = ms2[step]
            if is_first_epoch:
                # During the first epoch the two sequences must be aligned.
                assert np.all(xi == xj)
                assert np.all(yi == yj)
            else:
                # From the second epoch onwards `ms` has gone through
                # `on_epoch_end` while `ms2` has not, so identical batches
                # are expected to be (at the very least) extremely unlikely.
                assert np.any(xi != xj)
                assert np.any(yi != yj)
            # Since x holds the sample indices, the labels of the batch must
            # always match the labels stored at those indices.
            assert np.all(y[xi] == yi)
        ms.on_epoch_end()
def test_keras_mixed_sequence():
    """Check that a model can be fitted on a MixedSequence and on its rasterization."""
    model = build_model()
    batch_size = 32

    # Single input and two named outputs, all of them with 100 samples.
    inputs = VectorSequence(
        np.random.randint(2, size=(100, 10)),
        batch_size
    )
    outputs = {
        "output1": VectorSequence(
            np.random.randint(2, size=(100, 10)),
            batch_size
        ),
        "output2": VectorSequence(
            np.random.randint(2, size=(100, 20)),
            batch_size
        ),
    }
    sequence = MixedSequence(inputs, outputs)

    # Options shared by both the training runs.
    fit_options = dict(epochs=2, verbose=0, shuffle=True)

    # First we train directly on the sequence...
    model.fit(
        sequence,
        steps_per_epoch=sequence.steps_per_epoch,
        **fit_options
    )

    # ...and afterwards on the rasterized version of the same sequence.
    X, y = sequence.rasterize()
    model.fit(X, y, **fit_options)
def test_change_batch_size():
    """Check that the batch size of a MixedSequence can be changed after creation."""
    initial_batch_size = 512
    sequence = MixedSequence(
        VectorSequence(np.empty(4096), batch_size=initial_batch_size),
        VectorSequence(np.empty(4096), batch_size=initial_batch_size),
    )

    first_batch = sequence[0]
    print(first_batch)
    assert sequence.batch_size == initial_batch_size

    updated_batch_size = 32
    sequence.batch_size = updated_batch_size
    assert sequence.batch_size == 32

    # The last batch must still be retrievable with the new batch size.
    last_step = sequence.steps_per_epoch - 1
    _ = sequence[last_step]
def test_genomic_sequence_determinism():
    """Check that genomic batches stay aligned with their labels across epochs."""
    batch_size = 32
    epochs = 5
    enhancers = pd.read_csv("tests/enhancers.csv")
    promoters = pd.read_csv("tests/promoters.csv")
    genome = Genome("hg19", chromosomes=["chr1"])

    for region in tqdm((enhancers, promoters), desc="Region types"):
        region_size = len(region)
        # The labels are the row indices of the regions, so that each
        # label identifies the region it belongs to.
        y = np.arange(0, region_size, dtype=np.int64)

        mixed_sequence = MixedSequence(
            x=BedSequence(genome, region, batch_size),
            y=VectorSequence(y, batch_size)
        )

        # Reference sequence: a single, unshuffled batch with all the regions.
        reference_mixed_sequence = MixedSequence(
            x=BedSequence(
                genome,
                region,
                batch_size=region_size,
                shuffle=False
            ),
            y=VectorSequence(y, batch_size=region_size, shuffle=False)
        )
        X, _ = reference_mixed_sequence[0]

        for _ in trange(epochs, desc="Epochs", leave=False):
            for step in range(mixed_sequence.steps_per_epoch):
                xi, yi = mixed_sequence[step]
                # Each sample of the batch must be equal to the reference
                # sample stored at the index given by its label.
                expected = X[yi.astype(int)]
                assert np.all(expected == xi)
            mixed_sequence.on_epoch_end()
def create_sequence(bed: pd.DataFrame, assembly: Genome, batch_size: int) -> MixedSequence:
    """Return training sequence.

    Parameters
    ----------------------------
    bed: pd.DataFrame,
        Dataframe with bed file structure.
        Its `labels` column is used as the output of the sequence.
    assembly: Genome,
        Genomic assembly to use.
    batch_size: int,
        Batch size to use, both for the input and the output.

    Returns
    ----------------------------
    Training sequence for model.
    """
    # Input of the sequence, built from the bed regions and the assembly.
    genomic_inputs = BedSequence(
        assembly=assembly,
        bed=bed,
        batch_size=batch_size
    )
    # Output of the sequence: the labels column converted to float.
    labels = bed.labels.values.astype(float)
    targets = VectorSequence(labels, batch_size=batch_size)
    return MixedSequence(x=genomic_inputs, y=targets)
def test_illegal_parameters_keras_mixed_sequence():
    """Check that illegal parameters are rejected with a ValueError."""
    # Sub-sequences created with different batch sizes (20 and 50).
    with pytest.raises(ValueError):
        first = VectorSequence(np.random.randint(2, size=(100, 10)), 20)
        second = VectorSequence(np.random.randint(2, size=(100, 10)), 50)
        MixedSequence(first, second)

    # Only one of the sub-sequences is given a number of elapsed epochs.
    with pytest.raises(ValueError):
        first = VectorSequence(
            np.random.randint(2, size=(100, 10)),
            50,
            elapsed_epochs=50
        )
        second = VectorSequence(np.random.randint(2, size=(100, 10)), 50)
        MixedSequence(first, second)

    # Sub-sequences created with a different number of samples (60 and 100).
    with pytest.raises(ValueError):
        first = VectorSequence(np.random.randint(2, size=(60, 10)), 50)
        second = VectorSequence(np.random.randint(2, size=(100, 10)), 50)
        MixedSequence(first, second)

    # Batch index 10000 requested on a sequence with 60 samples.
    with pytest.raises(ValueError):
        short_sequence = VectorSequence(
            np.random.randint(2, size=(60, 10)),
            50
        )
        short_sequence[10000]
def __init__(
    self,
    graph: Graph,
    support: Graph,
    kernel: tf.SparseTensor,
    return_node_types: bool = False,
    return_node_ids: bool = False,
    node_features: Optional[List[np.ndarray]] = None,
    node_type_features: Optional[List[np.ndarray]] = None,
    edge_features: Optional[List[np.ndarray]] = None,
    use_edge_metrics: bool = False,
):
    """Create new Open-world assumption GCN training sequence for edge prediction.

    The parent constructor is always called with `return_edge_types=True`,
    hence this constructor does not expose that parameter.

    Parameters
    --------------------------------
    graph: Graph,
        The graph from which to sample the edges.
    support: Graph
        The graph to be used for the topological metrics.
    kernel: tf.SparseTensor
        The kernel to be used for the convolutions.
    return_node_types: bool = False
        Whether to return the node types.
    return_node_ids: bool = False
        Whether to return the node IDs.
        These are needed when an embedding layer is used.
    node_features: Optional[List[np.ndarray]] = None
        The node features to be used.
    node_type_features: Optional[List[np.ndarray]] = None
        The node type features to be used.
        For instance, these could be BERT embeddings
        of the description of the node types.
        When the graph has multilabel node types,
        we will average the features.
    edge_features: Optional[List[np.ndarray]] = None
        The edge features to be used.
    use_edge_metrics: bool = False
        Whether to return the edge metrics.
    """
    # Every parameter is forwarded unchanged to the parent class,
    # with the sole exception of the edge types which are always requested.
    parent_parameters = dict(
        graph=graph,
        support=support,
        kernel=kernel,
        return_node_types=return_node_types,
        return_edge_types=True,
        return_node_ids=return_node_ids,
        node_features=node_features,
        node_type_features=node_type_features,
        edge_features=edge_features,
        use_edge_metrics=use_edge_metrics,
    )
    super().__init__(**parent_parameters)

    # Mask of the known edge types, converted to float32 and exposed
    # as an unshuffled sequence whose batch size is the number of nodes.
    known_edge_types_mask = graph.get_known_edge_types_mask().astype(
        np.float32
    )
    self._known_edge_types_mask_sequence = VectorSequence(
        known_edge_types_mask,
        batch_size=graph.get_number_of_nodes(),
        shuffle=False
    )

    # The edge label is found at index 2 of the returned sequence,
    # right after the source and destination nodes.
    self._edge_label_index = 2
def __init__(
    self,
    graph: Graph,
    support: Graph,
    kernel: tf.SparseTensor,
    return_node_types: bool = False,
    return_edge_types: bool = False,
    return_node_ids: bool = False,
    node_features: Optional[List[np.ndarray]] = None,
    node_type_features: Optional[List[np.ndarray]] = None,
    edge_features: Optional[List[np.ndarray]] = None,
    use_edge_metrics: bool = False,
):
    """Create new Open-world assumption GCN training sequence for edge prediction.

    Parameters
    --------------------------------
    graph: Graph,
        The graph from which to sample the edges.
    support: Graph
        The graph to be used for the topological metrics.
    kernel: tf.SparseTensor
        The kernel to be used for the convolutions.
    return_node_types: bool = False
        Whether to return the node types.
    return_edge_types: bool = False
        Whether to return the edge types.
    return_node_ids: bool = False
        Whether to return the node IDs.
        These are needed when an embedding layer is used.
    node_features: Optional[List[np.ndarray]] = None
        The node features to be used.
        When None is provided, an empty list is used.
    node_type_features: Optional[List[np.ndarray]] = None
        The node type features to be used.
        For instance, these could be BERT embeddings
        of the description of the node types.
        When the graph has multilabel node types,
        we will average the features.
    edge_features: Optional[List[np.ndarray]] = None,
        The edge features to be used.
    use_edge_metrics: bool = False
        Whether to return the edge metrics.

    Raises
    --------------------------------
    ValueError,
        If the provided graph does not have any edge.
    """
    if not graph.has_edges():
        raise ValueError(
            f"An empty instance of graph {graph.get_name()} was provided!"
        )

    self._graph = graph
    self._kernel = kernel

    if node_features is None:
        node_features = []
    self._node_features = node_features

    self._node_ids = graph.get_node_ids() if return_node_ids else None

    # The node types are needed both when they must be returned and
    # when they are used to look up the node type features.
    node_types = None
    if return_node_types or node_type_features is not None:
        if graph.has_multilabel_node_types():
            node_types = graph.get_one_hot_encoded_node_types()
        else:
            node_types = graph.get_single_label_node_type_ids()
    # Always define the attribute, so that it can be safely read even
    # when no node type was requested.
    self._node_types = node_types if return_node_types else None

    if node_type_features is None:
        self._node_type_features = []
    elif graph.has_multilabel_node_types():
        # NOTE(review): the code treats the value 0 as "no node type" and
        # shifts the remaining values by one to index the features --
        # confirm this matches the encoding returned by the graph.
        minus_node_types = node_types - 1
        node_types_mask = node_types == 0
        self._node_type_features = []
        # We iterate on the features provided by the user, NOT on the
        # list we are currently populating.
        for node_type_feature in node_type_features:
            feature_size = node_type_feature.shape[1]
            # The mask is replicated along the feature axis so that it
            # has the same shape of the looked-up features.
            expanded_mask = np.repeat(
                node_types_mask,
                feature_size
            ).reshape((
                *node_types.shape,
                feature_size
            ))
            # The masked mean is computed over the node types axis
            # (second to last), so that entries without a node type are
            # excluded and the feature axis is preserved.
            self._node_type_features.append(np.ma.array(
                node_type_feature[minus_node_types],
                mask=expanded_mask
            ).mean(axis=-2).data)
    elif graph.has_unknown_node_types():
        minus_node_types = node_types - 1
        node_types_mask = node_types == 0
        self._node_type_features = []
        for node_type_feature in node_type_features:
            # Integer array indexing returns a copy, hence the features
            # provided by the user are not modified by the masking.
            ntf = node_type_feature[minus_node_types]
            # Masking the unknown values to zero.
            ntf[node_types_mask] = 0.0
            self._node_type_features.append(ntf)
    else:
        self._node_type_features = [
            node_type_feature[node_types]
            for node_type_feature in node_type_features
        ]

    self._sequence = EdgePredictionSequence(
        graph=graph,
        graph_used_in_training=support,
        return_node_types=False,
        return_edge_types=return_edge_types,
        use_edge_metrics=use_edge_metrics,
        batch_size=support.get_number_of_nodes()
    )

    self._edge_features_sequence = None if edge_features is None else VectorSequence(
        edge_features,
        batch_size=graph.get_number_of_nodes(),
        shuffle=False
    )

    self._use_edge_metrics = use_edge_metrics
    self._current_index = 0
    super().__init__(
        sample_number=graph.get_number_of_directed_edges(),
        batch_size=graph.get_number_of_nodes(),
    )