def test_chazutsu(self):
    path = os.path.join(os.path.dirname(__file__), "../")
    storage = Storage(path)
    r = chazutsu.datasets.DUC2004().download(storage.data_path("raw"))
    df = storage.chazutsu(r.root).data()
    print(df.head(5))
    shutil.rmtree(r.root)
def test_word_vector_resource(self):
    path = os.path.join(os.path.dirname(__file__), "./")
    storage = Storage(path)
    vocab = Vocabulary()
    vocab.set(["you", "loaded", "word", "vector", "now"])

    vector_size = 50
    word2vec = [
        "you " + " ".join(["0"] * vector_size),
        "word " + " ".join(["1"] * vector_size),
        "now " + " ".join(["2"] * vector_size),
    ]
    word2vec_file = Path(storage.data_path("external/word2vec_dummyr.txt"))
    with word2vec_file.open(mode="w", encoding="utf-8") as f:
        f.write("\n".join(word2vec))

    wv = WordVector(word2vec_file)
    key_vector = wv.load()

    for k in key_vector:
        self.assertTrue(k in vocab.get())
        self.assertEqual(len(key_vector[k]), vector_size)

    embed = vocab.make_embedding(word2vec_file)
    self.assertEqual(embed.shape, (len(vocab.get()), vector_size))
def test_read_file(self):
    path = os.path.join(os.path.dirname(__file__), "../")
    storage = Storage(path)
    csv = DataFile(storage.data_path("raw/sample_dataset.csv"))

    content = csv.to_array()
    fetched = list(csv.fetch(progress=True))
    for c, f in zip(content, fetched):
        self.assertEqual(c, f)
def test_convert(self):
    path = os.path.join(os.path.dirname(__file__), "../")
    storage = Storage(path)
    csv = DataFile(storage.data_path("raw/sample_dataset.csv"))

    path_changed = csv.convert(data_dir_to="interim")
    correct = os.path.join(path, "./data/interim/sample_dataset.csv")
    self.assertEqual(resolve(path_changed.path), resolve(correct))

    attr_added = csv.convert(add_attribute="preprocessed")
    correct = storage.data_path("raw/sample_dataset__preprocessed.csv")
    self.assertEqual(resolve(attr_added.path), resolve(correct))

    attr_converted = attr_added.convert(
        attribute_to={"preprocessed": "converted"})
    correct = storage.data_path("raw/sample_dataset__converted.csv")
    self.assertEqual(resolve(attr_converted.path), resolve(correct))

    ext_changed = csv.convert(ext_to=".txt")
    correct = storage.data_path("raw/sample_dataset.txt")
    self.assertEqual(resolve(ext_changed.path), resolve(correct))
class BaseTrainer():

    def __init__(self, root="", lang=None, min_df=5, max_df=sys.maxsize,
                 unknown="<unk>", preprocessor_name="preprocessor",
                 log_dir=""):
        default_root = os.path.join(os.path.dirname(__file__), "../../")
        _root = root if root else default_root

        self.storage = Storage(_root)
        self.preprocessor_name = preprocessor_name
        self._base_log_dir = log_dir
        self._built = False
        self.preprocessor = Preprocessor(
            text_transformers=[
                ct.text.UnicodeNormalizer(),
                ct.text.LowerNormalizer()
            ],
            tokenizer=ct.Tokenizer(lang=lang),
            vocabulary=ct.Vocabulary(
                min_df=min_df, max_df=max_df, unknown=unknown))

    def load_preprocessor(self):
        if os.path.exists(self.preprocessor_path):
            self._built = True
            self.preprocessor = joblib.load(self.preprocessor_path)

    @property
    def preprocessor_path(self):
        if self._base_log_dir:
            path = self._log_dir + "/{}.pkl".format(self.preprocessor_name)
            return self.storage.data_path(path)
        else:
            path = "interim/{}.pkl".format(self.preprocessor_name)
            return self.storage.data_path(path)

    @property
    def _log_dir(self):
        folder = "/" + self._base_log_dir if self._base_log_dir else ""
        log_dir = "log{}".format(folder)
        if not os.path.exists(self.storage.data_path(log_dir)):
            os.mkdir(self.storage.data_path(log_dir))
        return log_dir

    @property
    def log_dir(self):
        return self.storage.data_path(self._log_dir)

    @property
    def model_path(self):
        return self.storage.data_path(self._log_dir + "/model.h5")

    @property
    def tensorboard_dir(self):
        return self.storage.data_path(self._log_dir)

    def download(self):
        raise Exception("You have to specify what kinds of data you use.")

    def build(self, data_kind="train", field="", save=True):
        if not self._built:
            self.load_preprocessor()
        if self._built:
            print("Load existing preprocessor {}.".format(
                os.path.basename(self.preprocessor_path)))
            return 0

        r = self.download()
        if data_kind == "test":
            data = r.test_data()
        elif data_kind == "valid":
            data = r.valid_data()
        else:
            data = r.train_data()

        print("Building Dictionary from {} data...".format(data_kind))
        if not field:
            self.preprocessor.fit(data)
        else:
            self.preprocessor.fit(data[field])

        if save:
            joblib.dump(self.preprocessor, self.preprocessor_path)
        self._built = True
        print("Done!")
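# Usage sketch (illustrative, not part of the library): a minimal subclass that
# wires BaseTrainer to a chazutsu dataset already used elsewhere in this repo.
# The class name and the "sentence1" field choice are assumptions made for this
# example only.
class MultiNLITrainer(BaseTrainer):

    def download(self):
        # Any chazutsu resource exposing train_data() works here.
        return chazutsu.datasets.MultiNLI.matched().download(
            self.storage.data_path("raw"))


# trainer = MultiNLITrainer(lang="en", min_df=3)
# trainer.build(data_kind="train", field="sentence1")
# print(trainer.preprocessor_path)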
def test_path(self):
    root = os.path.join(os.path.dirname(__file__), "../../")
    storage = Storage(root)
    correct_path = os.path.join(root, "data/raw")
    self.assertEqual(resolve(storage.data_path("raw")),
                     resolve(correct_path))
class MultiNLIDataset():

    def __init__(self, root, min_word_count=3, max_word_count=25,
                 prefix=""):
        self.storage = Storage(root)
        self.nlp = spacy.load("en", parser=False, entity=False)
        self.min_word_count = min_word_count
        self.max_word_count = max_word_count
        self.prefix = prefix

    def train_data(self):
        return pd.read_csv(self.processed_file("train"))

    def test_data(self):
        return pd.read_csv(self.processed_file("test"))

    @classmethod
    def labels(cls):
        return [
            "fiction", "government", "slate", "telephone", "travel",
            "nineeleven", "facetoface", "letters", "oup", "verbatim"
        ]

    def download(self):
        download_dir = self.storage.data_path("raw")
        matched = chazutsu.datasets.MultiNLI.matched().download(download_dir)
        mismatched = chazutsu.datasets.MultiNLI.mismatched().download(
            download_dir)

        for kind in ["train", "test"]:
            data = self._merge_data(matched, mismatched, kind)
            data.to_csv(self.interim_file(kind))
            preprocessed = self.preprocess(data)
            preprocessed = pd.concat(
                [preprocessed["text"], preprocessed["label"]], axis=1)
            preprocessed.to_csv(self.processed_file(kind), index=False)

        return self

    def interim_file(self, kind):
        if self.prefix:
            p = "interim/{}_multi_nli_{}.csv".format(self.prefix, kind)
        else:
            p = "interim/multi_nli_{}.csv".format(kind)
        return self.storage.data_path(p)

    def processed_file(self, kind):
        if self.prefix:
            p = "processed/{}_multi_nli_{}.csv".format(self.prefix, kind)
        else:
            p = "processed/multi_nli_{}.csv".format(kind)
        return self.storage.data_path(p)

    def preprocess(self, df):
        # Drop duplicates
        except_d = df.drop_duplicates(["text"])

        # Count words
        word_count = except_d["text"].apply(lambda x: len(self.nlp(x)))
        except_d = except_d.assign(word_count=pd.Series(word_count).values)
        limited = except_d[
            (self.min_word_count <= except_d["word_count"]) &
            (except_d["word_count"] <= self.max_word_count)]

        # Equalize data count
        min_count = limited["label"].value_counts().min()
        selected = limited.groupby("label").apply(
            lambda x: x.sample(n=min_count))
        selected = selected.drop(columns=["label", "index"]).reset_index()

        # Convert label to index
        selected["label"] = selected["label"].apply(
            lambda x: self.labels().index(x))

        return selected

    def _merge_data(self, matched, mismatched, kind="train"):
        dataset = []
        for d in [matched, mismatched]:
            if kind == "train":
                _d = d.dev_data()
            else:
                _d = d.test_data()
            _d = pd.concat([_d["genre"], _d["sentence1"]], axis=1)
            dataset.append(_d)

        merged = pd.concat(dataset).reset_index()
        merged.rename(columns={
            "sentence1": "text",
            "genre": "label"
        }, inplace=True)
        return merged
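# Usage sketch (illustrative): build and read the MultiNLI genre-classification
# data. download() fetches the matched/mismatched splits, merges and
# preprocesses them, after which train_data()/test_data() can be read.
# The root path below is an assumption for this example.
#
# root = os.path.join(os.path.dirname(__file__), "../../")
# dataset = MultiNLIDataset(root, min_word_count=3, max_word_count=25)
# dataset.download()
# train_df = dataset.train_data()  # columns: "text", "label" (label index)
# print(train_df["label"].value_counts())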
def run_experiment(original=True, attention=True):
    # Read data
    root = os.path.join(os.path.dirname(__file__), "../../")
    storage = Storage(root)
    gd = GraphDataset(root, kind="cora")
    data = gd.download(return_mask=original)
    A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = data

    # Parameters
    N = X.shape[0]                # Number of nodes in the graph
    F = X.shape[1]                # Original feature dimension
    n_classes = Y_train.shape[1]  # Number of classes
    F_ = 8                        # Output size of first GraphAttention layer
    n_attn_heads = 8              # Number of attention heads in first GAT layer
    dropout_rate = 0.6            # Dropout rate (between and inside GAT layers)
    l2_reg = 5e-4 / 2             # Factor for l2 regularization
    learning_rate = 5e-3          # Learning rate for Adam
    epochs = 120                  # Number of training epochs
    es_patience = 100             # Patience for early stopping
    l2 = K.regularizers.l2
    node_size = 32

    # Preprocessing operations
    X = preprocess_features(X)
    A = A + np.eye(A.shape[0])  # Add self-loops

    # Model definition (as per Section 3.3 of the paper)
    if original:
        from gcn.layers.graph_attention_layer_original import GraphAttentionLayer
        X_in = K.layers.Input(shape=(F,))
        A_in = K.layers.Input(shape=(N,))
    else:
        from gcn.layers.graph_attention_layer import GraphAttentionLayer
        X_in = K.layers.Input(shape=(N, F))
        A_in = K.layers.Input(shape=(N, N))
        I_in = K.layers.Input(shape=(node_size,), dtype="int32")

    dropout1 = K.layers.Dropout(dropout_rate)(X_in)
    graph_attention_1 = GraphAttentionLayer(
        feature_units=F_,
        attn_heads=n_attn_heads,
        attn_heads_reduction="concat",
        dropout_rate=dropout_rate,
        activation="elu",
        kernel_regularizer=l2(l2_reg),
        attention=attention,
        attn_kernel_regularizer=l2(l2_reg))([dropout1, A_in])
    dropout2 = K.layers.Dropout(dropout_rate)(graph_attention_1)
    graph_attention_2 = GraphAttentionLayer(
        n_classes,
        attn_heads=1,
        attn_heads_reduction="average",
        dropout_rate=dropout_rate,
        activation="softmax",
        kernel_regularizer=l2(l2_reg),
        attention=attention,
        attn_kernel_regularizer=l2(l2_reg))([dropout2, A_in])

    # Build model
    optimizer = K.optimizers.Adam(lr=learning_rate)
    if original:
        model = K.models.Model(inputs=[X_in, A_in], outputs=graph_attention_2)
        model.compile(optimizer=optimizer,
                      loss="categorical_crossentropy",
                      weighted_metrics=["acc"])
    else:
        output = K.layers.Lambda(lambda x: tf.reshape(tf.batch_gather(
            x, I_in), (-1, node_size, n_classes)))(graph_attention_2)
        model = K.models.Model(inputs=[X_in, A_in, I_in], outputs=output)
        model.compile(optimizer=optimizer,
                      loss="categorical_crossentropy",
                      metrics=["acc"])
    model.summary()

    # Callbacks
    experiment_dir = "log/gan_experiment"
    monitor = "val_acc"
    if original:
        experiment_dir += "_o"
        monitor = "val_weighted_acc"
    if not attention:
        experiment_dir += "_na"
    experiment_dir = storage.data_path(experiment_dir)
    model_path = os.path.join(experiment_dir, "best_model.h5")

    es_callback = K.callbacks.EarlyStopping(monitor=monitor,
                                            patience=es_patience)
    tb_callback = K.callbacks.TensorBoard(log_dir=experiment_dir)
    mc_callback = K.callbacks.ModelCheckpoint(model_path,
                                              monitor=monitor,
                                              save_best_only=True,
                                              save_weights_only=True)

    def batch_generator(indices, label):
        if len(indices) != len(label):
            raise Exception("Does not match length")
        batch_size = len(indices)
        batch_size = batch_size // node_size

        def generator():
            while True:
                for i in range(batch_size):
                    _X = np.array([X])
                    _A = np.array([A])
                    samples = np.random.randint(len(indices), size=node_size)
                    _i = np.array([indices[samples]])
                    _label = np.array([label[samples]])
                    yield [_X, _A, _i], _label

        return generator(), batch_size

    if original:
        validation_data = ([X, A], Y_val, idx_val)
        model.fit([X, A], Y_train,
                  sample_weight=idx_train,
                  epochs=epochs,
                  batch_size=N,
                  validation_data=validation_data,
                  shuffle=False,  # Shuffling data means shuffling the whole graph
                  callbacks=[es_callback, tb_callback, mc_callback])

        # Load best model
        model.load_weights(model_path)

        # Evaluate model
        eval_results = model.evaluate([X, A], Y_test,
                                      sample_weight=idx_test,
                                      batch_size=N,
                                      verbose=0)
    else:
        val_generator, val_steps = batch_generator(idx_val, Y_val)
        train_generator, train_steps = batch_generator(idx_train, Y_train)
        model.fit_generator(train_generator, train_steps,
                            validation_data=val_generator,
                            validation_steps=val_steps,
                            epochs=epochs,
                            callbacks=[es_callback, tb_callback, mc_callback])

        # Load best model
        model.load_weights(model_path)

        # Evaluate model
        test_generator, test_steps = batch_generator(idx_test, Y_test)
        eval_results = model.evaluate_generator(test_generator, test_steps,
                                                verbose=0)

    print("Done.\n"
          "Test loss: {}\n"
          "Test accuracy: {}".format(*eval_results))
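# Invocation sketch (illustrative): the flag combinations mirror the branches
# above; "original" selects the masked whole-graph training path, while
# "attention" toggles the attention mechanism inside GraphAttentionLayer.
if __name__ == "__main__":
    run_experiment(original=True, attention=True)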
class GraphDataset():

    def __init__(self, root, kind="cora"):
        self.storage = Storage(root)
        self.kind = kind
        self.download_url = "https://s3-ap-northeast-1.amazonaws.com/dev.tech-sketch.jp/chakki/public/graph/"  # noqa
        if kind == "cora":
            self.download_url += "cora.zip"
        elif kind == "citeseer":
            self.download_url += "citeseer.zip"
        elif kind == "pubmed":
            self.download_url += "pubmed.zip"
        else:
            raise Exception("Graph dataset {} is not supported.".format(kind))

    @property
    def data_root(self):
        return self.storage.data_path("raw/{}".format(self.kind))

    @property
    def download_file_path(self):
        return self.storage.data_path("raw/{}.zip".format(self.kind))

    def download(self, return_mask=True):
        # Check downloaded file
        if os.path.isdir(self.data_root):
            print("{} dataset is already downloaded.".format(self.kind))
            return self.load(return_mask)

        # Download dataset
        resp = requests.get(self.download_url, stream=True)
        with open(self.download_file_path, "wb") as f:
            chunk_size = 1024
            for data in resp.iter_content(chunk_size=chunk_size):
                f.write(data)

        # Expand file
        with zipfile.ZipFile(self.download_file_path) as z:
            z.extractall(path=self.data_root)
        os.remove(self.download_file_path)

        return self.load(return_mask)

    def load(self, return_mask):
        """
        Loads input data (reference from:
        https://github.com/tkipf/gcn/blob/master/gcn/utils.py)

        ind.dataset_str.x => the feature vectors of the training instances
            as scipy.sparse.csr.csr_matrix object;
        ind.dataset_str.tx => the feature vectors of the test instances
            as scipy.sparse.csr.csr_matrix object;
        ind.dataset_str.allx => the feature vectors of both labeled and
            unlabeled training instances (a superset of ind.dataset_str.x)
            as scipy.sparse.csr.csr_matrix object;
        ind.dataset_str.y => the one-hot labels of the labeled training
            instances as numpy.ndarray object;
        ind.dataset_str.ty => the one-hot labels of the test instances
            as numpy.ndarray object;
        ind.dataset_str.ally => the labels for instances in
            ind.dataset_str.allx as numpy.ndarray object;
        ind.dataset_str.graph => a dict in the format
            {index: [index_of_neighbor_nodes]} as collections.defaultdict
            object;
        ind.dataset_str.test.index => the indices of test instances in graph,
            for the inductive setting as list object.

        All objects above must be saved using python pickle module.

        :param return_mask: return mask arrays instead of index arrays
        :return: All data input files loaded (as well as the training/test data).
        """
        names = ["x", "y", "tx", "ty", "allx", "ally", "graph", "test.index"]
        objects = []
        for n in names:
            file_path = os.path.join(self.data_root,
                                     "ind.{}.{}".format(self.kind, n))
            if n != "test.index":
                with open(file_path, "rb") as f:
                    objects.append(pkl.load(f, encoding="latin1"))
            else:
                with open(file_path, encoding="latin1") as f:
                    lines = f.readlines()
                    indices = [int(ln.strip()) for ln in lines]
                    objects.append(indices)

        x, y, tx, ty, allx, ally, graph, test_idx = tuple(objects)
        test_idx_range = np.sort(test_idx)

        if self.kind == "citeseer":
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(min(test_idx), max(test_idx) + 1)
            tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
            tx_extended[test_idx_range - min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
            ty_extended[test_idx_range - min(test_idx_range), :] = ty
            ty = ty_extended

        features = sp.vstack((allx, tx)).tolil()
        features[test_idx, :] = features[test_idx_range, :]
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((ally, ty))
        labels[test_idx, :] = labels[test_idx_range, :]

        idx_test = test_idx_range
        idx_train = np.array(range(len(y)))
        idx_val = np.array(range(len(y), len(y) + 500))

        if return_mask:
            train_mask = self.sample_mask(idx_train, labels.shape[0])
            val_mask = self.sample_mask(idx_val, labels.shape[0])
            test_mask = self.sample_mask(idx_test, labels.shape[0])

            y_train = np.zeros(labels.shape)
            y_val = np.zeros(labels.shape)
            y_test = np.zeros(labels.shape)
            y_train[train_mask, :] = labels[train_mask, :]
            y_val[val_mask, :] = labels[val_mask, :]
            y_test[test_mask, :] = labels[test_mask, :]

            return (adj, features, y_train, y_val, y_test,
                    train_mask, val_mask, test_mask)
        else:
            y_train = labels[idx_train, :]
            y_val = labels[idx_val, :]
            y_test = labels[idx_test, :]

            return (adj, features, y_train, y_val, y_test,
                    idx_train, idx_val, idx_test)

    def sample_mask(self, idx, length):
        """Create mask."""
        mask = np.zeros(length)
        mask[idx] = 1
        return np.array(mask, dtype=bool)