Example #1
    def train(self):
        train_losses = []
        val_losses = []
        model_path = os.path.join(self.model_dir, self.model_file)

        print("Training model...\n")
        timer = Timer()
        timer.tic()

        x = self.data.x.to(self.device)
        train_pos_edge_index = self.data.train_pos_edge_index.to(self.device)

        for epoch in range(self.epochs):
            print("Epoch: {}".format(epoch + 1))
            self.model.train()
            self.optimizer.zero_grad()
            z = self.model.encode(x, train_pos_edge_index)
            loss = self.model.recon_loss(z, train_pos_edge_index)
            if self.model_name == "ARGVA":
                loss = loss + (1 / self.data.num_nodes) * self.model.kl_loss()
            loss += self.dis_loss_para * self.model.discriminator_loss(z) + \
                self.reg_loss_para * self.model.reg_loss(z)
            loss.backward()
            self.optimizer.step()

            # Evaluate on validation data
            self.model.eval()
            with torch.no_grad():
                train_losses.append(loss.cpu().detach().numpy())

                # Compute validation statistics
                val_pos_edge_index = self.data.val_pos_edge_index.to(
                    self.device)
                val_neg_edge_index = self.data.val_neg_edge_index.to(
                    self.device)
                z = self.model.encode(x, train_pos_edge_index)
                val_loss = self.model.recon_loss(z, val_pos_edge_index,
                                                 val_neg_edge_index)
                if self.model_name == "ARGVA":
                    val_loss += (1 /
                                 self.data.num_nodes) * self.model.kl_loss()
                val_loss += self.dis_loss_para * self.model.discriminator_loss(
                    z) + self.reg_loss_para * self.model.reg_loss(z)
                val_losses.append(val_loss.cpu().detach().numpy())
                if val_losses[-1] == min(val_losses):
                    print("\tSaving model...")
                    torch.save(self.model.state_dict(), model_path)
                    print("\tSaved.")
                print("\ttrain_loss=", "{:.5f}".format(loss), "val_loss=",
                      "{:.5f}".format(val_loss))

        print("Finished training.\n")
        training_time = timer.toc()
        self._plot_losses(train_losses, val_losses)
        self._print_stats(train_losses, val_losses, training_time)
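
The loop above follows a simple checkpoint-on-best pattern: record the training and validation loss every epoch and overwrite the saved model only when the validation loss reaches a new minimum. A minimal, self-contained sketch of that pattern with a generic PyTorch model (all names below are illustrative, not taken from the repository):

import torch


def fit(model, optimizer, loss_fn, train_batch, val_batch, epochs, model_path):
    """Keep only the checkpoint of the epoch with the lowest validation loss."""
    train_losses, val_losses = [], []
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        loss = loss_fn(model(train_batch[0]), train_batch[1])
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        model.eval()
        with torch.no_grad():
            val_loss = loss_fn(model(val_batch[0]), val_batch[1]).item()
        val_losses.append(val_loss)

        if val_loss == min(val_losses):
            torch.save(model.state_dict(), model_path)  # new best epoch
        print("epoch={} train_loss={:.5f} val_loss={:.5f}".format(
            epoch + 1, train_losses[-1], val_loss))
    return train_losses, val_losses
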
Example #2
    def __init__(self, embedding_type, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "han", self.embedding_type)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)
Example #3
    def __init__(self, embedding_type, graph_type, threshold=2, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "graphsage", self.embedding_type,
            self.graph_type)
        if not os.path.isdir(self.path_persistent):
            os.makedirs(self.path_persistent)
Example #4
    def __init__(self):
        self.parser = FileParser()
        self.persistent = {}
        self.timer = Timer()
        self.processes = {
            "chapters_books": {
                "process_data": "_process_data_chapters_books",
                "persistent_file": os.path.join(self.path,
                                                "chapters_books.pkl")
            },
            "chapters_all_scigraph_citations": {
                "process_data":
                "_process_data_chapters_all_scigraph_citations",
                "persistent_file":
                os.path.join(self.path, "chapters_all_scigraph_citations.pkl")
            },
            "chapters_confproc_scigraph_citations": {
                "process_data":
                "_process_data_chapters_confproc_scigraph_citations",
                "persistent_file":
                os.path.join(self.path,
                             "chapters_confproc_scigraph_citations.pkl")
            },
            "books_conferences": {
                "process_data": "_process_data_books_conferences",
                "persistent_file": os.path.join(self.path,
                                                "books_conferences.pkl")
            },
            "author_id_chapters": {
                "process_data":
                "_process_data_author_id_chapters",
                "persistent_file":
                os.path.join(self.path, "author_id_chapters.pkl")
            },
            "author_name_chapters": {
                "process_data":
                "_process_data_author_name_chapters",
                "persistent_file":
                os.path.join(self.path, "author_name_chapters.pkl")
            },
            "confproc_scigraph_citations_chapters": {
                "process_data":
                "_process_data_confproc_scigraph_citations_chapters",
                "persistent_file":
                os.path.join(self.path,
                             "confproc_scigraph_citations_chapters.pkl")
            }
        }
Example #5
    def __init__(self,
                 embedding_type,
                 dataset,
                 graph_type="directed",
                 threshold=2,
                 gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "gat", self.embedding_type, self.dataset)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)
Example #6
    def train(self, data):
        if not self._load_model_classifier():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()

            print("Loading the training embeddings...")
            if not self._load_train_embeddings():
                print("The pretrained embeddings are missing.")
                return
            print("Loaded.")
            training_ids = list(data.chapter)
            training_embeddings = self.pretrained_embeddings[[
                self.pretrained_embeddings_id_map[id] for id in training_ids
            ]]

            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(
                data.conferenceseries)
            self.classifier.fit(training_embeddings, self.labels)
            self._save_model_classifier()

            print("Training finished.")
            timer.toc()
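
After training, the fitted LabelEncoder and classifier would typically be used together to map predictions back to conference series. A hypothetical companion method, assuming the same attribute names as above and a scikit-learn classifier exposing predict_proba (this helper is not part of the original class):

import numpy as np


def predict_conferences(self, test_data, k=10):
    # Hypothetical helper: look up the pretrained embeddings of the test
    # chapters and return the k most probable conference series per chapter.
    test_ids = list(test_data.chapter)
    test_embeddings = self.pretrained_embeddings[
        [self.pretrained_embeddings_id_map[id] for id in test_ids]]
    probabilities = self.classifier.predict_proba(test_embeddings)
    top_k = np.argsort(-probabilities, axis=1)[:, :k]
    return [list(self.label_encoder.inverse_transform(row)) for row in top_k]
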
Example #7
class DatasetsParser:
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                        "..", "data", "interim", "parsed_data")

    def __init__(self):
        self.parser = FileParser()
        self.persistent = {}
        self.timer = Timer()
        self.processes = {
            "chapters_books": {
                "process_data": "_process_data_chapters_books",
                "persistent_file": os.path.join(self.path,
                                                "chapters_books.pkl")
            },
            "chapters_all_scigraph_citations": {
                "process_data":
                "_process_data_chapters_all_scigraph_citations",
                "persistent_file":
                os.path.join(self.path, "chapters_all_scigraph_citations.pkl")
            },
            "chapters_confproc_scigraph_citations": {
                "process_data":
                "_process_data_chapters_confproc_scigraph_citations",
                "persistent_file":
                os.path.join(self.path,
                             "chapters_confproc_scigraph_citations.pkl")
            },
            "books_conferences": {
                "process_data": "_process_data_books_conferences",
                "persistent_file": os.path.join(self.path,
                                                "books_conferences.pkl")
            },
            "author_id_chapters": {
                "process_data":
                "_process_data_author_id_chapters",
                "persistent_file":
                os.path.join(self.path, "author_id_chapters.pkl")
            },
            "author_name_chapters": {
                "process_data":
                "_process_data_author_name_chapters",
                "persistent_file":
                os.path.join(self.path, "author_name_chapters.pkl")
            },
            "confproc_scigraph_citations_chapters": {
                "process_data":
                "_process_data_confproc_scigraph_citations_chapters",
                "persistent_file":
                os.path.join(self.path,
                             "confproc_scigraph_citations_chapters.pkl")
            }
        }

    def get_data(self, process):
        # Check if the data is already present
        if (process in self.persistent):
            return self.persistent[process]

        print("Process '{}' not in memory yet.".format(process))
        # Load from persistent file if data already processed
        if os.path.isfile(self.processes[process]["persistent_file"]):
            with open(self.processes[process]["persistent_file"], "rb") as f:
                self.persistent[process] = pickle.load(f)
                return self.persistent[process]

        print("Process '{}' not persistent yet. Processing.".format(process))

        # Process the data
        self.persistent[process] = self._parse_file(
            self.processes[process]["process_data"])

        with open(self.processes[process]["persistent_file"], "wb") as f:
            pickle.dump(self.persistent[process], f)

        return self.persistent[process]

    def _parse_file(self, process_data):
        print("Start processing file.\n")
        self.timer.tic()
        process_data_function = self.__getattribute__(process_data)
        results = process_data_function()
        self.timer.toc()
        return results

    # processes implementation
    def _process_data_chapters_books(self):
        # Load datasets
        df_chapters_books_isbns = pd.DataFrame(
            list(self.parser.get_data("chapters_books_isbns").items()),
            columns=["chapter", "books_isbns"])
        df_isbn_book_ids = pd.DataFrame(list(
            self.parser.get_data("isbn_books").items()),
                                        columns=["isbn", "book"])

        # Process datasets
        df_chapters_books_isbns[["isbn1", "isbn2"]] = pd.DataFrame(
            df_chapters_books_isbns["books_isbns"].tolist(),
            index=df_chapters_books_isbns.index)
        df_chapters_books_isbns.drop(columns=["books_isbns"],
                                     axis=1,
                                     inplace=True)
        df_chapters_isbn1 = pd.merge(
            df_chapters_books_isbns[["chapter", "isbn1"]],
            df_isbn_book_ids,
            how="inner",
            left_on=["isbn1"],
            right_on=["isbn"])
        df_chapters_isbn1.drop(columns=["isbn1", "isbn"], inplace=True)
        df_chapters_isbn2 = pd.merge(
            df_chapters_books_isbns[["chapter", "isbn2"]],
            df_isbn_book_ids,
            how="inner",
            left_on=["isbn2"],
            right_on=["isbn"])
        df_chapters_isbn2.drop(columns=["isbn2", "isbn"], inplace=True)
        df_chapters_books = pd.concat([df_chapters_isbn1, df_chapters_isbn2],
                                      ignore_index=True)
        df_chapters_books.drop_duplicates(inplace=True)
        return df_chapters_books

    def _process_data_chapters_all_scigraph_citations(self):
        df_chapters_citations = pd.DataFrame(
            list(self.parser.get_data("chapters_all_citations").items()),
            columns=["chapter", "chapter_citations"])
        chapters_count = len(df_chapters_citations)
        with tqdm(desc="Processing citations",
                  total=chapters_count,
                  unit="chapter") as pbar:
            for idx in range(chapters_count):
                citations = df_chapters_citations.iloc[idx][
                    "chapter_citations"]
                citations = [
                    c for c in citations
                    if c is not None and c.startswith("sg")
                ]
                df_chapters_citations.iloc[
                        idx]["chapter_citations"] = citations if citations \
                    else np.nan
                pbar.update(1)
        return df_chapters_citations[
            df_chapters_citations["chapter_citations"].notnull()]

    def _process_data_chapters_confproc_scigraph_citations(self):
        df_scigraph_citations = self.get_data(
            "chapters_all_scigraph_citations")
        df_chapters = pd.DataFrame(self.parser.get_data("chapters"),
                                   columns=["chapter"])
        chapters = set(list(df_chapters["chapter"]))
        chapters_count = len(df_scigraph_citations)
        with tqdm(desc="Processing citations",
                  total=chapters_count,
                  unit="chapter") as pbar:
            for idx in range(chapters_count):
                scigraph_citations = df_scigraph_citations.iloc[idx][
                    "chapter_citations"]
                citations = [c for c in scigraph_citations if c in chapters]
                df_scigraph_citations.iloc[idx][
                    "chapter_citations"] = citations if citations else np.nan
                pbar.update(1)
        return df_scigraph_citations[
            df_scigraph_citations["chapter_citations"].notnull()]

    def _process_data_books_conferences(self):
        df_old_books_new_books = pd.DataFrame(list(
            self.parser.get_data("old_books_new_books").items()),
                                              columns=["old_book", "new_book"])
        df_old_books_conferences = pd.DataFrame(
            list(self.parser.get_data("old_books_conferences").items()),
            columns=["old_book", "conference"])
        df = pd.merge(df_old_books_new_books,
                      df_old_books_conferences,
                      how="left",
                      on="old_book")
        df.drop(columns=["old_book"], inplace=True)
        df.rename(columns={"new_book": "book"}, inplace=True)
        return df[df["conference"].notnull()]

    def _process_data_author_id_chapters(self):
        df_chapters_authors = pd.DataFrame(list(
            self.parser.get_data("chapters_authors").items()),
                                           columns=["chapter", "authors"])
        contributions = []
        for idx in range(len(df_chapters_authors)):
            authors = [
                author for author in df_chapters_authors.iloc[idx]["authors"]
            ]
            chapter = df_chapters_authors.iloc[idx]["chapter"]
            contributions.extend([(author, chapter) for author in authors])
        author_id_chapters = pd.DataFrame.from_records(
            contributions, columns=["author", "chapter"])
        return author_id_chapters

    def _process_data_author_name_chapters(self):
        df_chapters_authors_name = pd.DataFrame(
            list(self.parser.get_data("chapters_authors_name").items()),
            columns=["chapter", "authors_name"])
        contributions = []
        for idx in range(len(df_chapters_authors_name)):
            authors_name = [
                author_name for author_name in
                df_chapters_authors_name.iloc[idx]["authors_name"]
            ]
            chapter = df_chapters_authors_name.iloc[idx]["chapter"]
            contributions.extend([(author_name, chapter)
                                  for author_name in authors_name])
        author_name_chapters = pd.DataFrame.from_records(
            contributions, columns=["author_name", "chapter"])
        return author_name_chapters

    def _process_data_confproc_scigraph_citations_chapters(self):
        df_chapters_confproc_scigraph_citations = self.get_data(
            "chapters_confproc_scigraph_citations")
        citations = []
        for idx in range(len(df_chapters_confproc_scigraph_citations)):
            citation_list = [
                citation
                for citation in df_chapters_confproc_scigraph_citations.
                iloc[idx]["chapter_citations"]
            ]
            chapter = df_chapters_confproc_scigraph_citations.iloc[idx][
                "chapter"]
            citations.extend([(citation, chapter)
                              for citation in citation_list])
        confproc_scigraph_citations_chapter = pd.DataFrame.from_records(
            citations, columns=["citation", "chapter"])
        return confproc_scigraph_citations_chapter
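
DatasetsParser computes each dataset lazily: get_data first checks the in-memory cache, then the pickled file, and only then runs the corresponding _process_data_* method and persists the result. Typical usage, assuming the raw data consumed by FileParser is in place:

parser = DatasetsParser()

# First call: processed from the raw data and written to chapters_books.pkl.
df_chapters_books = parser.get_data("chapters_books")

# Later calls: served from memory (or from the pickle in a new session).
df_chapters_books = parser.get_data("chapters_books")
print(df_chapters_books.head())
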
Example #8
    def inference(self, test_data, gpu_mem_fraction=None):
        print("Inference.")
        timer = Timer()
        timer.tic()

        G = test_data[0]
        features = test_data[1]
        id_map = test_data[2]
        class_map = test_data[4]
        if isinstance(list(class_map.values())[0], list):
            num_classes = len(list(class_map.values())[0])
        else:
            num_classes = len(set(class_map.values()))

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        placeholders = self._construct_placeholders(num_classes)
        minibatch = NodeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          class_map,
                                          num_classes,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(num_classes, placeholders, features,
                                   adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.summary.FileWriter(self._log_dir(), sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Restore model
        print("Restoring trained model.")
        checkpoint_file = os.path.join(self._log_dir(), "model.ckpt")
        ckpt = tf.compat.v1.train.get_checkpoint_state(self._log_dir())
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, checkpoint_file)
            print("Model restored.")
        else:
            print("This model checkpoint does not exist. The model might " +
                  "not be trained yet or the checkpoint is invalid.")

        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)
        sess.run(val_adj_info.op)

        print("Computing predictions...")
        t_test = time.time()
        finished = False
        val_losses = []
        val_preds = []
        nodes = []
        iter_num = 0
        while not finished:
            feed_dict_val, _, finished, nodes_subset = minibatch.incremental_node_val_feed_dict(
                self.batch_size, iter_num, test=True)
            node_outs_val = sess.run([model.preds, model.loss],
                                     feed_dict=feed_dict_val)
            val_preds.append(node_outs_val[0])
            val_losses.append(node_outs_val[1])
            nodes.extend(nodes_subset)
            iter_num += 1
        val_preds = np.vstack(val_preds)
        print("Computed.")

        # Return only the embeddings of the test nodes
        test_preds_ids = {}
        for i, node in enumerate(nodes):
            test_preds_ids[node] = i
        test_nodes = [n for n in G.nodes() if G.node[n]['test']]
        test_preds = val_preds[[test_preds_ids[id] for id in test_nodes]]
        timer.toc()
        sess.close()
        return test_nodes, test_preds
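
inference returns the ids of the test nodes together with their predicted class scores (one row per node). One way to inspect the output, shown only as an illustration with assumed variable names:

import pandas as pd

# `model` is an instance of the class above; `test_data` is the tuple of
# graph, features, id map and class map that it expects.
test_nodes, test_preds = model.inference(test_data)
predictions = pd.DataFrame(test_preds, index=test_nodes)
print(predictions.idxmax(axis=1).head())  # highest-scoring class per node
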
Example #9
    def train(self, train_data, test_data=None):
        print("Training model...")
        timer = Timer()
        timer.tic()

        G = train_data[0]
        features = train_data[1]
        id_map = train_data[2]
        class_map = train_data[4]
        if isinstance(list(class_map.values())[0], list):
            num_classes = len(list(class_map.values())[0])
        else:
            num_classes = len(set(class_map.values()))

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        placeholders = self._construct_placeholders(num_classes)
        minibatch = NodeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          class_map,
                                          num_classes,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(num_classes, placeholders, features,
                                   adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.summary.FileWriter(self._log_dir(), sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Train model
        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []

        train_losses = []
        validation_losses = []

        train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj)
        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        for epoch in range(self.epochs):
            minibatch.shuffle()

            iter = 0
            print('Epoch: %04d' % (epoch))
            epoch_val_costs.append(0)
            train_loss_epoch = []
            validation_loss_epoch = []
            while not minibatch.end():
                # Construct feed dictionary
                feed_dict, labels = minibatch.next_minibatch_feed_dict()
                feed_dict.update({placeholders['dropout']: self.dropout})

                t = time.time()
                # Training step
                outs = sess.run(
                    [merged, model.opt_op, model.loss, model.preds],
                    feed_dict=feed_dict)
                train_cost = outs[2]
                train_loss_epoch.append(train_cost)

                if iter % self.validate_iter == 0:
                    # Validation
                    sess.run(val_adj_info.op)
                    if self.validate_batch_size == -1:
                        val_cost, val_f1_mic, val_f1_mac, duration = self._incremental_evaluate(
                            sess, model, minibatch, self.batch_size)
                    else:
                        val_cost, val_f1_mic, val_f1_mac, duration = self._evaluate(
                            sess, model, minibatch, self.validate_batch_size)
                    sess.run(train_adj_info.op)
                    epoch_val_costs[-1] += val_cost
                    validation_loss_epoch.append(val_cost)

                # if total_steps % self.print_every == 0:
                #     summary_writer.add_summary(outs[0], total_steps)

                # Print results
                avg_time = (avg_time * total_steps + time.time() -
                            t) / (total_steps + 1)

                if total_steps % self.print_every == 0:
                    train_f1_mic, train_f1_mac = self._calc_f1(
                        labels, outs[-1])
                    print("Iter:", '%04d' % iter, "train_loss=",
                          "{:.5f}".format(train_cost), "train_f1_mic=",
                          "{:.5f}".format(train_f1_mic), "train_f1_mac=",
                          "{:.5f}".format(train_f1_mac), "val_loss=",
                          "{:.5f}".format(val_cost), "val_f1_mic=",
                          "{:.5f}".format(val_f1_mic), "val_f1_mac=",
                          "{:.5f}".format(val_f1_mac), "time=",
                          "{:.5f}".format(avg_time))

                iter += 1
                total_steps += 1

                if total_steps > self.max_total_steps:
                    break

            # Keep track of train and validation losses per epoch
            train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
            validation_losses.append(
                sum(validation_loss_epoch) / len(validation_loss_epoch))

            # If the epoch has the lowest validation loss so far
            if validation_losses[-1] == min(validation_losses):
                print(
                    "Minimum validation loss so far ({}) at epoch {}.".format(
                        validation_losses[-1], epoch))
                # Save model at each epoch
                print("Saving model at epoch {}.".format(epoch))
                saver.save(sess, os.path.join(self._log_dir(), "model.ckpt"))

            if total_steps > self.max_total_steps:
                break

        print("Optimization Finished!")

        training_time = timer.toc()
        self._plot_losses(train_losses, validation_losses)
        self._print_stats(train_losses, validation_losses, training_time)

        sess.run(val_adj_info.op)
        val_cost, val_f1_mic, val_f1_mac, duration = self._incremental_evaluate(
            sess, model, minibatch, self.batch_size)
        print("Full validation stats:", "loss=", "{:.5f}".format(val_cost),
              "f1_micro=", "{:.5f}".format(val_f1_mic), "f1_macro=",
              "{:.5f}".format(val_f1_mac), "time=", "{:.5f}".format(duration))
        with open(os.path.join(self._log_dir(), "val_stats.txt"), "w") as fp:
            fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}".
                     format(val_cost, val_f1_mic, val_f1_mac, duration))
Example #10
    def __init__(self):
        self.timer = Timer()
        self.persistent = {}
        self.processes = {
                # Old datasets
                "old_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "old_books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "old_books_new_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_new_books",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_new_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "old_books_conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_conferences",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_conferences.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "conferences.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferences_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_name",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferences_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_acronym": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_acronym",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_acronym.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_city": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_city",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_city.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_country": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_country",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_country.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_year": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_year",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_datestart": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_datestart",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_datestart.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_dateend": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_dateend",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_dateend.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_conferenceseries.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferenceseries.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferenceseries_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferenceseries_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },

                # New datasets
                "books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "isbn_books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_isbn_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "isbn_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "authors_name": {
                        "filename": os.path.join(self.path_raw, authors_file),
                        "process_line": "_process_line_authors_name",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "chapters_title": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_title",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_title.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_year": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_year",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_language": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_language",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_language.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_abstract": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_abstract",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_abstract.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_authors.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors_name": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_all_citations": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_all_citations",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_all_citations.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_keywords": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_keywords",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_keywords.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_books_isbns": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_books_isbns",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_books_isbns.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                }
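
Each entry in the registry above describes one derived dataset: the raw file to read, the _process_line_* handler to dispatch to, the pickle used as a cache, the accumulator to fill (list or dict), and the raw format (N-Triples or JSON). The driver that consumes this registry is not shown in this example; a hypothetical sketch of what it could look like (the handler signature and the os/pickle imports are assumptions):

def get_data(self, dataset):
    # Hypothetical driver: reuse the pickled result if present, otherwise
    # feed every raw line to the registered handler and cache the result.
    process = self.processes[dataset]
    if os.path.isfile(process["persistent_file"]):
        with open(process["persistent_file"], "rb") as f:
            return pickle.load(f)
    data = process["persistent_variable"]
    handler = getattr(self, process["process_line"])
    with open(process["filename"]) as f:
        for line in f:
            handler(line, data)  # assumed signature: (raw line, accumulator)
    with open(process["persistent_file"], "wb") as f:
        pickle.dump(data, f)
    return data
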
Example #11
    def train(self, train_data):
        print("Training model...")
        timer = Timer()
        timer.tic()

        G = train_data[0]
        features = train_data[1]
        id_map = train_data[2]

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        context_pairs = train_data[3] if self.random_context else None
        placeholders = self._construct_placeholders()
        minibatch = EdgeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree,
                                          num_neg_samples=self.neg_sample_size,
                                          context_pairs=context_pairs)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(placeholders, features, adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.compat.v1.summary.FileWriter(self._log_dir(),
        #                                                         sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Train model
        train_shadow_mrr = None
        shadow_mrr = None

        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []

        train_losses = []
        validation_losses = []

        train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj)
        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        for epoch in range(self.epochs):
            minibatch.shuffle()

            iter = 0
            print('Epoch: %04d' % (epoch))
            epoch_val_costs.append(0)
            train_loss_epoch = []
            validation_loss_epoch = []
            while not minibatch.end():
                # Construct feed dictionary
                feed_dict = minibatch.next_minibatch_feed_dict()
                feed_dict.update({placeholders['dropout']: self.dropout})

                t = time.time()
                # Training step
                outs = sess.run([
                    merged, model.opt_op, model.loss, model.ranks,
                    model.aff_all, model.mrr, model.outputs1
                ],
                                feed_dict=feed_dict)

                train_cost = outs[2]
                train_mrr = outs[5]
                train_loss_epoch.append(train_cost)
                if train_shadow_mrr is None:
                    train_shadow_mrr = train_mrr
                else:
                    train_shadow_mrr -= (1 - 0.99) * (train_shadow_mrr -
                                                      train_mrr)

                if iter % self.validate_iter == 0:
                    # Validation
                    sess.run(val_adj_info.op)
                    val_cost, ranks, val_mrr, duration = self._evaluate(
                        sess, model, minibatch, size=self.validate_batch_size)
                    sess.run(train_adj_info.op)
                    epoch_val_costs[-1] += val_cost
                    validation_loss_epoch.append(val_cost)
                if shadow_mrr is None:
                    shadow_mrr = val_mrr
                else:
                    shadow_mrr -= (1 - 0.99) * (shadow_mrr - val_mrr)

                # if total_steps % self.print_every == 0:
                #     summary_writer.add_summary(outs[0], total_steps)

                # Print results
                avg_time = (avg_time * total_steps + time.time() -
                            t) / (total_steps + 1)

                if total_steps % self.print_every == 0:
                    print(
                        "Iter: %04d" % iter,
                        "train_loss={:.5f}".format(train_cost),
                        "train_mrr={:.5f}".format(train_mrr),
                        # exponential moving average
                        "train_mrr_ema={:.5f}".format(train_shadow_mrr),
                        "val_loss={:.5f}".format(val_cost),
                        "val_mrr={:.5f}".format(val_mrr),
                        # exponential moving average
                        "val_mrr_ema={:.5f}".format(shadow_mrr),
                        "time={:.5f}".format(avg_time))

                iter += 1
                total_steps += 1

                if total_steps > self.max_total_steps:
                    break

            # Keep track of train and validation losses per epoch
            train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
            validation_losses.append(
                sum(validation_loss_epoch) / len(validation_loss_epoch))

            # Save embeddings if the epoch has the lowest validation loss
            # so far
            if self.save_embeddings and validation_losses[-1] == min(
                    validation_losses):
                print(
                    "Minimum validation loss so far ({}) at epoch {}.".format(
                        validation_losses[-1], epoch))
                sess.run(val_adj_info.op)
                self._save_embeddings(sess, model,
                                      minibatch, self.validate_batch_size,
                                      self._log_dir())

            # Save model at each epoch
            print("Saving model at epoch {}.".format(epoch))
            saver.save(sess,
                       os.path.join(self._log_dir(),
                                    "model_epoch_" + str(epoch) + ".ckpt"),
                       global_step=total_steps)

            if total_steps > self.max_total_steps:
                break

        print("Optimization finished!\n")

        training_time = timer.toc()
        self._plot_losses(train_losses, validation_losses)
        self._print_stats(train_losses, validation_losses, training_time)
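
The train_mrr_ema and val_mrr_ema values printed above are exponential moving averages with decay 0.99; the in-place update shadow -= (1 - 0.99) * (shadow - value) is just a rearrangement of the usual EMA formula, as this small check shows:

def ema_update(shadow, value, decay=0.99):
    # Same update as in the training loop above, written as a pure function.
    return shadow - (1 - decay) * (shadow - value)


# Equivalent to the textbook form decay * shadow + (1 - decay) * value.
assert abs(ema_update(0.5, 0.7) - (0.99 * 0.5 + 0.01 * 0.7)) < 1e-12
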
Example #12
    def train(self):
        # Make the datasets iterable
        batch_size = 10000

        train_data_loader = torch.utils.data.DataLoader(
            dataset=self.training_data, batch_size=batch_size)
        validation_data_loader = torch.utils.data.DataLoader(
            dataset=self.validation_data, batch_size=batch_size)
        train_labels_loader = torch.utils.data.DataLoader(
            dataset=self.training_labels, batch_size=batch_size)
        validation_labels_loader = torch.utils.data.DataLoader(
            dataset=self.validation_labels, batch_size=batch_size)

        # Train the model
        timer = Timer()
        timer.tic()

        mean_train_losses = []
        mean_validation_losses = []

        for epoch in range(self.epochs):
            print("Epoch: {}".format(epoch + 1))
            train_losses = []
            validation_losses = []
            self.model.train()

            for i, (train_data, train_labels) in enumerate(
                    zip(train_data_loader, train_labels_loader)):
                self.model.train()
                self.optimizer.zero_grad()
                outputs = self.model(train_data)
                loss = self.cross_entropy_loss(outputs.squeeze(), train_labels)
                loss.backward()
                self.optimizer.step()
                train_losses.append(loss.item())

                # Compute validation loss
                self.model.eval()
                with torch.no_grad():
                    for _, (val_data, val_labels) in enumerate(
                            zip(validation_data_loader,
                                validation_labels_loader)):
                        val_pred = self.model(val_data)
                        val_loss = self.cross_entropy_loss(
                            val_pred.squeeze(), val_labels)
                        validation_losses.append(val_loss.item())

            print("\tTrain loss: {}, validation loss: {}".format(
                np.mean(train_losses), np.mean(validation_losses)))
            mean_train_losses.append(np.mean(train_losses))
            mean_validation_losses.append(np.mean(validation_losses))
            if mean_validation_losses[-1] == min(mean_validation_losses):
                print("\tSaving model...")
                torch.save(self.model.state_dict(), self.model_path)
                print("\tSaved.")

        print("Finished training.")
        training_time = timer.toc()
        self._plot_losses(mean_train_losses, mean_validation_losses)
        self._print_stats(mean_train_losses, mean_validation_losses,
                          training_time)
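
The trainer above expects index-aligned tensors for data and labels, wrapped in separate DataLoaders that are zipped batch by batch. A sketch of the attributes it relies on, filled with random placeholder data purely for illustration (the real feature size, class count, and paths come from the repository's pipeline):

import torch
import torch.nn as nn


def build_trainer_state(trainer, num_features=256, num_classes=50):
    # Hypothetical setup helper; `trainer` is an instance of the class above.
    trainer.training_data = torch.randn(100000, num_features)
    trainer.training_labels = torch.randint(num_classes, (100000,))
    trainer.validation_data = torch.randn(20000, num_features)
    trainer.validation_labels = torch.randint(num_classes, (20000,))
    trainer.model = nn.Sequential(nn.Linear(num_features, 512), nn.ReLU(),
                                  nn.Linear(512, num_classes))
    trainer.cross_entropy_loss = nn.CrossEntropyLoss()
    trainer.optimizer = torch.optim.Adam(trainer.model.parameters(), lr=1e-3)
    trainer.model_path = "mlp_classifier.pt"
    trainer.epochs = 10
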
Example #13
    def main():
        parser = argparse.ArgumentParser(
            description='Arguments for GraphSAGE concatenated ' +
            'classifier model evaluation.')
        parser.add_argument(
            "classifier_name",
            choices=["KNN", "MLP", "MultinomialLogisticRegression"],
            help="The name of the classifier.")
        parser.add_argument('embedding_type',
                            choices=[
                                "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                                "MAX_2L", "CONC_AVG_MAX_2L",
                                "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                            ],
                            help="Type of embedding.")
        parser.add_argument('model_checkpoint_citations',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the citations graph.')
        parser.add_argument('model_checkpoint_authors',
                            help='Name of the GraphSAGE model checkpoint ' +
                            'for the authors graph.')
        parser.add_argument('train_prefix_citations',
                            help='Name of the object file that stores the ' +
                            'citations training data.')
        parser.add_argument('train_prefix_authors',
                            help='Name of the object file that stores the ' +
                            'authors training data.')
        parser.add_argument('model_name',
                            choices=[
                                "graphsage_mean", "gcn", "graphsage_seq",
                                "graphsage_maxpool", "graphsage_meanpool"
                            ],
                            help="Model names.")
        parser.add_argument('--model_size',
                            choices=["small", "big"],
                            default="small",
                            help="Can be big or small; model specific def'ns")
        parser.add_argument('--learning_rate',
                            type=float,
                            default=0.00001,
                            help='Initial learning rate.')
        parser.add_argument('--epochs',
                            type=int,
                            default=10,
                            help='Number of epochs to train.')
        parser.add_argument('--dropout',
                            type=float,
                            default=0.0,
                            help='Dropout rate (1 - keep probability).')
        parser.add_argument('--weight_decay',
                            type=float,
                            default=0.0,
                            help='Weight for l2 loss on embedding matrix.')
        parser.add_argument('--max_degree',
                            type=int,
                            default=100,
                            help='Maximum node degree.')
        parser.add_argument('--samples_1',
                            type=int,
                            default=25,
                            help='Number of samples in layer 1.')
        parser.add_argument('--samples_2',
                            type=int,
                            default=10,
                            help='Number of users samples in layer 2.')
        parser.add_argument('--dim_1',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--dim_2',
                            type=int,
                            default=128,
                            help='Size of output dim ' +
                            '(final is 2x this, if using concat)')
        parser.add_argument('--random_context',
                            action="store_false",
                            default=True,
                            help='Whether to use random context or direct ' +
                            'edges.')
        parser.add_argument('--neg_sample_size',
                            type=int,
                            default=20,
                            help='Number of negative samples.')
        parser.add_argument('--batch_size',
                            type=int,
                            default=512,
                            help='Minibatch size.')
        parser.add_argument('--identity_dim',
                            type=int,
                            default=0,
                            help='Set to positive value to use identity ' +
                            'embedding features of that dimension.')
        parser.add_argument('--save_embeddings',
                            action="store_true",
                            default=False,
                            help='Whether to save embeddings for all nodes ' +
                            'after training')
        parser.add_argument('--base_log_dir',
                            default='../../../data/processed/graphsage/',
                            help='Base directory for logging and saving ' +
                            'embeddings')
        parser.add_argument('--validate_iter',
                            type=int,
                            default=5000,
                            help='How often to run a validation minibatch.')
        parser.add_argument('--validate_batch_size',
                            type=int,
                            default=256,
                            help='How many nodes per validation sample.')
        parser.add_argument('--gpu',
                            type=int,
                            default=0,
                            help='Which gpu to use.')
        parser.add_argument('--print_every',
                            type=int,
                            default=50,
                            help='How often to print training info.')
        parser.add_argument('--max_total_steps',
                            type=int,
                            default=10**10,
                            help='Maximum total number of iterations.')
        parser.add_argument('--log_device_placement',
                            action="store_true",
                            default=False,
                            help='Whether to log device placement.')
        parser.add_argument('--recs',
                            type=int,
                            default=10,
                            help='Number of recommendations.')
        args = parser.parse_args()

        print("Starting evaluation...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        print("Using GPU {}.".format(str(args.gpu)))

        from GraphSAGEClassifierConcatEvaluation import GraphSAGEClassifierConcatEvaluation
        evaluation_model = GraphSAGEClassifierConcatEvaluation(
            args.classifier_name, args.embedding_type, args.model_name,
            args.model_size, args.learning_rate, args.gpu, args.recs)

        # Initialize GraphSAGE models
        graphsage_model_citations = UnsupervisedModel(
            args.train_prefix_citations, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)
        graphsage_model_authors = UnsupervisedModel(
            args.train_prefix_authors, args.model_name, args.model_size,
            args.learning_rate, args.epochs, args.dropout, args.weight_decay,
            args.max_degree, args.samples_1, args.samples_2, args.dim_1,
            args.dim_2, args.random_context, args.neg_sample_size,
            args.batch_size, args.identity_dim, args.save_embeddings,
            args.base_log_dir, args.validate_iter, args.validate_batch_size,
            args.gpu, args.print_every, args.max_total_steps,
            args.log_device_placement)

        # Train model if needed:
        if not evaluation_model._has_persistent_model():
            print("Classifier not trained yet. Training now...")
            timer = Timer()
            timer.tic()
            evaluation_model.train(graphsage_model_citations,
                                   graphsage_model_authors)
            print("Training finished.")
            timer.toc()
        else:
            evaluation_model._load_model_classifier()

        # Load test data
        print("Loading test data...")
        query_test, query_test_authors, truth = evaluation_model.load_data()
        print("Loaded.")

        # Infer embeddings
        print("Inferring embeddings for citations graph.")
        queue_citations = mp.Queue()
        process_citations = mp.Process(
            target=evaluation_model.infer_embeddings,
            args=(query_test, None, "citations", graphsage_model_citations,
                  args.model_checkpoint_citations, queue_citations))
        process_citations.start()
        embeddings_citations = queue_citations.get()
        process_citations.join()
        process_citations.terminate()

        print("Inferring embeddings for the authors graph.")
        queue_authors = mp.Queue()
        process_authors = mp.Process(target=evaluation_model.infer_embeddings,
                                     args=(query_test, query_test_authors,
                                           "authors", graphsage_model_authors,
                                           args.model_checkpoint_authors,
                                           queue_authors))
        process_authors.start()
        embeddings_authors = queue_authors.get()
        process_authors.join()
        process_authors.terminate()

        # Concatenate embeddings
        test_embeddings = np.concatenate(
            (embeddings_citations, embeddings_authors), axis=1)

        print("Computing predictions...")
        recommendation = evaluation_model.compute_predictions(test_embeddings)
        print("Predictions computed.")

        # Evaluate
        print("Evaluating...")
        evaluation = EvaluationContainer()
        evaluation.evaluate(recommendation, truth)
        print("Finished.")
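
The two infer_embeddings calls above are deliberately wrapped in separate processes: each child builds its own TensorFlow session, and letting the process exit is the simplest way to hand back that session's GPU memory before the next model is restored, with the result returned through an mp.Queue. Below is a minimal sketch of the same pattern with a placeholder worker instead of the project's infer_embeddings (the names _infer_worker, num_nodes and dim are illustrative):

import multiprocessing as mp

import numpy as np


def _infer_worker(num_nodes, dim, queue):
    # Stand-in for a heavyweight inference step (e.g. restoring a model
    # checkpoint); everything allocated here is freed when the child exits.
    embeddings = np.random.rand(num_nodes, dim)
    queue.put(embeddings)


if __name__ == "__main__":
    queue = mp.Queue()
    process = mp.Process(target=_infer_worker, args=(4, 8, queue))
    process.start()
    embeddings = queue.get()  # read before join() so the queue never blocks
    process.join()
    print(embeddings.shape)   # (4, 8)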
Example #14
class Processor:
    def __init__(self, embedding_type, graph_type, threshold=2, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "graphsage", self.embedding_type,
            self.graph_type)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self, num_walks=50):
        self.prefix = "train_val"
        self.timer.tic()
        print("Creating training files.")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        # Create and save graph
        self.G = nx.Graph()

        # Add nodes and edges
        print("Adding training nodes.")
        self._add_nodes(df_train, test=False, val=False)

        print("Adding training edges.")
        if self.graph_type == "citations" or self.graph_type == "authors":
            if self.graph_type == "authors":
                df_train = d_train.author_names().data
            self._add_edges(df_train)
        elif self.graph_type == "citations_authors_het_edges":
            # Adding heterogeneous edges
            # Add citation edges
            self._add_weighted_edges_citations(df_train)
            # Add author edges
            df_train = d_train.author_names().data
            self._add_weighted_edges_authors(df_train)
        else:
            raise KeyError("Graph type unknown.")

        print("Adding validation nodes.")
        self._add_nodes(df_validation, test=False, val=True)

        print("Adding validation edges.")
        if self.graph_type == "citations" or self.graph_type == "authors":
            if self.graph_type == "authors":
                df_validation = d_val.author_names().data
            self._add_edges(df_validation)
        elif self.graph_type == "citations_authors_het_edges":
            # Add citation edges
            self._add_weighted_edges_citations(df_validation)
            # Add author edges
            df_validation = d_val.author_names().data
            self._add_weighted_edges_authors(df_validation)
        else:
            raise KeyError("Graph type unknown.")

        if self.graph_type == "citations_authors_het_edges":
            # Remove edges with weight lower than threshold
            remove_edges = [(u, v) for u, v, e in self.G.edges(data=True)
                            if e["weight"] < self.threshold]
            self.G.remove_edges_from(remove_edges)
            # Clear edge attributes
            for n1, n2, d in self.G.edges(data=True):
                d.clear()
            print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

        print("Removing nodes without features.")
        for node in list(self.G.nodes()):
            if "feature" not in self.G.nodes[node].keys():
                self.G.remove_node(node)
        print("Nodes in graph: {}, edges in graph: {}.\n".format(
            self.G.number_of_nodes(), self.G.number_of_edges()))

        print("Saving graph to disk.")
        G_data = json_graph.node_link_data(self.G)
        with open(os.path.join(self.path_persistent, self.prefix + "-G.json"),
                  "w") as f:
            f.write(json.dumps(G_data))

        # Create and save class map
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        data = pd.concat((df_train, df_validation), ignore_index=True)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        self._create_class_map(data)

        # Create and save id map
        self._create_id_map()

        # Create and save features
        self._create_features()

        # Perform and save random walks
        nodes = [
            n for n in list(self.G.nodes())
            if not self.G.nodes[n]["val"] and not self.G.nodes[n]["test"]
        ]
        subgraph = self.G.subgraph(nodes)
        self._run_random_walks(subgraph, nodes, num_walks)

        print("Finished creating training files.")
        self.timer.toc()

        # print some statistics
        self._get_stats()

        # Plot degree histogram
        self._degree_histogram()

    def test_data(self,
                  df_test,
                  G_train,
                  authors_df=None,
                  class_map=None,
                  normalize=True):
        # TO DO: Add case for authors
        self.prefix = "test"
        print("Preprocessing data...")
        self.G = G_train
        print("Training graph has {} nodes and {} edges.\n".format(
            self.G.number_of_nodes(), self.G.number_of_edges()))

        # Add nodes and edges
        print("Adding test nodes.")
        self._add_nodes(df_test, test=True, val=False)

        print("Adding test edges.")
        if self.graph_type == "citations" or self.graph_type == "authors":
            if self.graph_type == "authors":
                if authors_df is not None:
                    df_test = pd.merge(df_test,
                                       authors_df,
                                       how="left",
                                       on="chapter")
                else:
                    raise ValueError("Chapter authors are missing.")
            self._add_edges(df_test)
        elif self.graph_type == "citations_authors_het_edges":
            # Adding heterogeneous edges
            # Add citation edges
            self._add_weighted_edges_citations(df_test)

            # Add author edges
            if authors_df is not None:
                df_test = pd.merge(df_test,
                                   authors_df,
                                   how="left",
                                   on="chapter")
            else:
                raise ValueError("Chapter authors are missing.")
            self._add_weighted_edges_authors(df_test)

            # Remove edges with weight lower than threshold
            remove_edges = [
                (u, v) for u, v, e in self.G.edges(data=True)
                if "weight" in e.keys() and e["weight"] < self.threshold
            ]
            self.G.remove_edges_from(remove_edges)

            # Clear edge attributes
            for n1, n2, d in self.G.edges(data=True):
                d.clear()
            print("Edges in graph: {}.\n".format(self.G.number_of_edges()))
        else:
            raise KeyError("Graph type unknown.")

        print("Removing nodes without features.")
        for node in list(self.G.nodes()):
            if "feature" not in self.G.nodes[node].keys():
                self.G.remove_node(node)
        print("Nodes in graph: {}, edges in graph: {}.\n".format(
            self.G.number_of_nodes(), self.G.number_of_edges()))

        # Remove all nodes that do not have val/test annotations
        broken_count = 0
        for node in list(self.G.nodes()):
            if ('val' not in self.G.nodes[node]
                    or 'test' not in self.G.nodes[node]):
                self.G.remove_node(node)
                broken_count += 1
        print(
            "Removed {} nodes that lacked proper annotations due to networkx versioning issues."
            .format(broken_count))

        # Make sure the graph has edge train_removed annotations
        for edge in self.G.edges():
            if (self.G.nodes[edge[0]]['val'] or self.G.nodes[edge[1]]['val']
                    or self.G.nodes[edge[0]]['test']
                    or self.G.nodes[edge[1]]['test']):
                self.G[edge[0]][edge[1]]['train_removed'] = True
            else:
                self.G[edge[0]][edge[1]]['train_removed'] = False

        # Create and process id map
        id_map = self._create_id_map()

        if isinstance(list(self.G.nodes)[0], int):
            conversion = lambda n: int(n)
        else:
            conversion = lambda n: n
        id_map = {conversion(k): int(v) for k, v in id_map.items()}

        # Create and process features
        features = self._create_features()

        if normalize:
            train_ids = np.array([
                id_map[n] for n in self.G.nodes()
                if not self.G.nodes[n]['val'] and not self.G.nodes[n]['test']
            ])
            train_feats = features[train_ids]
            scaler = StandardScaler()
            scaler.fit(train_feats)
            features = scaler.transform(features)
        print("Finished preprocessing data.")

        # print some statistics
        self._get_stats()

        # Plot degree histogram
        self._degree_histogram()

        # Add "fake" temporary classes for test nodes in class map
        if class_map is not None:
            test_nodes = [n for n in self.G.nodes() if self.G.nodes[n]['test']]
            for test_node in test_nodes:
                class_map[test_node] = np.zeros(
                    (len(class_map[list(class_map.keys())[0]]), ), dtype=int)
            return self.G, features, id_map, class_map

        return self.G, features, id_map

    def _add_nodes(self, data, test=False, val=False):
        with tqdm(desc="Adding nodes: ", total=len(data), unit="node") as pbar:
            for idx in range(len(data)):
                self.G.add_node(
                    data.chapter.iloc[idx],
                    test=test,
                    feature=np.concatenate(
                        (self.embeddings_parser.embed_sequence(
                            data.chapter_title.iloc[idx], self.embedding_type),
                         self.embeddings_parser.embed_sequence(
                             data.chapter_abstract.iloc[idx],
                             self.embedding_type)),
                        axis=0).tolist(),
                    val=val)
                pbar.update(1)
        print("Nodes in graph: {}.\n".format(self.G.number_of_nodes()))

    def _add_edges(self, data):
        if self.graph_type == "citations":
            self._add_edges_citations(data)
        elif self.graph_type == "authors":
            self._add_edges_authors(data)
        else:
            raise KeyError("Graph type unknown.")

    def _add_edges_citations(self, data):
        """Adds edges between papers that share a citation.
        """
        with tqdm(desc="Adding edges: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                self.G.add_edges_from([
                    (data.chapter.iloc[idx],
                     data.chapter_citations.iloc[idx][i])
                    for i in range(len(data.chapter_citations.iloc[idx]))
                ])
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

    def _add_weighted_edges_citations(self, data):
        """Adds edges between papers that share a citation.
        """
        with tqdm(desc="Adding edges: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                self.G.add_edges_from(
                    [(data.chapter.iloc[idx],
                      data.chapter_citations.iloc[idx][i])
                     for i in range(len(data.chapter_citations.iloc[idx]))],
                    weight=100)
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

    def _add_edges_authors(self, data):
        """Adds edges between papers sharing an author.
        """
        data_grouped = data.groupby("author_name")["chapter"].agg(
            list).reset_index()
        with tqdm(desc="Adding edges: ", total=len(data_grouped)) as pbar:
            for idx in range(len(data_grouped)):
                self.G.add_edges_from(
                    combinations(data_grouped.iloc[idx].chapter, 2))
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))

    def _add_weighted_edges_authors(self, data):
        """Adds edges between papers sharing an author.
        """
        data_grouped = data.groupby("author_name")["chapter"].agg(
            list).reset_index()
        with tqdm(desc="Adding edges: ", total=len(data_grouped)) as pbar:
            for idx in range(len(data_grouped)):
                edges = combinations(data_grouped.iloc[idx].chapter, 2)
                for edge in edges:
                    if self.G.has_edge(edge[0], edge[1]):
                        self.G[edge[0]][edge[1]]["weight"] += 1
                    else:
                        self.G.add_edge(edge[0], edge[1], weight=1)
                pbar.update(1)
        print("Edges in graph: {}.\n".format(self.G.number_of_edges()))
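
_add_weighted_edges_authors above accumulates one unit of weight per shared author, while _add_weighted_edges_citations assigns a fixed weight of 100 so that citation links always survive the threshold filter applied in training_data and test_data above. A self-contained toy version of that accumulate-then-threshold step (the graph and threshold values below are made up for illustration):

from itertools import combinations

import networkx as nx

G = nx.Graph()
author_to_papers = {"a1": ["p1", "p2", "p3"], "a2": ["p1", "p2"]}
threshold = 2

# Accumulate one unit of weight per shared author.
for papers in author_to_papers.values():
    for u, v in combinations(papers, 2):
        if G.has_edge(u, v):
            G[u][v]["weight"] += 1
        else:
            G.add_edge(u, v, weight=1)

# Keep only pairs of papers that share at least `threshold` authors.
weak = [(u, v) for u, v, d in G.edges(data=True) if d["weight"] < threshold]
G.remove_edges_from(weak)
print(sorted(G.edges()))  # [('p1', 'p2')]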

    def _create_class_map(self, data):
        print("Creating class map.")
        nodes = list(self.G.nodes)
        class_map = {
            nodes[i]: [
                int(j) for j in list(
                    self.label_encoder.transform(
                        np.array(data[data.chapter == nodes[i]].
                                 conferenceseries).reshape(-1, 1))[0])
            ]
            for i in range(len(nodes))
        }
        print("Saving class map to disk.")
        with open(
                os.path.join(self.path_persistent,
                             self.prefix + "-class_map.json"), "w") as f:
            f.write(json.dumps(class_map))

        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)

    def _create_id_map(self):
        if self.prefix == "train_val":
            print("Creating id map.")

        nodes = list(self.G.nodes)
        id_map = {nodes[i]: i for i in range(len(nodes))}

        if self.prefix == "test":
            return id_map
        else:
            print("Saving id map to disk.")
            with open(
                    os.path.join(self.path_persistent,
                                 self.prefix + "-id_map.json"), "w") as f:
                f.write(json.dumps(id_map))

    def _create_features(self):
        if self.prefix == "train_val":
            print("Creating features.")

        features = np.array(
            [self.G.nodes[node]["feature"] for node in list(self.G.nodes)])

        if self.prefix == "test":
            return features
        else:
            print("Saving features to disk.")
            np.save(
                os.path.join(self.path_persistent, self.prefix + "-feats.npy"),
                features)

    def _run_random_walks(self, graph, nodes, num_walks):
        print("Running random walks.")
        walks = run_random_walks(graph, nodes, num_walks=num_walks)
        print("Saving random walks to disk.")
        with open(
                os.path.join(self.path_persistent, self.prefix + "-walks.txt"),
                "w") as fp:
            fp.write("\n".join([str(w[0]) + "\t" + str(w[1]) for w in walks]))
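
_run_random_walks delegates to an imported run_random_walks helper; in the reference GraphSAGE preprocessing that helper performs short uniform random walks from every training node and records node co-occurrence pairs, which is what the walks file above stores. The sketch below is only an assumption about that helper, not its actual implementation:

import random


def run_random_walks_sketch(G, nodes, num_walks=50, walk_len=5):
    # Uniform random walks; returns (start_node, visited_node) pairs.
    pairs = []
    for node in nodes:
        if G.degree(node) == 0:
            continue
        for _ in range(num_walks):
            curr = node
            for _ in range(walk_len):
                neighbours = list(G.neighbors(curr))
                if not neighbours:
                    break
                curr = random.choice(neighbours)
                if curr != node:
                    pairs.append((node, curr))
    return pairs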

    def _get_stats(self):
        degree_sequence = sorted([d for n, d in self.G.degree()], reverse=True)
        degree_count = Counter(degree_sequence)

        with open(
                os.path.join(self.path_persistent, self.prefix + "-stats.txt"),
                "w") as fp:
            self._print(
                "Number of nodes in the graph: {}\n".format(
                    self.G.number_of_nodes()), fp)
            self._print(
                "Number of edges in the graph: {}\n".format(
                    self.G.number_of_edges()), fp)
            self._print(
                "The graph is connected: {}\n".format(nx.is_connected(self.G)),
                fp)
            self._print(
                "Number of connected components: {}\n".format(
                    nx.number_connected_components(self.G)), fp)
            self._print(
                "Number of self-loops: {}\n".format(
                    nx.number_of_selfloops(self.G)), fp)
            self._print("Maximum degree: {}\n".format(max(degree_count)), fp)
            self._print("Minimum degree: {}\n".format(min(degree_count)), fp)
            self._print(
                "Average degree: {}\n".format(
                    sum(degree_sequence) / len(self.G)), fp)

    def _degree_histogram(self):
        # Plot degree histogram
        degree_sequence = sorted([d for n, d in self.G.degree()], reverse=True)
        degree_count = Counter(degree_sequence)
        deg, cnt = zip(*degree_count.items())

        fig, ax = plt.subplots()
        plt.bar(deg, cnt, width=0.80, color='b')
        plt.title("Degree Histogram")
        plt.ylabel("Count")
        plt.xlabel("Degree")
        ax.set_xticks([d + 0.4 for d in deg])
        ax.set_xticklabels(deg)

        plt.savefig(os.path.join(self.path_persistent,
                                 self.prefix + "-degree_histogram.png"),
                    bbox_inches="tight")

    def _print(self, text, f):
        print(text)
        f.write(text)

def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the ' +
                        'training data.')
    parser.add_argument('--threshold',
                        type=int,
                        default=2,
                        help='Threshold for edge weights in ' +
                        'heterogeneous graph.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset,
                          args.threshold, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
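
For reference, training_data above leaves the usual GraphSAGE input set under path_persistent: <prefix>-G.json, <prefix>-id_map.json, <prefix>-class_map.json, <prefix>-feats.npy and <prefix>-walks.txt. A hedged sketch of reading those files back (the helper name and signature are illustrative, not part of the project):

import json
import os

import numpy as np
from networkx.readwrite import json_graph


def load_graphsage_inputs(path_persistent, prefix="train_val"):
    # Reload the graph, id map, class map and feature matrix written above.
    with open(os.path.join(path_persistent, prefix + "-G.json")) as f:
        G = json_graph.node_link_graph(json.load(f))
    with open(os.path.join(path_persistent, prefix + "-id_map.json")) as f:
        id_map = json.load(f)
    with open(os.path.join(path_persistent, prefix + "-class_map.json")) as f:
        class_map = json.load(f)
    feats = np.load(os.path.join(path_persistent, prefix + "-feats.npy"))
    return G, id_map, class_map, feats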
Example #15
class Processor:
    def __init__(self, embedding_type, dataset, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "gat", self.embedding_type, self.dataset)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self):
        self.timer.tic()
        print("Creating training files.\n")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create file with feature vectors for both training and validation
        # data (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training and validation data.")
        train_val_features = self._create_features(train_val_data)
        print("Created.")
        print("Saving to disk...")
        allx_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".allx")
        with open(allx_file, "wb") as f:
            pickle.dump(train_val_features, f)
        print("Saved.\n")

        # Create file with feature vectors only for training data
        # (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training data.")
        train_features = train_val_features[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        x_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".x")
        with open(x_file, "wb") as f:
            pickle.dump(train_features, f)
        print("Saved.\n")

        # Create file with the labels for the training and validation data
        # (as a numpy.ndarray object)
        print("Creating labels for training and validation data.")
        self._train_label_encoder(train_val_data)
        train_val_labels = self.label_encoder.transform(
            np.array(train_val_data.conferenceseries).reshape(-1, 1))
        print("Created.")
        print("Saving to disk...")
        ally_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".ally")
        with open(ally_file, "wb") as f:
            pickle.dump(train_val_labels, f)
        print("Saved.\n")

        # Create file with the labels for the training data
        # (as a numpy.ndarray object)
        print("Creating labels for training data.")
        train_labels = train_val_labels[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        y_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".y")
        with open(y_file, "wb") as f:
            pickle.dump(train_labels, f)
        print("Saved.\n")

        # Create a dict in the format {index: [index_of_neighbor_nodes]}
        # (as a collections.defaultdict object)
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")

        print("Statistics")
        print("\tTraining data features: {}.".format(train_features.shape))
        print("\tTraining data labels: {}.".format(len(train_labels)))
        print("\tTraining and validation data features: {}.".format(
            train_val_features.shape))
        print("\tTraining and validation data labels: {}.".format(
            len(train_val_labels)))
        print("\tGraph size: {}.".format(len(graph)))

    def _create_features(self, data):
        features = []
        with tqdm(desc="Creating features: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                features.append(
                    np.concatenate((self.embeddings_parser.embed_sequence(
                        data.chapter_title.iloc[idx], self.embedding_type),
                                    self.embeddings_parser.embed_sequence(
                                        data.chapter_abstract.iloc[idx],
                                        self.embedding_type)),
                                   axis=0).tolist())
                pbar.update(1)
        return sp.csr_matrix(np.array(features))

    def _train_label_encoder(self, data):
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)

    def test_data(self, df_test, train_features, train_labels,
                  train_val_features, train_val_labels, graph):
        print("Preprocessing data...")
        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create the indices of test instances in graph (as a list object)
        test_indices = list(df_test.index)

        # Create "fake" temporary labels for test data
        test_labels = np.zeros((len(df_test), len(train_val_labels[0])),
                               dtype=int)

        # Update graph with test data
        print("Updating graph information...")
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        print("Updated.")

        # Create feature vectors of test instances
        print("Creating features for test data...")
        test_features = self._create_features(df_test)
        print("Created.")

        max_degree = len(max(graph.values(), key=len))
        test_idx_range = np.sort(test_indices)
        features = sp.vstack((train_val_features, test_features)).tolil()
        features[test_indices, :] = features[test_idx_range, :]
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((train_val_labels, test_labels))
        labels[test_indices, :] = labels[test_idx_range, :]

        idx_test = test_idx_range.tolist()
        idx_train = range(len(train_labels))
        idx_val = range(len(train_labels), len(train_val_labels))

        train_mask = sample_mask(idx_train, labels.shape[0])
        val_mask = sample_mask(idx_val, labels.shape[0])
        test_mask = sample_mask(idx_test, labels.shape[0])

        y_train = np.zeros(labels.shape)
        y_val = np.zeros(labels.shape)
        y_test = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_val[val_mask, :] = labels[val_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]
        print("Finished preprocessing data.\n")

        print("Adjacency matrix shape: {}.".format(adj.shape))
        print("Features matrix shape: {}.".format(features.shape))
        print("Graph size: {}.".format(len(graph)))
        print("Max degree: {}.\n".format(max_degree))

        dataset = [adj, features, y_train, y_test, train_mask, test_mask]
        prepared_test_data = self._prepare_test_data(dataset, max_degree)
        return prepared_test_data, max_degree
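
test_data relies on an imported sample_mask helper to turn index ranges into boolean row masks over the stacked label matrix. The definition below follows the common GCN-style implementation and is only an assumption about what that helper does:

import numpy as np


def sample_mask_sketch(idx, n):
    # Boolean mask of length n that is True at the given row indices.
    mask = np.zeros(n, dtype=bool)
    mask[np.asarray(list(idx), dtype=int)] = True
    return mask


train_mask = sample_mask_sketch(range(3), 5)
print(train_mask)  # [ True  True  True False False]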

    def _prepare_test_data(self, dataset, max_degree):
        print("Preparing test data...")
        adj, features, y_train, y_test, train_mask, test_mask = dataset
        train_index = np.where(train_mask)[0]
        adj_train = adj[train_index, :][:, train_index]
        y_train = y_train[train_index]
        test_index = np.where(test_mask)[0]
        y_test = y_test[test_index]

        num_train = adj_train.shape[0]
        input_dim = features.shape[1]

        features = nontuple_preprocess_features(features).todense()
        train_features = features[train_index]

        norm_adj_train = nontuple_preprocess_adj(adj_train)
        norm_adj = nontuple_preprocess_adj(adj)

        adj_train, adj_val_train = compute_adjlist(norm_adj_train, max_degree)
        train_features = np.concatenate(
            (train_features, np.zeros((1, input_dim))))
        print("Prepared.\n")
        return norm_adj, adj_train, adj_val_train, features, train_features, y_train, y_test, test_index

def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the ' +
                        'training data.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
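
This Processor serializes planetoid-style pickles named ind.<dataset>.x, .y, .allx, .ally and .graph. A minimal sketch of loading them back (the helper name is illustrative; paths assume the same path_persistent layout used above):

import os
import pickle


def load_planetoid_files(path_persistent, dataset):
    # Reload the pickled feature matrices, label arrays and graph dict.
    objects = {}
    for suffix in ("x", "y", "allx", "ally", "graph"):
        file_path = os.path.join(path_persistent,
                                 "ind." + dataset + "." + suffix)
        with open(file_path, "rb") as f:
            objects[suffix] = pickle.load(f)
    return objects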
class Processor:
    def __init__(self,
                 embedding_type,
                 dataset,
                 graph_type="directed",
                 threshold=2,
                 gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.dataset = dataset
        self.graph_type = graph_type
        self.threshold = threshold
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "gat", self.embedding_type, self.dataset)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self):
        self.timer.tic()
        print("Creating training files.\n")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create file with feature vectors for both training and validation
        # data (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training and validation data.")
        train_val_features = self._create_features(train_val_data)
        print("Created.")
        print("Saving to disk...")
        allx_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".allx")
        with open(allx_file, "wb") as f:
            pickle.dump(train_val_features, f)
        print("Saved.\n")

        # Create file with feature vectors only for training data
        # (as a scipy.sparse.csr.csr_matrix object)
        print("Creating feature vectors for training data.")
        train_features = train_val_features[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        x_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".x")
        with open(x_file, "wb") as f:
            pickle.dump(train_features, f)
        print("Saved.\n")

        # Create file with the labels for the training and validation data
        # (as a numpy.ndarray object)
        print("Creating labels for training and validation data.")
        self._train_label_encoder(train_val_data)
        train_val_labels = self.label_encoder.transform(
            np.array(train_val_data.conferenceseries).reshape(-1, 1))
        print("Created.")
        print("Saving to disk...")
        ally_file = os.path.join(self.path_persistent,
                                 "ind." + self.dataset + ".ally")
        with open(ally_file, "wb") as f:
            pickle.dump(train_val_labels, f)
        print("Saved.\n")

        # Create file with the labels for the training data
        # (as a numpy.ndarray object)
        print("Creating labels for training data.")
        train_labels = train_val_labels[:len(df_train)]
        print("Created.")
        print("Saving to disk...")
        y_file = os.path.join(self.path_persistent,
                              "ind." + self.dataset + ".y")
        with open(y_file, "wb") as f:
            pickle.dump(train_labels, f)
        print("Saved.\n")

        # Create a dict in the format {index: [index_of_neighbor_nodes]}
        # (as a collections.defaultdict object)
        if self.dataset == "citations":
            if self.graph_type == "directed":
                graph = self._create_directed_graph(train_val_data)
            else:
                graph = self._create_undirected_graph(train_val_data)
        elif self.dataset == "citations_authors_het_edges":
            df_train_authors = d_train.author_names().data
            df_val_authors = d_val.author_names().data
            train_val_authors_data = pd.concat(
                (df_train_authors, df_val_authors),
                axis=0).reset_index(drop=True)
            data_authors = train_val_authors_data.groupby(
                "author_name")["chapter"].agg(list).reset_index()
            if self.graph_type == "directed":
                graph = self._create_heterogeneous_directed_graph(
                    train_val_data, data_authors)
            else:
                raise ValueError("Incompatible graph type: only a directed " +
                                 "graph is supported.")
        print("Finished creating training files.\n")

        print("Statistics")
        print("\tTraining data features: {}.".format(train_features.shape))
        print("\tTraining data labels: {}.".format(len(train_labels)))
        print("\tTraining and validation data features: {}.".format(
            train_val_features.shape))
        print("\tTraining and validation data labels: {}.".format(
            len(train_val_labels)))
        print("\tGraph size: {}.".format(len(graph)))
        print("\tMax node degree: {}.".format(len(max(graph.values(),
                                                      key=len))))

    def _create_directed_graph(self, train_val_data):
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph_directed")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _create_heterogeneous_directed_graph(self, train_val_data,
                                             data_authors):
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        # Add citation edges between papers
        with tqdm(desc="Adding citation neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                graph[idx] = [(i[0], 100) for i in citations_indices if i]
                pbar.update(1)

        # Add edges between papers if they share an author
        with tqdm(desc="Adding author neighbours: ",
                  total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    train_val_data[train_val_data.chapter ==
                                   paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
                for edge in edges:
                    graph[edge[0]].append((edge[1], 1))
                pbar.update(1)

        # Remove edges whose accumulated weight is below the threshold
        for key in graph.keys():
            d = defaultdict(int)
            for x, y in graph[key]:
                d[x] += y
            graph[key] = [k for k, v in d.items() if v >= self.threshold]
        print("Created.")

        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph_directed")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _create_undirected_graph(self, train_val_data):
        print("Creating dictionary of neighbours.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ",
                  total=len(train_val_data)) as pbar:
            for idx in range(len(train_val_data)):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in train_val_data.chapter_citations.iloc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent,
                                  "ind." + self.dataset + ".graph")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _create_features(self, data):
        features = []
        with tqdm(desc="Creating features: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                features.append(
                    np.concatenate((self.embeddings_parser.embed_sequence(
                        data.chapter_title.iloc[idx], self.embedding_type),
                                    self.embeddings_parser.embed_sequence(
                                        data.chapter_abstract.iloc[idx],
                                        self.embedding_type)),
                                   axis=0).tolist())
                pbar.update(1)
        return sp.csr_matrix(np.array(features))

    def _train_label_encoder(self, data):
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)

    def _update_directed_graph(self, graph, train_val_data, df_test):
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        return graph

    def _update_heterogeneous_directed_graph(self, graph, train_val_data,
                                             df_test, data_authors):
        with tqdm(desc="Adding citation neighbours: ",
                  total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                graph[idx] = [(i[0], 100) for i in citations_indices if i]
                pbar.update(1)

        with tqdm(desc="Adding author neighbours: ",
                  total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    train_val_data[train_val_data.chapter ==
                                   paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
                for edge in edges:
                    graph[edge[0]].append((edge[1], 1))
                pbar.update(1)

        for key in graph.keys():
            d = defaultdict(int)
            for e in reversed(graph[key]):
                if type(e) is tuple:
                    if e[0] in d.keys():
                        d[e[0]] += e[1]
                    else:
                        d[e[0]] = e[1]
                graph[key].remove(e)
            graph[key].extend([k for k, v in d.items() if v >= self.threshold])

        return graph

    def _update_undirected_graph(self, graph, train_val_data, df_test):
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    train_val_data[train_val_data.chapter ==
                                   citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                neighbours = [c[0] for c in citations_indices if c]
                graph[idx].extend(neighbours)
                for node in neighbours:
                    graph[node].append(idx)
                pbar.update(1)
        with tqdm(desc="Removing duplicates: ",
                  total=len(graph.keys())) as pbar:
            for idx in range(len(graph.keys())):
                graph[idx] = list(set(graph[idx]))
                pbar.update(1)
        return graph

    def test_data(self,
                  df_test,
                  train_features,
                  train_labels,
                  train_val_features,
                  train_val_labels,
                  graph,
                  authors_df=None):
        print("Preprocessing data...")
        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        # Create the indices of test instances in graph (as a list object)
        test_indices = list(df_test.index)

        # Create "fake" temporary labels for test data
        test_labels = np.zeros((len(df_test), len(train_val_labels[0])),
                               dtype=int)

        # Update graph with test data
        print("Updating graph information...")
        if self.dataset == "citations":
            if self.graph_type == "directed":
                graph = self._update_directed_graph(graph, train_val_data,
                                                    df_test)
            else:
                graph = self._update_undirected_graph(graph, train_val_data,
                                                      df_test)
        elif self.dataset == "citations_authors_het_edges":
            data_authors = authors_df.groupby("author_name")["chapter"].agg(
                list).reset_index()
            if self.graph_type == "directed":
                graph = self._update_heterogeneous_directed_graph(
                    graph, train_val_data, df_test, data_authors)
            else:
                raise ValueError("Incompatible graph type: only a directed " +
                                 "graph is supported.")
        print("Updated.")

        # Create feature vectors of test instances
        print("Creating features for test data...")
        test_features = self._create_features(df_test)
        print("Created.")

        test_idx_range = np.sort(test_indices)
        features = sp.vstack((train_val_features, test_features)).tolil()
        features[test_indices, :] = features[test_idx_range, :]
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

        labels = np.vstack((train_val_labels, test_labels))
        labels[test_indices, :] = labels[test_idx_range, :]

        idx_test = test_idx_range.tolist()
        idx_train = range(len(train_labels))
        idx_val = range(len(train_labels), len(train_val_labels))

        train_mask = sample_mask(idx_train, labels.shape[0])
        val_mask = sample_mask(idx_val, labels.shape[0])
        test_mask = sample_mask(idx_test, labels.shape[0])

        y_train = np.zeros(labels.shape)
        y_val = np.zeros(labels.shape)
        y_test = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_val[val_mask, :] = labels[val_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]
        print("Finished preprocessing data.")

        print("Adjacency matrix shape: {}.".format(adj.shape))
        print("Features matrix shape: {}.".format(features.shape))
        print("Graph size: {}.".format(len(graph)))

        return adj, features, y_train, y_test, train_mask, test_mask

def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('dataset',
                        help='Name of the object file that stores the ' +
                        'training data.')
    parser.add_argument('--graph_type',
                        choices=["directed", "undirected"],
                        default="directed",
                        help='The type of graph used ' +
                        '(directed vs. undirected).')
    parser.add_argument('--threshold',
                        type=int,
                        default=2,
                        help='Threshold for edge weights in ' +
                        'heterogeneous graph.')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.dataset,
                          args.graph_type, args.threshold, args.gpu)
    processor.training_data()
    print("Finished.")


if __name__ == "__main__":
    main()
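
_create_heterogeneous_directed_graph above stores each neighbour as an (index, weight) pair, weight 100 for a citation and 1 per shared author, then sums the weights per neighbour and keeps only those reaching self.threshold. The toy example below reproduces just that aggregation step (the indices, weights and threshold are made up):

from collections import defaultdict

graph = defaultdict(list)
graph[0] = [(1, 100), (2, 1), (2, 1), (3, 1)]  # citation to 1, shared authors with 2 and 3
threshold = 2

for key in graph:
    totals = defaultdict(int)
    for neighbour, weight in graph[key]:
        totals[neighbour] += weight
    graph[key] = [n for n, w in totals.items() if w >= threshold]

print(dict(graph))  # {0: [1, 2]}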
class Processor:
    def __init__(self, embedding_type, gpu=0):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

        self.embedding_type = embedding_type
        self.embeddings_parser = EmbeddingsParser(gpu)
        self.timer = Timer()
        self.path_persistent = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..",
            "data", "interim", "han", self.embedding_type)
        if not os.path.exists(self.path_persistent):
            os.makedirs(self.path_persistent)

    def training_data(self):
        self.timer.tic()
        print("Creating training files.\n")

        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)

        print("Creating index files for training and validation data.")
        train_idx = np.asarray(list(train_val_data.index))[:len(df_train)]
        train_idx = np.asarray([train_idx])
        val_idx = np.asarray(list(train_val_data.index))[len(df_train):]
        val_idx = np.asarray([val_idx])
        print("Created.")
        print("Saving to disk...")
        train_idx_file = os.path.join(self.path_persistent, "train_idx.pkl")
        val_idx_file = os.path.join(self.path_persistent, "val_idx.pkl")
        with open(train_idx_file, "wb") as f:
            pickle.dump(train_idx, f)
        with open(val_idx_file, "wb") as f:
            pickle.dump(val_idx, f)
        print("Saved.")

        print("Creating labels for training and validation data.")
        self._train_label_encoder(train_val_data)
        train_val_labels = self.label_encoder.transform(
            np.array(train_val_data.conferenceseries).reshape(-1, 1))
        print("Created.")
        print("Saving to disk...")
        labels_file = os.path.join(self.path_persistent, "labels.pkl")
        with open(labels_file, "wb") as f:
            pickle.dump(train_val_labels, f)
        print("Saved.\n")

        print("Creating feature vectors for training and validation data.")
        train_val_features = self._create_features(train_val_data)
        print("Created.")
        print("Saving to disk...")
        features_file = os.path.join(self.path_persistent, "features.pkl")
        with open(features_file, "wb") as f:
            pickle.dump(train_val_features, f)
        print("Saved.\n")

        df_train_authors = d_train.author_names().data
        df_val_authors = d_val.author_names().data
        train_val_authors_data = pd.concat((df_train_authors, df_val_authors),
                                           axis=0).reset_index(drop=True)
        data_authors = train_val_authors_data.groupby(
            "author_name")["chapter"].agg(list).reset_index()

        print("Creating adjacency matrices...")
        PCP = self._create_PCP_adjacency(train_val_data)
        PAP = self._create_PAP_adjacency(train_val_data, data_authors)
        print("Created.")

        print("Finished creating training files.\n")

        print("Statistics")
        print("\tTraining and validation data features: {}.".format(
            train_val_features.shape))
        print("\tTraining and validation data labels: {}.".format(
            train_val_labels.shape))
        print("\tPCP graph size: {}.".format(len(PCP)))
        print("\tMax node degree: {}.".format(len(max(PCP.values(), key=len))))
        print("\tPAP graph size: {}.".format(len(PAP)))
        print("\tMax node degree: {}.".format(len(max(PAP.values(), key=len))))

    def test_data(self, df_test, authors_df, train_idx, features, labels, PCP,
                  PAP):
        print("Preprocessing data...")
        # Load training and validation data
        d_train = DataLoader()
        df_train = d_train.training_data_with_abstracts_citations().data

        d_val = DataLoader()
        df_validation = d_val.validation_data_with_abstracts_citations().data

        train_val_data = pd.concat((df_train, df_validation),
                                   axis=0).reset_index(drop=True)
        data_authors = authors_df.groupby("author_name")["chapter"].agg(
            list).reset_index()

        # Create the indices of test instances in graph (as a list object)
        test_idx = np.asarray(list(df_test.index))
        test_idx = np.asarray([test_idx])

        # Create "fake" temporary labels for test data
        test_labels = np.zeros((len(df_test), len(labels[0])), dtype=int)
        labels = np.vstack((labels, test_labels))

        train_mask = sample_mask(train_idx, labels.shape[0])
        test_mask = sample_mask(test_idx, labels.shape[0])
        y_train = np.zeros(labels.shape)
        y_test = np.zeros(labels.shape)
        y_train[train_mask, :] = labels[train_mask, :]
        y_test[test_mask, :] = labels[test_mask, :]

        # Update graph with test data
        print("Updating graph information...")
        PCP_graph = self._update_PCP_adjacency(PCP, train_val_data, df_test)
        PAP_graph = self._update_PAP_adjacency(PAP, train_val_data, df_test,
                                               data_authors)
        print("Updated.")
        PAP = nx.adjacency_matrix(nx.from_dict_of_lists(PAP_graph))
        PCP = nx.adjacency_matrix(nx.from_dict_of_lists(PCP_graph))
        row_networks = [PCP, PAP]
        print("PCP: {}; PAP: {}".format(PCP.shape, PAP.shape))

        # Create feature vectors of test instances
        print("Creating features for test data...")
        test_features = self._create_features(df_test)
        features = np.vstack((features, test_features))
        print("Features: {}".format(features.shape))
        print("Created.")

        print("Finished preprocessing data.")
        print("y_train: {}, y_test: {}, train_idx: {}, test_idx: {}".format(
            y_train.shape, y_test.shape, train_idx.shape, test_idx.shape))

        features_list = [features, features, features]
        return row_networks, features_list, y_train, y_test, train_mask, test_mask

    def _create_PCP_adjacency(self, data):
        print("Creating paper-citation-paper adjacency lists.")
        graph = defaultdict(list)
        with tqdm(desc="Adding neighbours: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                citations_indices = [
                    data[data.chapter == citation].index.tolist()
                    for citation in data.chapter_citations.iloc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent, "PCP.pkl")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _update_PCP_adjacency(self, graph, data, df_test):
        print("Updating paper-citation-paper adjacency lists.")
        with tqdm(desc="Adding neighbours: ", total=len(df_test)) as pbar:
            for idx in list(df_test.index):
                citations_indices = [
                    data[data.chapter == citation].index.tolist()
                    for citation in df_test.chapter_citations.loc[idx]
                ]
                graph[idx] = list(set([i[0] for i in citations_indices if i]))
                pbar.update(1)
        print("Updated.")
        return graph

    def _create_PAP_adjacency(self, data, data_authors):
        print("Creating paper-author-paper adjacency lists.")
        graph = defaultdict()
        for idx in data.index:
            graph[idx] = []
        # Add edges between papers if they share an author
        with tqdm(desc="Adding neighbours: ", total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    data[data.chapter == paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
                for edge in edges:
                    graph[edge[0]].append(edge[1])
                pbar.update(1)
        print("Created.")
        print("Saving to disk...")
        graph_file = os.path.join(self.path_persistent, "PAP.pkl")
        with open(graph_file, "wb") as f:
            pickle.dump(graph, f)
        print("Saved.\n")
        return graph

    def _update_PAP_adjacency(self, graph, data, df_test, data_authors):
        print("Updating paper-author-paper adjacency lists.")
        for idx in df_test.index:
            graph[idx] = []
        with tqdm(desc="Adding neighbours: ", total=len(data_authors)) as pbar:
            for idx in range(len(data_authors)):
                authors_indices = [
                    data[data.chapter == paper].index.tolist()
                    for paper in data_authors.chapter.iloc[idx]
                ]
                authors_indices = [i[0] for i in authors_indices if i]
                edges = [i for i in combinations(authors_indices, 2)]
                for edge in edges:
                    graph[edge[0]].append(edge[1])
                pbar.update(1)
        print("Updated.")
        return graph

    def _create_features(self, data):
        features = []
        with tqdm(desc="Creating features: ", total=len(data)) as pbar:
            for idx in range(len(data)):
                features.append(
                    np.concatenate((self.embeddings_parser.embed_sequence(
                        data.chapter_title.iloc[idx], self.embedding_type),
                                    self.embeddings_parser.embed_sequence(
                                        data.chapter_abstract.iloc[idx],
                                        self.embedding_type)),
                                   axis=0).tolist())
                pbar.update(1)
        return np.asarray(features)

    def _train_label_encoder(self, data):
        self.label_encoder = OneHotEncoder(handle_unknown='ignore',
                                           sparse=False,
                                           dtype=int)
        labels = data.conferenceseries.unique()
        labels = labels.reshape(-1, 1)
        self.label_encoder.fit(labels)
        with open(os.path.join(self.path_persistent, "label_encoder.pkl"),
                  "wb") as f:
            pickle.dump(self.label_encoder, f)

def main():
    parser = argparse.ArgumentParser(
        description='Arguments for data preprocessing.')
    parser.add_argument('embedding_type',
                        choices=[
                            "AVG_L", "AVG_2L", "AVG_SUM_L4", "AVG_SUM_ALL",
                            "MAX_2L", "CONC_AVG_MAX_2L",
                            "CONC_AVG_MAX_SUM_L4", "SUM_L", "SUM_2L"
                        ],
                        help="Type of embedding.")
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='Which gpu to use.')
    args = parser.parse_args()
    print("Starting...")
    from preprocess_data import Processor
    processor = Processor(args.embedding_type, args.gpu)
    processor.training_data()
    print("Finished.")

if __name__ == "__main__":
    main()
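A minimal reading-back sketch (not part of the original snippet): training_data() above pickles its outputs under data/interim/han/<embedding_type>. Only the file names are taken from the code above; the relative path prefix and the embedding type are assumptions.

import os
import pickle

path = os.path.join("data", "interim", "han", "AVG_L")  # assumed location
artifacts = {}
for name in ["train_idx", "val_idx", "labels", "features", "PCP", "PAP"]:
    with open(os.path.join(path, name + ".pkl"), "rb") as f:  # files written above
        artifacts[name] = pickle.load(f)
print(artifacts["features"].shape, artifacts["labels"].shape)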
Example #18
    def predict(self, test_data, model_checkpoint, gpu_mem_fraction=None):
        timer = Timer()
        timer.tic()

        G = test_data[0]
        features = test_data[1]
        id_map = test_data[2]

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        context_pairs = test_data[3] if self.random_context else None
        placeholders = self._construct_placeholders()
        minibatch = EdgeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree,
                                          num_neg_samples=self.neg_sample_size,
                                          context_pairs=context_pairs)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

        model = self._create_model(placeholders, features, adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        if gpu_mem_fraction is not None:
            config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_fraction
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.compat.v1.summary.FileWriter(self._log_dir(),
        #                                                         sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver()

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        # Restore model
        print("Restoring trained model.")
        checkpoint_file = os.path.join(self._log_dir(), model_checkpoint)
        if tf.compat.v1.train.checkpoint_exists(checkpoint_file):
            saver.restore(sess, checkpoint_file)
            print("Model restored.")
        else:
            print("This model checkpoint does not exist. The model might " +
                  "not be trained yet or the checkpoint is invalid.")

        # Infer embeddings
        sess.run(val_adj_info.op)
        print("Computing embeddings...")
        val_embeddings = []
        finished = False
        seen = set([])
        nodes = []
        iter_num = 0
        while not finished:
            feed_dict_val, finished, edges = minibatch.incremental_embed_feed_dict(
                self.validate_batch_size, iter_num)
            iter_num += 1
            outs_val = sess.run([model.loss, model.mrr, model.outputs1],
                                feed_dict=feed_dict_val)
            for i, edge in enumerate(edges):
                if not edge[0] in seen:
                    val_embeddings.append(outs_val[-1][i, :])
                    nodes.append(edge[0])
                    seen.add(edge[0])

        val_embeddings = np.vstack(val_embeddings)
        if self.save_embeddings:
            print("Saving embeddings...")
            if not os.path.exists(self._log_dir()):
                os.makedirs(self._log_dir())
            np.save(self._log_dir() + "inferred_embeddings.npy",
                    val_embeddings)
            with open(self._log_dir() + "inferred_embeddings_ids.txt",
                      "w") as fp:
                fp.write("\n".join(map(str, nodes)))
            print("Embeddings saved.\n")

        # Return only the embeddings of the test nodes
        test_embeddings_ids = {}
        for i, node in enumerate(nodes):
            test_embeddings_ids[node] = i
        test_nodes = [n for n in G.nodes() if G.node[n]['test']]
        test_embeddings = val_embeddings[[
            test_embeddings_ids[id] for id in test_nodes
        ]]

        sess.close()
        tf.compat.v1.reset_default_graph()
        timer.toc()
        return test_nodes, test_embeddings
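The last step of predict() above keeps one embedding per node and then selects the rows belonging to the test nodes. A self-contained toy version of that selection, with made-up data:

import numpy as np

nodes = ["a", "b", "c", "d"]                 # order in which embeddings were collected
val_embeddings = np.arange(8).reshape(4, 2)  # one row per collected node
test_nodes = ["b", "d"]                      # nodes flagged as test in the graph
ids = {node: i for i, node in enumerate(nodes)}
test_embeddings = val_embeddings[[ids[n] for n in test_nodes]]
print(test_embeddings)                       # rows of "b" and "d"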
Example #19
    def train(self):
        print("Loading data...")
        adj_list, features_list, y_train, y_val, train_mask, val_mask = load_data(
            self.embedding_type)
        print("Loaded.")

        nb_nodes = features_list[0].shape[0]
        ft_size = features_list[0].shape[1]
        nb_classes = y_train.shape[1]

        features_list = [features[np.newaxis] for features in features_list]
        y_train = y_train[np.newaxis]
        y_val = y_val[np.newaxis]
        train_mask = train_mask[np.newaxis]
        val_mask = val_mask[np.newaxis]
        biases_list = [preprocess_adj_bias(adj) for adj in adj_list]

        print("Training model...")
        timer = Timer()
        timer.tic()

        print(
            "Parameters: batch size={}, nb_nodes={}, ft_size={}, nb_classes={}\n"
            .format(self.batch_size, nb_nodes, ft_size, nb_classes))

        model = HAN(self.model,
                    self.hid_units,
                    self.n_heads,
                    nb_classes,
                    nb_nodes,
                    l2_coef=self.weight_decay,
                    ffd_drop=self.ffd_drop,
                    attn_drop=self.attn_drop,
                    activation=self.nonlinearity,
                    residual=self.residual)

        vlss_mn = np.inf
        vacc_mx = 0.0
        curr_step = 0

        train_loss_avg = 0
        train_acc_avg = 0
        val_loss_avg = 0
        val_acc_avg = 0

        train_losses = []
        val_losses = []
        train_accuracies = []
        val_accuracies = []

        for epoch in range(self.epochs):
            print("\nEpoch {}".format(epoch))

            # Training
            tr_step = 0
            tr_size = features_list[0].shape[0]
            while tr_step * self.batch_size < tr_size:
                feats_list = [
                    features[tr_step * self.batch_size:(tr_step + 1) *
                             self.batch_size] for features in features_list
                ]

                _, train_embed, att_val, acc_tr, loss_value_tr = self._train(
                    model=model,
                    inputs_list=feats_list,
                    bias_mat_list=biases_list,
                    lbl_in=y_train[tr_step * self.batch_size:(tr_step + 1) *
                                   self.batch_size],
                    msk_in=train_mask[tr_step * self.batch_size:(tr_step + 1) *
                                      self.batch_size])

                train_loss_avg += loss_value_tr
                train_acc_avg += acc_tr
                tr_step += 1

            # Validation
            vl_step = 0
            vl_size = features_list[0].shape[0]

            while vl_step * self.batch_size < vl_size:
                feats_list = [
                    features[vl_step * self.batch_size:(vl_step + 1) *
                             self.batch_size] for features in features_list
                ]

                _, val_embed, att_val, acc_vl, loss_value_vl = self.evaluate(
                    model=model,
                    inputs_list=feats_list,
                    bias_mat_list=biases_list,
                    lbl_in=y_val[vl_step * self.batch_size:(vl_step + 1) *
                                 self.batch_size],
                    msk_in=val_mask[vl_step * self.batch_size:(vl_step + 1) *
                                    self.batch_size])

                val_loss_avg += loss_value_vl
                val_acc_avg += acc_vl
                vl_step += 1

            print(
                'Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f'
                % (train_loss_avg / tr_step, train_acc_avg / tr_step,
                   val_loss_avg / vl_step, val_acc_avg / vl_step))
            train_losses.append(train_loss_avg / tr_step)
            val_losses.append(val_loss_avg / vl_step)
            train_accuracies.append(train_acc_avg / tr_step)
            val_accuracies.append(val_acc_avg / vl_step)

            # Early Stopping
            if val_acc_avg / vl_step >= vacc_mx or val_loss_avg / vl_step <= vlss_mn:
                if val_acc_avg / vl_step >= vacc_mx and val_loss_avg / vl_step <= vlss_mn:
                    vacc_early_model = val_acc_avg / vl_step
                    vlss_early_model = val_loss_avg / vl_step
                    working_weights = model.get_weights()
                    print(
                        "Minimum validation loss ({}), maximum accuracy ({}) so far  at epoch {}."
                        .format(val_loss_avg / vl_step, val_acc_avg / vl_step,
                                epoch))
                    self._save_model(model)
                vacc_mx = np.max((val_acc_avg / vl_step, vacc_mx))
                vlss_mn = np.min((val_loss_avg / vl_step, vlss_mn))
                curr_step = 0
            else:
                curr_step += 1
                if curr_step == self.patience:
                    print("Early stop! Min loss: {}, Max accuracy: {}".format(
                        vlss_mn, vacc_mx))
                    print("Early stop model validation loss: {}, accuracy: {}".
                          format(vlss_early_model, vacc_early_model))
                    model.set_weights(working_weights)
                    break

            train_loss_avg = 0
            train_acc_avg = 0
            val_loss_avg = 0
            val_acc_avg = 0

        print("Training finished.")

        training_time = timer.toc()
        train_losses = [x.numpy() for x in train_losses]
        val_losses = [x.numpy() for x in val_losses]
        train_accuracies = [x.numpy() for x in train_accuracies]
        val_accuracies = [x.numpy() for x in val_accuracies]
        self._plot_losses(train_losses, val_losses)
        self._plot_accuracies(train_accuracies, val_accuracies)
        self._print_stats(train_losses, val_losses, train_accuracies,
                          val_accuracies, training_time)
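A toy walk-through (assumed values, not from the original code) of the early-stopping rule used in the training loop above: reset the counter whenever validation accuracy or validation loss improves, otherwise stop after `patience` stagnant epochs.

import numpy as np

patience, curr_step = 3, 0
vacc_mx, vlss_mn = 0.0, np.inf
history = [(0.60, 1.20), (0.62, 1.10), (0.61, 1.15),
           (0.61, 1.16), (0.60, 1.18), (0.59, 1.21)]  # (val_acc, val_loss) per epoch
for epoch, (vacc, vlss) in enumerate(history):
    if vacc >= vacc_mx or vlss <= vlss_mn:
        vacc_mx, vlss_mn = max(vacc, vacc_mx), min(vlss, vlss_mn)
        curr_step = 0          # improvement: reset the patience counter
    else:
        curr_step += 1
        if curr_step == patience:
            print("Early stop at epoch {}".format(epoch))
            break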
    def train(self, train_data, sampler_name='Uniform'):
        print("Training model...")
        timer = Timer()
        timer.tic()

        G = train_data[0]
        features = train_data[1]
        id_map = train_data[2]

        if features is not None:
            # pad with dummy zero vector
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        context_pairs = train_data[3] if self.random_context else None
        placeholders = self._construct_placeholders()
        minibatch = EdgeMinibatchIterator(G,
                                          id_map,
                                          placeholders,
                                          batch_size=self.batch_size,
                                          max_degree=self.max_degree,
                                          num_neg_samples=self.neg_sample_size,
                                          context_pairs=context_pairs)

        adj_info_ph = tf.compat.v1.placeholder(tf.int32,
                                               shape=minibatch.adj.shape)
        adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")
        adj_shape = adj_info.get_shape().as_list()

        model = self._create_model(sampler_name, placeholders, features,
                                   adj_info, minibatch)

        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # Initialize session
        sess = tf.compat.v1.Session(config=config)
        merged = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.compat.v1.summary.FileWriter(
        #                self._log_dir(sampler_name), sess.graph)

        # Initialize model saver
        saver = tf.compat.v1.train.Saver(max_to_keep=self.epochs)

        # Init variables
        sess.run(tf.compat.v1.global_variables_initializer(),
                 feed_dict={adj_info_ph: minibatch.adj})

        # Restore params of ML sampler model
        if sampler_name == 'ML' or sampler_name == 'FastML':
            sampler_vars = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="MLsampler")
            saver_sampler = tf.compat.v1.train.Saver(var_list=sampler_vars)
            sampler_model_path = self._sampler_model_path()
            saver_sampler.restore(sess, sampler_model_path + 'model.ckpt')

        # Loss node path
        loss_node_path = self._loss_node_path(sampler_name)
        if not os.path.exists(loss_node_path):
            os.makedirs(loss_node_path)

        # Train model
        train_shadow_mrr = None
        shadow_mrr = None

        total_steps = 0
        avg_time = 0.0
        epoch_val_costs = []

        train_adj_info = tf.compat.v1.assign(adj_info, minibatch.adj)
        val_adj_info = tf.compat.v1.assign(adj_info, minibatch.test_adj)

        train_losses = []
        validation_losses = []

        val_cost_ = []
        val_mrr_ = []
        shadow_mrr_ = []
        duration_ = []

        ln_acc = sparse.csr_matrix((adj_shape[0], adj_shape[0]),
                                   dtype=np.float32)
        lnc_acc = sparse.csr_matrix((adj_shape[0], adj_shape[0]),
                                    dtype=np.int32)
        ln_acc = ln_acc.tolil()
        lnc_acc = lnc_acc.tolil()

        for epoch in range(self.epochs):
            minibatch.shuffle()

            iter = 0
            print('Epoch: %04d' % (epoch))
            epoch_val_costs.append(0)
            train_loss_epoch = []
            validation_loss_epoch = []

            while not minibatch.end():
                # Construct feed dictionary
                feed_dict = minibatch.next_minibatch_feed_dict()
                feed_dict.update({placeholders['dropout']: self.dropout})
                t = time.time()

                # Training step
                outs = sess.run([
                    merged, model.opt_op, model.loss, model.ranks,
                    model.aff_all, model.mrr, model.outputs1, model.loss_node,
                    model.loss_node_count
                ],
                                feed_dict=feed_dict)
                train_cost = outs[2]
                train_mrr = outs[5]
                train_loss_epoch.append(train_cost)

                if train_shadow_mrr is None:
                    train_shadow_mrr = train_mrr
                else:
                    train_shadow_mrr -= (1 - 0.99) * (train_shadow_mrr -
                                                      train_mrr)

                if iter % self.validate_iter == 0:
                    # Validation
                    sess.run(val_adj_info.op)
                    val_cost, ranks, val_mrr, duration = self._evaluate(
                        sess, model, minibatch, size=self.validate_batch_size)
                    sess.run(train_adj_info.op)
                    epoch_val_costs[-1] += val_cost
                    validation_loss_epoch.append(val_cost)

                if shadow_mrr is None:
                    shadow_mrr = val_mrr
                else:
                    shadow_mrr -= (1 - 0.99) * (shadow_mrr - val_mrr)

                val_cost_.append(val_cost)
                val_mrr_.append(val_mrr)
                shadow_mrr_.append(shadow_mrr)
                duration_.append(duration)

                #                if total_steps % self.print_every == 0:
                #                    summary_writer.add_summary(outs[0], total_steps)

                # Print results
                avg_time = (avg_time * total_steps + time.time() -
                            t) / (total_steps + 1)

                if total_steps % self.print_every == 0:
                    print(
                        "Iter: %04d" % iter,
                        "train_loss={:.5f}".format(train_cost),
                        "train_mrr={:.5f}".format(train_mrr),
                        # exponential moving average
                        "train_mrr_ema={:.5f}".format(train_shadow_mrr),
                        "val_loss={:.5f}".format(val_cost),
                        "val_mrr={:.5f}".format(val_mrr),
                        # exponential moving average
                        "val_mrr_ema={:.5f}".format(shadow_mrr),
                        "time={:.5f}".format(avg_time))

                ln = outs[7].values
                ln_idx = outs[7].indices
                ln_acc[ln_idx[:, 0], ln_idx[:, 1]] += ln

                lnc = outs[8].values
                lnc_idx = outs[8].indices
                lnc_acc[lnc_idx[:, 0], lnc_idx[:, 1]] += lnc

                iter += 1
                total_steps += 1

                if total_steps > self.max_total_steps:
                    break

            # Keep track of train and validation losses per epoch
            train_losses.append(sum(train_loss_epoch) / len(train_loss_epoch))
            validation_losses.append(
                sum(validation_loss_epoch) / len(validation_loss_epoch))

            # If the epoch has the lowest validation loss so far
            if validation_losses[-1] == min(validation_losses):
                print(
                    "Minimum validation loss so far ({}) at epoch {}.".format(
                        validation_losses[-1], epoch))
                # Save loss node and count
                loss_node = sparse.save_npz(loss_node_path + 'loss_node.npz',
                                            sparse.csr_matrix(ln_acc))
                loss_node_count = sparse.save_npz(
                    loss_node_path + 'loss_node_count.npz',
                    sparse.csr_matrix(lnc_acc))
                # Save embeddings
                if self.save_embeddings and sampler_name != "Uniform":
                    sess.run(val_adj_info.op)
                    self._save_embeddings(sess, model, minibatch,
                                          self.validate_batch_size,
                                          self._log_dir(sampler_name))

            # Save model at each epoch
            print("Saving model at epoch {}.".format(epoch))
            saver.save(
                sess,
                os.path.join(self._log_dir(sampler_name),
                             "model_epoch_" + str(epoch) + ".ckpt"))

            if total_steps > self.max_total_steps:
                break

        print("Optimization Finished!")

        training_time = timer.toc()
        self._plot_losses(train_losses, validation_losses, sampler_name)
        self._print_stats(train_losses, validation_losses, training_time,
                          sampler_name)
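The train_mrr_ema / val_mrr_ema values printed above come from the update `ema -= (1 - 0.99) * (ema - value)`. A standalone illustration of that smoothing with made-up MRR values:

def ema_update(ema, value, decay=0.99):
    # Same smoothing as above: keep `decay` of the old value, blend in the rest.
    return value if ema is None else ema - (1 - decay) * (ema - value)

ema = None
for mrr in [0.42, 0.45, 0.44, 0.50]:  # made-up MRR values
    ema = ema_update(ema, mrr)
    print("mrr={:.2f} ema={:.4f}".format(mrr, ema))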
Example #21
class FileParser:

    path_raw = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "..", "..", "data", "raw")
    path_persistent = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   "..", "..", "data", "interim", "parsed_data"
                                   )

    def __init__(self):
        self.timer = Timer()
        self.persistent = {}
        self.processes = {
                # Old datasets
                "old_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "old_books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "old_books_new_books": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_new_books",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_new_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "old_books_conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_books_file),
                        "process_line": "_process_line_old_books_conferences",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "old_books_conferences.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "conferences.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferences_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_name",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferences_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_acronym": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_acronym",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_acronym.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_city": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_city",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_city.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_country": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_country",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_country.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_year": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_year",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_datestart": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_datestart",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_datestart.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_dateend": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_dateend",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_dateend.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferences_conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferences_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferences_conferenceseries.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },
                "conferenceseries": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries",
                        "persistent_file": os.path.join(
                                self.path_persistent, "conferenceseries.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "ntriples"
                        },
                "conferenceseries_name": {
                        "filename": os.path.join(self.path_raw,
                                                 old_conferences_file),
                        "process_line": "_process_line_conferenceseries_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "conferenceseries_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "ntriples"
                        },

                # New datasets
                "books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "books.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "isbn_books": {
                        "filename": os.path.join(self.path_raw, books_file),
                        "process_line": "_process_line_isbn_books",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "isbn_books.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "authors_name": {
                        "filename": os.path.join(self.path_raw, authors_file),
                        "process_line": "_process_line_authors_name",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters.pkl"),
                        "persistent_variable": [],
                        "dataset_format": "json"
                        },
                "chapters_title": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_title",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_title.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_year": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_year",
                        "persistent_file": os.path.join(self.path_persistent,
                                                        "chapters_year.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_language": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_language",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_language.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_abstract": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_abstract",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_abstract.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_authors.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_authors_name": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_authors_name",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_authors_name.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_all_citations": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_all_citations",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_all_citations.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_keywords": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_keywords",
                        "persistent_file": os.path.join(
                                self.path_persistent, "chapters_keywords.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                "chapters_books_isbns": {
                        "filename": os.path.join(self.path_raw, chapters_file),
                        "process_line": "_process_line_chapters_books_isbns",
                        "persistent_file": os.path.join(
                                self.path_persistent,
                                "chapters_books_isbns.pkl"),
                        "persistent_variable": {},
                        "dataset_format": "json"
                        },
                }

    def get_data(self, process):
        # Check if the data is already present
        if (process in self.persistent):
            return self.persistent[process]

        print("Process '{}' not in memory yet.".format(process))

        # Load from persistent file if data already processed
        if os.path.isfile(self.processes[process]["persistent_file"]):
            with open(self.processes[process]["persistent_file"],
                      "rb") as f:
                self.persistent[process] = pickle.load(f)
                return self.persistent[process]

        print("Process '{}' not persistent yet. Processing.".format(
                process))

        # Process the raw data
        self.persistent[process] = self.processes[process][
                "persistent_variable"]
        self._parse_file(
                self.processes[process]["filename"],
                self.processes[process]["process_line"],
                self.persistent[process],
                self.processes[process]["dataset_format"]
                )
        with open(self.processes[process]["persistent_file"], "wb") as f:
            pickle.dump(self.persistent[process], f)

        return self.persistent[process]

    def _parse_file(self, filename, process_line, results, dataset_format):
        if dataset_format == "json":
            self._process_json_file(filename, process_line, results)
        else:
            self._process_ntriples_file(filename, process_line, results)

    def _process_json_file(self, filename, process_line, results):
        print("Computing number of json files.")
        with tarfile.open(filename, "r:gz", encoding="utf-8") as tar:
            count_files = len(tar.getnames())
        print("Finished computing number of files: {}.\n".format(
                count_files))

        print("Start processing file.\n")
        self.timer.tic()
        process_line_function = self.__getattribute__(process_line)
        with tqdm(desc="Processing files: ", total=count_files,
                  unit="file") as pbar:
            with tarfile.open(filename, "r:gz", encoding="utf-8") as tar:
                for member in tar.getmembers():
                    if "jsonl" in member.name:
                        file = tar.extractfile(member)
                        content = [json.loads(line) for line in
                                   file.readlines()]
                        for line in content:
                            process_line_function(line, results)
                    pbar.update(1)
        self.timer.toc()
        print("Finished processing file.\n\n")

    def _process_ntriples_file(self, filename, process_line, results):
        print("Computing file size.")
        with gzip.open(filename, mode="rt", encoding="utf-8") as f:
            file_size = f.seek(0, io.SEEK_END)
        print("Finished computing file size: {} bytes.\n".format(
                file_size))

        print("Start processing file.\n")
        self.timer.tic()
        process_line_function = self.__getattribute__(process_line)
        with tqdm(desc="Processing file: ", total=file_size,
                  unit="bytes") as pbar:
            with gzip.open(filename, mode="rt", encoding="utf-8") as f:
                for line in f:
                    process_line_function(line, results)
                    pbar.update(len(line))
        self.timer.toc()
        print("Finished processing file.\n\n")

    # Processes implementations
    def _process_line_old_books(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_has_conference:
            if line[0].startswith(nt_book):
                if line[0] not in results:
                    results.append(line[0])

    def _process_line_old_books_new_books(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_webpage:
            if line[0].startswith(nt_book):
                if line[0] in self.get_data("old_books"):
                    new_book_id = "sg:pub." + line[2].split(
                            ".com/")[-1].rsplit(">")[0]
                    results[line[0]] = new_book_id

    def _process_line_old_books_conferences(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_has_conference:
            if line[0].startswith(nt_book):
                results[line[0]] = line[2]

    def _process_line_conferences(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[0].startswith(nt_conferences):
            if line[0] not in results:
                results.append(line[0])

    def _process_line_conferences_name(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_name:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_acronym(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_acronym:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_city(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_city:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_country(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_country:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_year(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_year:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_datestart(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_datestart:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_dateend(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_dateend:
            if line[0].startswith(nt_conferences):
                results[line[0]] = line[2]

    def _process_line_conferences_conferenceseries(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_has_conference_series:
            results[line[0]] = line[2]

    def _process_line_conferenceseries(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[0].startswith(nt_conference_series):
            if line[0] not in results:
                results.append(line[0])

    def _process_line_conferenceseries_name(self, line, results):
        line = line.rstrip(" .\n").split(maxsplit=2)
        if line[1] == nt_name:
            if line[0].startswith(nt_conference_series):
                results[line[0]] = line[2]

    def _process_line_books(self, line, results):
        new_books = list(self.get_data("old_books_new_books").values())
        if line["id"] not in results:
            if line["id"] in new_books:
                results.append(line["id"])

    def _process_line_isbn_books(self, line, results):
        if "isbn" in line.keys():
            if line["id"] in self.get_data("books"):
                isbn_list = line["isbn"]
                for isbn in isbn_list:
                    results[isbn] = line["id"]

    def _process_line_authors_name(self, line, results):
        family_name = line["familyName"] if "familyName" in line.keys() else ""
        given_name = line["givenName"] if "givenName" in line.keys() else ""
        if not family_name == "Not available":
            author_name = family_name + " " + given_name
        else:
            author_name = ""
        results[line["id"]] = author_name

    def _process_line_chapters(self, line, results):
        if "isPartOf" in line.keys():
            if line["id"] not in results:
                book = line["isPartOf"]
                if "isbn" in book.keys():
                    isbn_list = book["isbn"]
                    for isbn in isbn_list:
                        if isbn in self.get_data("isbn_books"):
                            results.append(line["id"])

    def _process_line_chapters_title(self, line, results):
        if "name" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["name"]

    def _process_line_chapters_year(self, line, results):
        if "datePublished" in line.keys():
            if line["id"] in self.get_data("chapters"):
                year = line["datePublished"].split("-")[0]
                results[line["id"]] = year

    def _process_line_chapters_language(self, line, results):
        if "inLanguage" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["inLanguage"][0]

    def _process_line_chapters_abstract(self, line, results):
        if "description" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["description"]

    def _process_line_chapters_authors(self, line, results):
        if "author" in line.keys():
            if line["id"] in self.get_data("chapters"):
                authors = line["author"]
                authors_id = [authors[i]["id"] for i in
                              range(len(authors)) if "id" in
                              authors[i].keys()]
                results[line["id"]] = authors_id

    def _process_line_chapters_authors_name(self, line, results):
        if "author" in line.keys():
            if line["id"] in self.get_data("chapters"):
                authors = line["author"]
                author_names = list()
                for i in range(len(authors)):
                    family_name = authors[i]["familyName"] if \
                        "familyName" in authors[i].keys() else ""
                    given_name = authors[i]["givenName"] if "givenName" \
                        in authors[i].keys() else ""
                    author_names.append(family_name + " " + given_name)
                results[line["id"]] = author_names

    def _process_line_chapters_all_citations(self, line, results):
        if "citation" in line.keys():
            if line["id"] in self.get_data("chapters"):
                citations = line["citation"]
                citations_id = [citations[i]["id"] for i in range(
                        len(citations))]
                results[line["id"]] = citations_id

    def _process_line_chapters_keywords(self, line, results):
        if "keywords" in line.keys():
            if line["id"] in self.get_data("chapters"):
                results[line["id"]] = line["keywords"]

    def _process_line_chapters_books_isbns(self, line, results):
        if "isPartOf" in line.keys():
            if line["id"] in self.get_data("chapters"):
                book = line["isPartOf"]
                if "isbn" in book.keys():
                    isbn_list = book["isbn"]
                    results[line["id"]] = isbn_list
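All `_process_line_*` handlers for the N-Triples datasets above rely on the same `rstrip(" .\n")` + `split(maxsplit=2)` idiom to obtain subject, predicate and object. A self-contained illustration with a made-up triple:

line = ('<http://example.org/book/1> '
        '<http://example.org/hasConference> '
        '<http://example.org/conf/42> .\n')        # made-up N-Triples line
subject, predicate, obj = line.rstrip(" .\n").split(maxsplit=2)
print(subject)    # <http://example.org/book/1>
print(predicate)  # <http://example.org/hasConference>
print(obj)        # <http://example.org/conf/42>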