Пример #1
0
    def _list_documents(self, source, extension=".xml", save=False):
        """
        List all patent documents within corpus.

        Walks ``source`` recursively and collects every file whose name does
        not start with a dot and ends with ``extension``. When
        ``self.enable_sampling`` is set, each file is put in the training
        list if a fresh ``random.random()`` draw is ``<= self.train_ratio``
        and in the test list otherwise; without sampling every file goes to
        the training list.

        :param source: root directory of the corpus.
        :param extension: file name suffix a document must end with.
        :param save: when true, write both lists (one path per line) to
            ``TRAINING_DATA`` and ``TESTING_DATA``.
        :return: tuple ``(documents, test_documents)`` of file paths.
        :raises PathNotFoundError: if ``source`` does not exist, or if
            ``save`` is set and ``SAMPLED_DATA_PATH`` does not exist.
        """
        if not os.path.exists(source):
            # basename() also copes with a bare name such as "corpus"; the
            # former rsplit(os.sep, 1)[1] raised IndexError in that case and
            # masked the real problem.
            raise PathNotFoundError(
                "%s: Source does not exist!"
                % os.path.basename(os.path.normpath(source)))

        documents = []
        test_documents = []

        for root, _folders, files in os.walk(source):
            for file_name in files:
                # Skip hidden files and files with a different extension
                if file_name.startswith('.'):
                    continue
                if not file_name.endswith(extension):
                    continue

                document = os.path.join(root, file_name)

                # A random draw is only made when sampling is enabled
                if self.enable_sampling and random.random() > self.train_ratio:
                    test_documents.append(document)
                else:
                    documents.append(document)

        if save:
            if not os.path.exists(SAMPLED_DATA_PATH):
                raise PathNotFoundError(
                    "Sampled data path does not exist: %s" % SAMPLED_DATA_PATH)

            with open(TRAINING_DATA, "w") as f:
                f.write("\n".join(documents))

            with open(TESTING_DATA, "w") as f:
                f.write("\n".join(test_documents))

        return (documents, test_documents)
Пример #2
0
    def load(self, model):
        """
        Restore a previously saved Patent2Vec model from disk.

        :param model: path of the saved model; handed to ``Doc2Vec.load``.
        :raises PathNotFoundError: if ``model`` does not exist.

        The loaded model is stored on ``self.model``.
        """
        log.info("Loading Patent2Vec model")

        model_is_present = os.path.exists(model)
        if not model_is_present:
            message = "Patent2Vec model does not exist: %s" % model
            raise PathNotFoundError(message)

        loaded_model = Doc2Vec.load(model)
        self.model = loaded_model
Пример #3
0
    def save_document_embeddings(self,
                                 document_embeddings=None,
                                 rows=None,
                                 columns=500,
                                 database=None,
                                 table_name=None,
                                 save_patent_category=True,
                                 prepend_document_category=False):
        """
        Write one database record per stored document embedding.

        :param document_embeddings: path of the float32 embedding file;
            falls back to ``PATENT_EMBEDDING`` when ``None``.
        :param rows: number of embeddings in the file (required).
        :param columns: length of each embedding.
        :param database: object whose ``insert(table=..., record=...)`` is
            called once per embedding (required).
        :param table_name: destination table (required).
        :param save_patent_category: when false, every record gets the
            category ``"UNKNOWN"``.
        :param prepend_document_category: forwarded unchanged to
            ``self._get_document_label``.
        :raises PathNotFoundError: if the embedding file does not exist.
        :raises ValueError: if ``rows``, ``database`` or ``table_name`` is
            ``None``.
        """
        log.info("Saving document embeddings")

        if document_embeddings is None:
            embedding_file = PATENT_EMBEDDING
        else:
            embedding_file = document_embeddings

        if not os.path.exists(embedding_file):
            raise PathNotFoundError("Path does not exist: %s"
                                    % embedding_file)

        # Mandatory arguments are validated in the order rows, database, table
        if rows is None:
            raise ValueError("'rows' not defined!")
        if database is None:
            raise ValueError("'database' not defined!")
        if table_name is None:
            raise ValueError("'table_name' not defined!")

        # Map the file read-only instead of loading it, to keep RAM usage low
        stored_vectors = np.memmap(embedding_file,
                                   dtype='float32',
                                   mode='r',
                                   shape=(rows, columns))

        # One record per row; the row index is resolved to its document tag
        for row_index, vector in enumerate(stored_vectors):
            tag = self.model.docvecs.index_to_doctag(row_index)

            name = self._get_document_label(tag, prepend_document_category)
            vector_text = " ".join(str(component) for component in vector)
            category = (self._get_document_category(tag)
                        if save_patent_category else "UNKNOWN")

            database.insert(table=table_name,
                            record=[("PatentName", name),
                                    ("DocumentEmbedding", vector_text),
                                    ("PatentCategory", category)])
Пример #4
0
    def _load_conceptualizer(self, model, log_every=1000000):
        """
        Load conceptualizer model.

        Creates a new ``Conceptualizer``, stores it on ``self.conceptualizer``
        and asks it to load the model found at ``model``.

        :param model: path of the conceptualizer model.
        :param log_every: forwarded unchanged to ``Conceptualizer.load``.
        :raises PathNotFoundError: if ``model`` does not exist.
        """
        if not os.path.exists(model):
            # basename() also handles a path without any separator; the former
            # rsplit(os.sep, 1)[1] raised IndexError there instead of the
            # intended PathNotFoundError.
            raise PathNotFoundError(
                "%s: Conceptualizer does not exist!"
                % os.path.basename(os.path.normpath(model)))

        # Create conceptualizer's object
        self.conceptualizer = Conceptualizer()

        # Load conceptualizer model
        self.conceptualizer.load(model_path=model, log_every=log_every)
Пример #5
0
    def save(self, model=None, path=None):
        """
        Save Patent2Vec model.

        :param model: file name to save under; defaults to the file name
            component of ``PATENT2VEC_MODEL``.
        :param path: directory to save into; defaults to
            ``PATENT2VEC_MODEL_PATH``.
        :raises PathNotFoundError: if ``path`` does not exist.

        The model is written to ``os.path.join(path, model)``.
        """
        log.info("Saving Patent2Vec model")

        if model is None:
            # basename() yields the file name even when the constant holds no
            # directory part; rsplit(os.sep, 1)[1] raised IndexError there.
            model = os.path.basename(PATENT2VEC_MODEL)

        if path is None:
            path = PATENT2VEC_MODEL_PATH

        if not os.path.exists(path):
            raise PathNotFoundError("Path does not exist: %s" % path)

        self.model.save(os.path.join(path, model))
Пример #6
0
    def standardize_embeddings(self, document_embeddings, rows, columns):
        """
        Z-score the document embeddings into a file-backed array.

        :param document_embeddings: embeddings passed to ``zscore``
            (presumably ``scipy.stats.zscore`` -- confirm against the
            file's imports).
        :param rows: number of rows of the output array.
        :param columns: number of columns of the output array.
        :return: float32 ``np.memmap`` of shape ``(rows, columns)`` backed by
            ``STANDARDIZED_PATENT_EMBEDDING`` and holding the z-scores.
        :raises PathNotFoundError: if the part of
            ``STANDARDIZED_PATENT_EMBEDDING`` before its last separator does
            not exist.
        """
        target_directory = STANDARDIZED_PATENT_EMBEDDING.rsplit(os.sep, 1)[0]

        if os.path.exists(target_directory):
            # 'w+' creates (or overwrites) the backing file
            output = np.memmap(STANDARDIZED_PATENT_EMBEDDING,
                               dtype='float32',
                               mode='w+',
                               shape=(rows, columns))

            z_scores = np.array(zscore(document_embeddings))
            output[:] = z_scores[:]

            return output

        raise PathNotFoundError("Path does not exist: %s" % target_directory)
Пример #7
0
    def evaluate(self):
        """
        Evaluate Patent2Vec model.

        Runs ``self.model.accuracy`` on ``WORD2VEC_BENCHMARK_DATA`` and logs
        the number of evaluated items together with the percentage of correct
        and incorrect predictions, taken from the last entry of the result.

        :raises PathNotFoundError: if ``WORD2VEC_BENCHMARK_DATA`` does not
            exist.
        """
        log.info("Evaluating Patent2Vec model")

        if not os.path.exists(WORD2VEC_BENCHMARK_DATA):
            # basename() also handles a constant without a directory part,
            # where rsplit(os.sep, 1)[1] raised IndexError.
            raise PathNotFoundError(
                "%s: Evaluation dataset does not exist!"
                % os.path.basename(WORD2VEC_BENCHMARK_DATA))

        # Evaluate Patent2Vec model
        accuracy = self.model.accuracy(WORD2VEC_BENCHMARK_DATA)

        # Find correct and incorrect predictions
        correct = len(accuracy[-1]['correct'])
        incorrect = len(accuracy[-1]['incorrect'])
        total = correct + incorrect

        def percentage(count):
            # 100.0 forces true division on Python 2 as well (integer division
            # would only ever report 0 or 100); an empty evaluation reports
            # 0.0 instead of raising ZeroDivisionError.
            if total == 0:
                return 0.0
            return 100.0 * count / total

        log.info("Total: %d,  Correct: %0.2f%%,  Incorrect: %0.2f%%",
                 total, percentage(correct), percentage(incorrect))
Пример #8
0
    def get_preprocessed_document(self, document):
        """
        Get preprocessed patent document.

        Tries each encoding in ``PatentDocument.SOURCE_ENCODING`` in turn and
        uses the first one with which the document can be read without a
        ``UnicodeDecodeError``.

        :param document: path of the patent document.
        :return: the preprocessed content when ``self.token_only`` is set,
            otherwise a ``TaggedDocument`` of that content tagged with
            ``document``; ``None`` if no encoding could decode the file.
        :raises PathNotFoundError: if ``document`` does not exist.
        """
        if not os.path.exists(document):
            # basename() also handles a bare file name, where the former
            # rsplit(os.sep, 1)[1] raised IndexError.
            raise PathNotFoundError(
                "%s: Document does not exist!"
                % os.path.basename(os.path.normpath(document)))

        for source_encoding in PatentDocument.SOURCE_ENCODING:
            with codecs.open(document, "r", source_encoding) as d:
                try:
                    content = self._read(d)
                except UnicodeDecodeError:
                    # Wrong encoding: fall through to the next candidate
                    continue

            # The file is closed by now; only the decoded content is needed
            preprocessed_content = self._preprocess(content, lowercase=True)

            if self.token_only:
                return preprocessed_content

            return TaggedDocument(preprocessed_content, [document])

        return None
Пример #9
0
    def save_document_embeddings(self,
                                 document_embeddings=None,
                                 doc_labels=None,
                                 doc_categories=None,
                                 rows=None,
                                 columns=500,
                                 database=None,
                                 table_name=None,
                                 save_patent_category=True):
        """
        Save document embeddings to database.

        :param document_embeddings: path of the float32 embedding file;
            defaults to ``PATENT_EMBEDDING``.
        :param doc_labels: path of the label file; defaults to
            ``PATENT_LABEL``.
        :param doc_categories: path of the category file; defaults to
            ``PATENT_CATEGORY``.
        :param rows: number of documents; defaults to ``self.total_docs``.
        :param columns: embedding length; ``None`` selects
            ``self.embedding_size``.
        :param database: object whose ``insert(table=..., record=...)`` is
            called once per document (required).
        :param table_name: destination table (required).
        :param save_patent_category: when false, every record gets the
            category ``"UNKNOWN"``.
        :raises PathNotFoundError: if any of the three files does not exist.
        :raises ValueError: if ``database`` or ``table_name`` is ``None``.
        """
        log.info("Saving document embeddings")

        if document_embeddings is None:
            document_embeddings = PATENT_EMBEDDING

        if doc_labels is None:
            doc_labels = PATENT_LABEL

        if doc_categories is None:
            doc_categories = PATENT_CATEGORY

        # All three input files must exist (checked in this order)
        for required_path in (document_embeddings, doc_labels, doc_categories):
            if not os.path.exists(required_path):
                raise PathNotFoundError("Path does not exist: %s"
                                        % required_path)

        if rows is None:
            rows = self.total_docs

        if columns is None:
            columns = self.embedding_size

        if database is None:
            raise ValueError("'database' not defined!")

        if table_name is None:
            raise ValueError("'table_name' not defined!")

        # Create a memory map with document embeddings for reducing load on RAM
        embeddings = np.memmap(document_embeddings,
                               dtype='float32',
                               mode='r',
                               shape=(rows, columns))

        # Create a memory map with document labels for reducing load on RAM
        # NOTE(review): an object-dtype memmap holds object references rather
        # than the label text itself -- confirm numpy accepts this dtype and
        # that the values survive being re-read from disk.
        labels = np.memmap(doc_labels,
                           dtype="object",
                           mode='r',
                           shape=(rows,))

        # Create a memory map with document categories for reducing load on RAM
        categories = np.memmap(doc_categories,
                               dtype="object",
                               mode='r',
                               shape=(rows,))

        # Insert document embedding records into database
        for i, embedding in enumerate(embeddings):
            patent_name = labels[i]
            embedding = " ".join(map(str, embedding))
            if save_patent_category:
                patent_category = self._get_document_category(categories[i])
            else:
                patent_category = "UNKNOWN"

            record = [("PatentName", patent_name),
                      ("DocumentEmbedding", embedding),
                      ("PatentCategory", patent_category)]

            # Use the 'database' argument; the previous code called an
            # undefined name 'db' here.
            database.insert(table=table_name, record=record)
Пример #10
0
    def train(self, patents):
        """
        Generates document embedding for a patent document.

        For every patent the ``np.ndarray`` entries of ``patent.words[1]``
        are averaged into one vector of length ``self.embedding_size``
        (all zeros when there is no such entry). The vectors, the document
        labels and the document categories are written to file-backed arrays
        at ``PATENT_EMBEDDING``, ``PATENT_LABEL`` and ``PATENT_CATEGORY``.
        Label and category are derived from ``patent.tags[0]``, which is
        treated as a file path: the label is the file name without its last
        extension, the category is the name of the containing directory.

        :param patents: sized iterable of documents exposing ``words`` and
            ``tags``; its length is stored in ``self.total_docs``.
        :raises PathNotFoundError: if the directory of any of the three
            output files does not exist.
        """
        log.info("Training AvgPatent2Vec model")

        # The output directories must already exist. dirname() (falling back
        # to the current directory) replaces rsplit(os.sep, 1)[0], which
        # returned the file name itself for a path without a separator.
        for output_file in (PATENT_EMBEDDING, PATENT_LABEL, PATENT_CATEGORY):
            output_directory = os.path.dirname(output_file) or os.curdir
            if not os.path.exists(output_directory):
                raise PathNotFoundError("Path does not exist: %s"
                                        % output_directory)

        self.total_docs = len(patents)

        doc_embeddings = np.memmap(PATENT_EMBEDDING,
                                   dtype='float32',
                                   mode='w+',
                                   shape=(self.total_docs, self.embedding_size))

        # NOTE(review): object-dtype memmaps hold object references rather
        # than the text itself -- confirm numpy accepts this dtype and that
        # the values can be read back later.
        doc_labels = np.memmap(PATENT_LABEL,
                               dtype="object",
                               mode='w+',
                               shape=(self.total_docs,))

        doc_categories = np.memmap(PATENT_CATEGORY,
                                   dtype="object",
                                   mode='w+',
                                   shape=(self.total_docs,))

        # Find document embedding by averaging token embeddings
        # for all tokens within a document
        for i, patent in enumerate(patents):
            word_embeddings = patent.words[1]

            word_count = 0
            # Sized from the configured embedding size (was hard-coded to 500,
            # which breaks the row assignment below for any other size)
            doc_embedding = np.zeros((self.embedding_size,), dtype=np.float32)
            for word_embedding in word_embeddings:
                # Entries that are not arrays are skipped
                if isinstance(word_embedding, np.ndarray):
                    doc_embedding += word_embedding
                    word_count += 1

            if word_count != 0:
                doc_embedding /= word_count

            # Document embedding
            doc_embeddings[i] = doc_embedding

            document_path = patent.tags[0]

            # Document label: file name without its last extension
            doc_labels[i] = os.path.basename(document_path).rsplit('.', 1)[0]

            # Document category: name of the directory holding the document.
            # rsplit(os.sep, 2)[1] returned the file name for "dir/file".
            doc_categories[i] = os.path.basename(
                os.path.dirname(document_path))

        # Push pending changes to the backing files before returning
        doc_embeddings.flush()
        doc_labels.flush()
        doc_categories.flush()
Пример #11
0
def main():
    """
    Run the Patent2Vec application end to end.

    Steps: preprocess the corpus at ``SOURCE_DATASET``, build the configured
    models, then for each model train, evaluate, save, store the training
    embeddings in an in-memory database, predict and store embeddings for the
    documents listed in ``TESTING_DATA``, index the table, write the database
    to ``PATENT_EMBEDDING_DATABASE`` and clean the model's temporary data.
    Finally the top matches for ``TEST_DOCUMENT`` are logged for every model
    and leftover files are cleaned up.

    :raises PathNotFoundError: if ``TESTING_DATA`` does not exist.
    """
    log.info("*****Patent2Vec Application*****")

    # Preprocess patent documents
    log.info("Preprocessing patent documents")
    patents = PatentDocument(SOURCE_DATASET,
                             extension="",
                             use_conceptualizer=True,
                             transform_conceptualizer=True,
                             enable_sampling=True,
                             train_ratio=1.0,
                             test_ratio=0.0)

    # Create Patent2Vec model
    models = OrderedDict()

    # PV-DM with average
    models["PV_DM_Mean"] = \
        Patent2Vec(dm=1, dm_mean=1, dm_concat=0, min_word_count=5, size=500,
                   context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
                   use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    models["PV_DM_Mean"].build(patents)
    models["PV_DM_Mean"].intersect_with_pretrained_embedding(
        PRETRAINED_EMBEDDING, binary=False)
    #     models["PV_DM_Mean"].load(PATENT2VEC_MODEL)

    #     # PV-DM with concatenation
    #     models["PV_DM_Concatenation"] = \
    #         Patent2Vec(dm=1, dm_mean=0, dm_concat=1, min_word_count=5, size=500,
    #                    context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                    use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    #     models["PV_DM_Concatenation"].reuse_from(models["PV_DM_Mean"])
    # #     models["PV_DM_Concatenation"].build(patents)
    # #     models["PV_DM_Concatenation"].intersect_with_pretrained_embedding(PRETRAINED_EMBEDDING,
    # #                                                                       binary=False)
    # # #     models["PV_DM_Concatenation"].load(PATENT2VEC_MODEL)

    #     # PV-DBOW
    #     models["PV_DBOW"] = \
    #         Patent2Vec(dm=0, dm_mean=0, dm_concat=0, min_word_count=5, size=500,
    #                    context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                    use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    #     models["PV_DBOW"].reuse_from(models["PV_DM_Mean"])
    # #     models["PV_DBOW"].build(patents)
    # #     models["PV_DBOW"].intersect_with_pretrained_embedding(PRETRAINED_EMBEDDING,
    # #                                                           binary=False)
    # # #     models["PV_DBOW"].load(PATENT2VEC_MODEL)

    #     # Mixed models
    #     models["DBOW + DM with average"] = ConcatenatedPatent2Vec([models["PV_DBOW"],
    #                                                                models["PV_DM_Mean"]])
    #     models["DBOW + DM with concatenation"] = ConcatenatedPatent2Vec([models["PV_DBOW"],
    #                                                                      models["PV_DM_Concatenation"]])

    for model in models.values():
        # Train Patent2Vec model
        start_time = time.time()
        model.train(patents,
                    alpha=0.1,
                    min_alpha=0.0001,
                    passes=10,
                    fixed_alpha=False)
        end_time = time.time()
        log.info("Total time elapsed: %r", (end_time - start_time))

        # Evaluate Patent2Vec model
        model.evaluate()

        # Save Patent2Vec model
        model.save(model=PATENT2VEC_MODEL)

        # Create a database object
        db = Database(verbose=True)

        # Connect to database
        db.connect(in_memory=True)

        # Create a new table for storing document embeddings
        db.create_table(table=PATENT_EMBEDDING_TABLE,
                        primary_column=PRIMARY_KEY,
                        other_columns=FIELDS)

        # Save document embeddings
        model.save_document_embeddings(document_embeddings=PATENT_EMBEDDING,
                                       rows=len(patents),
                                       columns=500,
                                       database=db,
                                       table_name=PATENT_EMBEDDING_TABLE,
                                       save_patent_category=True,
                                       prepend_document_category=True)

        # Test documents
        if not os.path.exists(TESTING_DATA):
            raise PathNotFoundError("Path does not exist: %s" % TESTING_DATA)

        # Build a real list of the non-empty, stripped lines. map()/filter()
        # return lazy one-shot iterators on Python 3, which cannot be
        # measured or traversed a second time by the consumer.
        with open(TESTING_DATA, "r") as t:
            stripped_lines = [line.strip() for line in t]
        test_documents = [line for line in stripped_lines if line]

        # Preprocessed test documents
        preprocessed_test_documents = patents.get_preprocessed_corpus(
            test_documents)

        # Predict document embeddings
        model.predict(preprocessed_test_documents,
                      alpha=0.1,
                      min_alpha=0.0001,
                      steps=50,
                      save=True,
                      database=db,
                      table_name=PATENT_EMBEDDING_TABLE,
                      save_patent_category=True,
                      prepend_document_category=True)

        # Create an index on document embedding table
        db.create_index(index=PATENT_EMBEDDING_INDEX,
                        table=PATENT_EMBEDDING_TABLE,
                        index_by_column=PRIMARY_KEY[0])

        # Close database connection
        db.close(save_to=PATENT_EMBEDDING_DATABASE)

        # Delete temporary training data
        model.clean()

    # Test document for checking the quality of Patent2Vec model
    # (token-only mode is switched on just for this one call)
    patents.set_token_only(True)
    preprocessed_test_document = patents.get_preprocessed_document(
        TEST_DOCUMENT)
    patents.set_token_only(False)

    # Check quality of Patent2Vec model
    if preprocessed_test_document is not None:
        log.info("Check quality of Patent2Vec model")
        log.info("Top matches for test document: %s", TEST_DOCUMENT)

        for model in models.values():
            embedding = model.infer(preprocessed_test_document)

            top_matches = model.model.docvecs.most_similar(
                positive=[embedding], negative=[], topn=10)

            # Each match is indexed as (document tag, similarity)
            for top_match in top_matches:
                log.info("%s\t\t%s", top_match[0], top_match[1])

    # Clean all un-necessary files
    clean(cleanSample=True,
          cleanModel=False,
          cleanDocvecs=True,
          cleanDatabase=False,
          cleanClusters=False,
          filter=[])