def _list_documents(self, source, extension=".xml", save=False):
    """
    List all patent documents within corpus.

    Walks ``source`` recursively and collects every file whose name does
    not start with '.' and ends with ``extension``. When
    ``self.enable_sampling`` is set, a file goes to the training list if
    ``random.random() <= self.train_ratio`` and to the testing list
    otherwise; without sampling every matching file goes to the training
    list.

    Args:
        source: Root directory of the corpus.
        extension: Required file name suffix.
        save: When true, write the training paths to TRAINING_DATA and the
            testing paths to TESTING_DATA, one path per line.

    Returns:
        Tuple ``(documents, test_documents)`` of file path lists.

    Raises:
        PathNotFoundError: If ``source`` does not exist, or if ``save`` is
            set and SAMPLED_DATA_PATH does not exist.
    """
    if not os.path.exists(source):
        # basename() instead of rsplit(os.sep, 1)[1]: the latter raises
        # IndexError for a path without a separator, hiding the real error.
        raise PathNotFoundError("%s: Source does not exist!"
                                % os.path.basename(source))

    documents = []
    test_documents = []

    for root, _folders, files in os.walk(source):
        for file_name in files:
            if file_name.startswith('.'):
                continue
            if not file_name.endswith(extension):
                continue

            document = os.path.join(root, file_name)

            # A random draw is consumed only for matching files and only
            # when sampling is enabled.
            if not self.enable_sampling:
                documents.append(document)
            elif random.random() <= self.train_ratio:
                documents.append(document)
            else:
                test_documents.append(document)

    if save:
        if not os.path.exists(SAMPLED_DATA_PATH):
            raise PathNotFoundError(
                "Sampled data path does not exist: %s" % SAMPLED_DATA_PATH)

        with open(TRAINING_DATA, "w") as f:
            f.write("\n".join(documents))

        with open(TESTING_DATA, "w") as f:
            f.write("\n".join(test_documents))

    return (documents, test_documents)
def load(self, model):
    """
    Load Patent2Vec model.

    Args:
        model: Path of the saved model; handed to ``Doc2Vec.load``.

    Raises:
        PathNotFoundError: If ``model`` does not exist on disk.
    """
    log.info("Loading Patent2Vec model")

    model_found = os.path.exists(model)
    if not model_found:
        message = "Patent2Vec model does not exist: %s" % model
        raise PathNotFoundError(message)

    # The loaded model replaces whatever this instance held before
    self.model = Doc2Vec.load(model)
def save_document_embeddings(self,
                             document_embeddings=None,
                             rows=None,
                             columns=500,
                             database=None,
                             table_name=None,
                             save_patent_category=True,
                             prepend_document_category=False):
    """
    Save document embeddings to database.

    Reads a float32 embedding matrix of shape ``(rows, columns)`` from the
    ``document_embeddings`` file and inserts one record per row. The row
    index is mapped to a doctag through ``self.model.docvecs``; the patent
    name and category are derived from that doctag.

    Args:
        document_embeddings: Path of the embedding file; defaults to
            PATENT_EMBEDDING.
        rows: Number of embeddings stored in the file (required).
        columns: Dimensionality of each embedding.
        database: Object exposing ``insert(table=..., record=...)``
            (required).
        table_name: Target table (required).
        save_patent_category: When false, the category is stored as
            "UNKNOWN".
        prepend_document_category: Forwarded to
            ``self._get_document_label``.

    Raises:
        PathNotFoundError: If the embedding file does not exist.
        ValueError: If ``rows``, ``database`` or ``table_name`` is None.
    """
    log.info("Saving document embeddings")

    if document_embeddings is None:
        document_embeddings = PATENT_EMBEDDING

    if not os.path.exists(document_embeddings):
        raise PathNotFoundError("Path does not exist: %s"
                                % document_embeddings)

    # Mandatory arguments, checked in the order callers have come to expect
    mandatory = ((rows, "'rows' not defined!"),
                 (database, "'database' not defined!"),
                 (table_name, "'table_name' not defined!"))
    for value, complaint in mandatory:
        if value is None:
            raise ValueError(complaint)

    # Memory-map the embeddings instead of loading them all into RAM
    vectors = np.memmap(document_embeddings,
                        dtype='float32',
                        mode='r',
                        shape=(rows, columns))

    # One database record per embedding row
    for position, vector in enumerate(vectors):
        doctag = self.model.docvecs.index_to_doctag(position)
        patent_name = self._get_document_label(doctag,
                                               prepend_document_category)
        serialized = " ".join(str(component) for component in vector)
        patent_category = (self._get_document_category(doctag)
                           if save_patent_category else "UNKNOWN")

        database.insert(table=table_name,
                        record=[("PatentName", patent_name),
                                ("DocumentEmbedding", serialized),
                                ("PatentCategory", patent_category)])
def _load_conceptualizer(self, model, log_every=1000000):
    """
    Load conceptualizer model.

    Creates a new ``Conceptualizer``, stores it on
    ``self.conceptualizer`` and loads ``model`` into it.

    Args:
        model: Path of the conceptualizer model file.
        log_every: Forwarded unchanged to ``Conceptualizer.load``.

    Raises:
        PathNotFoundError: If ``model`` does not exist.
    """
    if not os.path.exists(model):
        # basename() instead of rsplit(os.sep, 1)[1]: the latter raises
        # IndexError for a bare file name, hiding the real error.
        raise PathNotFoundError("%s: Conceptualizer does not exist!"
                                % os.path.basename(model))

    # Create conceptualizer's object
    self.conceptualizer = Conceptualizer()

    # Load conceptualizer model
    self.conceptualizer.load(model_path=model, log_every=log_every)
def save(self, model=None, path=None):
    """
    Save Patent2Vec model.

    Args:
        model: File name of the saved model; defaults to the last path
            component of PATENT2VEC_MODEL.
        path: Directory to save into; defaults to PATENT2VEC_MODEL_PATH.

    Raises:
        PathNotFoundError: If ``path`` does not exist.
    """
    log.info("Saving Patent2Vec model")

    # Fall back to the configured file name / directory
    model = PATENT2VEC_MODEL.rsplit(os.sep, 1)[1] if model is None else model
    path = PATENT2VEC_MODEL_PATH if path is None else path

    if not os.path.exists(path):
        raise PathNotFoundError("Path does not exist: %s" % path)

    destination = os.path.join(path, model)
    self.model.save(destination)
def standardize_embeddings(self, document_embeddings, rows, columns):
    """
    Standardize document embeddings.

    Applies ``zscore`` to ``document_embeddings`` and writes the result
    into a float32 memory map of shape ``(rows, columns)`` backed by
    STANDARDIZED_PATENT_EMBEDDING.

    Args:
        document_embeddings: Embeddings to standardize.
        rows: Number of rows of the output memory map.
        columns: Number of columns of the output memory map.

    Returns:
        The memory map holding the standardized embeddings.

    Raises:
        PathNotFoundError: If the directory part of
            STANDARDIZED_PATENT_EMBEDDING does not exist.
    """
    target_directory = STANDARDIZED_PATENT_EMBEDDING.rsplit(os.sep, 1)[0]
    if not os.path.exists(target_directory):
        raise PathNotFoundError("Path does not exist: %s" % target_directory)

    # The backing file is created (mode 'w+') before scores are computed
    output = np.memmap(STANDARDIZED_PATENT_EMBEDDING,
                       dtype='float32',
                       mode='w+',
                       shape=(rows, columns))

    scores = np.array(zscore(document_embeddings))
    output[:] = scores[:]

    return output
def evaluate(self):
    """
    Evaluate Patent2Vec model.

    Runs ``self.model.accuracy`` on WORD2VEC_BENCHMARK_DATA and logs the
    total number of predictions with the percentage of correct and
    incorrect ones, taken from the last entry of the returned list.

    Raises:
        PathNotFoundError: If WORD2VEC_BENCHMARK_DATA does not exist.
    """
    log.info("Evaluating Patent2Vec model")

    if not os.path.exists(WORD2VEC_BENCHMARK_DATA):
        # basename() cannot raise IndexError the way rsplit(...)[1] does
        # when the configured path has no separator.
        raise PathNotFoundError("%s: Evaluation dataset does not exist!"
                                % os.path.basename(WORD2VEC_BENCHMARK_DATA))

    # Evaluate Patent2Vec model
    accuracy = self.model.accuracy(WORD2VEC_BENCHMARK_DATA)

    # Find correct and incorrect predictions
    # NOTE(review): the last entry is presumably the overall summary
    # section -- confirm against the model's accuracy() documentation.
    correct = len(accuracy[-1]['correct'])
    incorrect = len(accuracy[-1]['incorrect'])
    total = correct + incorrect

    def percentage(count):
        # 100.0 forces float division on Python 2 as well; an empty
        # result set reports 0% instead of raising ZeroDivisionError.
        if total == 0:
            return 0.0
        return 100.0 * count / total

    # Calculate correct and incorrect predictions' percentage
    log.info("Total: %d, Correct: %0.2f%%, Incorrect: %0.2f%%",
             total, percentage(correct), percentage(incorrect))
def get_preprocessed_document(self, document):
    """
    Get preprocessed patent document.

    Tries each encoding in ``PatentDocument.SOURCE_ENCODING`` in order
    and returns the result for the first one that reads without a
    ``UnicodeDecodeError``.

    Args:
        document: Path of the patent document.

    Returns:
        The preprocessed content when ``self.token_only`` is set,
        otherwise a ``TaggedDocument`` tagged with the document path.
        ``None`` if no encoding could decode the document.

    Raises:
        PathNotFoundError: If ``document`` does not exist.
    """
    if not os.path.exists(document):
        # basename() instead of rsplit(os.sep, 1)[1]: the latter raises
        # IndexError for a bare file name, hiding the real error.
        raise PathNotFoundError("%s: Document does not exist!"
                                % os.path.basename(document))

    for source_encoding in PatentDocument.SOURCE_ENCODING:
        with codecs.open(document, "r", source_encoding) as d:
            try:
                content = self._read(d)
            except UnicodeDecodeError:
                # Wrong encoding guess; try the next candidate
                continue

            # Preprocessing stays inside the `with` block in case the
            # content still depends on the open file handle.
            preprocessed_content = self._preprocess(content, lowercase=True)

            if self.token_only:
                return preprocessed_content
            else:
                return TaggedDocument(preprocessed_content, [document])

    return None
def save_document_embeddings(self,
                             document_embeddings=None,
                             doc_labels=None,
                             doc_categories=None,
                             rows=None,
                             columns=500,
                             database=None,
                             table_name=None,
                             save_patent_category=True):
    """
    Save document embeddings to database.

    Memory-maps the embedding, label and category files and inserts one
    record per embedding row into ``table_name``.

    Args:
        document_embeddings: Path of the embedding file; defaults to
            PATENT_EMBEDDING.
        doc_labels: Path of the label file; defaults to PATENT_LABEL.
        doc_categories: Path of the category file; defaults to
            PATENT_CATEGORY.
        rows: Number of stored documents; defaults to ``self.total_docs``.
        columns: Embedding dimensionality; ``None`` selects
            ``self.embedding_size``.
        database: Object exposing ``insert(table=..., record=...)``
            (required).
        table_name: Target table (required).
        save_patent_category: When false, the category is stored as
            "UNKNOWN".

    Raises:
        PathNotFoundError: If any of the three files does not exist.
        ValueError: If ``database`` or ``table_name`` is None.
    """
    log.info("Saving document embeddings")

    if document_embeddings is None:
        document_embeddings = PATENT_EMBEDDING

    if doc_labels is None:
        doc_labels = PATENT_LABEL

    if doc_categories is None:
        doc_categories = PATENT_CATEGORY

    if not os.path.exists(document_embeddings):
        raise PathNotFoundError("Path does not exist: %s"
                                % document_embeddings)

    if not os.path.exists(doc_labels):
        raise PathNotFoundError("Path does not exist: %s" % doc_labels)

    if not os.path.exists(doc_categories):
        raise PathNotFoundError("Path does not exist: %s" % doc_categories)

    if rows is None:
        rows = self.total_docs

    if columns is None:
        columns = self.embedding_size

    if database is None:
        raise ValueError("'database' not defined!")

    if table_name is None:
        raise ValueError("'table_name' not defined!")

    # Create a memory map with document embeddings for reducing load on RAM
    embeddings = np.memmap(document_embeddings,
                           dtype='float32',
                           mode='r',
                           shape=(rows, columns))

    # Create a memory map with document labels for reducing load on RAM
    # NOTE(review): NumPy may not support memory-mapping dtype="object";
    # confirm this works with the NumPy version in use.
    labels = np.memmap(doc_labels,
                       dtype="object",
                       mode='r',
                       shape=(rows,))

    # Create a memory map with document categories for reducing load on RAM
    categories = np.memmap(doc_categories,
                           dtype="object",
                           mode='r',
                           shape=(rows,))

    # Insert document embedding records into database
    for i, embedding in enumerate(embeddings):
        patent_name = labels[i]
        embedding = " ".join(map(str, embedding))

        if save_patent_category:
            patent_category = self._get_document_category(categories[i])
        else:
            patent_category = "UNKNOWN"

        record = [("PatentName", patent_name),
                  ("DocumentEmbedding", embedding),
                  ("PatentCategory", patent_category)]

        # Use the `database` argument validated above; the previous code
        # referenced an undefined name `db` here.
        database.insert(table=table_name, record=record)
def train(self, patents):
    """
    Generate a document embedding for every patent document.

    Each document embedding is the mean of the document's token
    embeddings, read from ``patent.words[1]``; entries that are not
    ``np.ndarray`` are skipped. Embeddings, labels and categories are
    written to memory maps backed by PATENT_EMBEDDING, PATENT_LABEL and
    PATENT_CATEGORY. ``self.total_docs`` is set to ``len(patents)``.

    Args:
        patents: Sized, iterable collection of documents exposing
            ``words`` and ``tags``; ``tags[0]`` is treated as the
            document's file path.

    Raises:
        PathNotFoundError: If the directory of any output file does not
            exist.
    """
    log.info("Training AvgPatent2Vec model")

    if not os.path.exists(PATENT_EMBEDDING.rsplit(os.sep, 1)[0]):
        raise PathNotFoundError("Path does not exist: %s"
                                % PATENT_EMBEDDING.rsplit(os.sep, 1)[0])

    if not os.path.exists(PATENT_LABEL.rsplit(os.sep, 1)[0]):
        raise PathNotFoundError("Path does not exist: %s"
                                % PATENT_LABEL.rsplit(os.sep, 1)[0])

    if not os.path.exists(PATENT_CATEGORY.rsplit(os.sep, 1)[0]):
        raise PathNotFoundError("Path does not exist: %s"
                                % PATENT_CATEGORY.rsplit(os.sep, 1)[0])

    self.total_docs = len(patents)

    doc_embeddings = np.memmap(PATENT_EMBEDDING,
                               dtype='float32',
                               mode='w+',
                               shape=(self.total_docs, self.embedding_size))

    # NOTE(review): NumPy may not support memory-mapping dtype="object";
    # confirm this works with the NumPy version in use.
    doc_labels = np.memmap(PATENT_LABEL,
                           dtype="object",
                           mode='w+',
                           shape=(self.total_docs,))

    doc_categories = np.memmap(PATENT_CATEGORY,
                               dtype="object",
                               mode='w+',
                               shape=(self.total_docs,))

    # Find document embedding by averaging token embeddings
    # for all tokens within a document
    for i, patent in enumerate(patents):
        word_embeddings = patent.words[1]

        word_count = 0
        # Sized from self.embedding_size (previously hard-coded to 500) so
        # the vector always matches the row width of `doc_embeddings`.
        doc_embedding = np.zeros((self.embedding_size,), dtype=np.float32)
        for word_embedding in word_embeddings:
            if isinstance(word_embedding, np.ndarray):
                doc_embedding += word_embedding
                word_count += 1

        # A document without usable token embeddings keeps the zero vector
        if word_count != 0:
            doc_embedding /= word_count

        # Document embedding
        doc_embeddings[i] = doc_embedding

        document_path = patent.tags[0]

        # Document label: file name without its last extension
        doc_label = os.path.basename(document_path)
        doc_label = doc_label.rsplit('.', 1)[0]
        doc_labels[i] = doc_label

        # Document category: name of the directory holding the file.
        # rsplit(os.sep, 2)[1] returned the file name instead whenever the
        # path contained only one separator.
        doc_category = os.path.basename(os.path.dirname(document_path))
        doc_categories[i] = doc_category
def main():
    """
    Run the Patent2Vec application end to end.

    Steps, in order: build the preprocessed corpus, build the configured
    Patent2Vec model(s), then for every model train, evaluate, save,
    store the document embeddings in a database, predict embeddings for
    the documents listed in TESTING_DATA, index and close the database,
    and delete temporary training data. Finally the top matches for
    TEST_DOCUMENT are logged and generated files are cleaned up.

    Raises:
        PathNotFoundError: If TESTING_DATA does not exist.
    """
    log.info("*****Patent2Vec Application*****")

    # Preprocess patent documents
    log.info("Preprocessing patent documents")
    patents = PatentDocument(SOURCE_DATASET,
                             extension="",
                             use_conceptualizer=True,
                             transform_conceptualizer=True,
                             enable_sampling=True,
                             train_ratio=1.0,
                             test_ratio=0.0)

    # Create Patent2Vec model
    # Only "PV_DM_Mean" is active; the other configurations below are
    # kept commented out.
    models = OrderedDict()

    # PV-DM with average
    models["PV_DM_Mean"] = \
        Patent2Vec(dm=1, dm_mean=1, dm_concat=0, min_word_count=5, size=500,
                   context_window_size=8, negative=2, iter=50,
                   workers=CPU_CORE, use_less_memory=False,
                   docvecs_mapfile=DOCVECS_MAP)
    models["PV_DM_Mean"].build(patents)
    models["PV_DM_Mean"].intersect_with_pretrained_embedding(
        PRETRAINED_EMBEDDING, binary=False)
    # models["PV_DM_Mean"].load(PATENT2VEC_MODEL)

    # # PV-DM with concatenation
    # models["PV_DM_Concatenation"] = \
    #     Patent2Vec(dm=1, dm_mean=0, dm_concat=1, min_word_count=5, size=500,
    #                context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    # models["PV_DM_Concatenation"].reuse_from(models["PV_DM_Mean"])
    # # models["PV_DM_Concatenation"].build(patents)
    # # models["PV_DM_Concatenation"].intersect_with_pretrained_embedding(PRETRAINED_EMBEDDING,
    # #                                                                    binary=False)
    # # # models["PV_DM_Concatenation"].load(PATENT2VEC_MODEL)

    # # PV-DBOW
    # models["PV_DBOW"] = \
    #     Patent2Vec(dm=0, dm_mean=0, dm_concat=0, min_word_count=5, size=500,
    #                context_window_size=8, negative=2, iter=50, workers=CPU_CORE,
    #                use_less_memory=False, docvecs_mapfile=DOCVECS_MAP)
    # models["PV_DBOW"].reuse_from(models["PV_DM_Mean"])
    # # models["PV_DBOW"].build(patents)
    # # models["PV_DBOW"].intersect_with_pretrained_embedding(PRETRAINED_EMBEDDING,
    # #                                                        binary=False)
    # # # models["PV_DBOW"].load(PATENT2VEC_MODEL)

    # # Mixed models
    # models["DBOW + DM with average"] = ConcatenatedPatent2Vec([models["PV_DBOW"],
    #                                                            models["PV_DM_Mean"]])
    # models["DBOW + DM with concatenation"] = ConcatenatedPatent2Vec([models["PV_DBOW"],
    #                                                                  models["PV_DM_Concatenation"]])

    for name, model in models.items():
        # Train Patent2Vec model and log the elapsed wall-clock time
        start_time = time.time()
        model.train(patents, alpha=0.1, min_alpha=0.0001, passes=10,
                    fixed_alpha=False)
        end_time = time.time()
        log.info("Total time elapsed: %r", (end_time - start_time))

        # Evaluate Patent2Vec model
        model.evaluate()

        # Save Patent2Vec model
        model.save(model=PATENT2VEC_MODEL)

        # Create a database object
        db = Database(verbose=True)

        # Connect to database (in-memory; written out by db.close below)
        db.connect(in_memory=True)

        # Create a new table for storing document embeddings
        db.create_table(table=PATENT_EMBEDDING_TABLE,
                        primary_column=PRIMARY_KEY,
                        other_columns=FIELDS)

        # Save document embeddings
        model.save_document_embeddings(document_embeddings=PATENT_EMBEDDING,
                                       rows=len(patents),
                                       columns=500,
                                       database=db,
                                       table_name=PATENT_EMBEDDING_TABLE,
                                       save_patent_category=True,
                                       prepend_document_category=True)

        # Test documents: one path per line, blank lines dropped
        if not os.path.exists(TESTING_DATA):
            raise PathNotFoundError("Path does not exist: %s" % TESTING_DATA)

        with open(TESTING_DATA, "r") as t:
            test_documents = t.readlines()
            test_documents = map(lambda x: x.strip(), test_documents)
            test_documents = filter(None, test_documents)

        # Preprocessed test documents
        preprocessed_test_documents = patents.get_preprocessed_corpus(
            test_documents)

        # Predict document embeddings and store them in the same table
        model.predict(preprocessed_test_documents,
                      alpha=0.1,
                      min_alpha=0.0001,
                      steps=50,
                      save=True,
                      database=db,
                      table_name=PATENT_EMBEDDING_TABLE,
                      save_patent_category=True,
                      prepend_document_category=True)

        # Create an index on document embedding table
        db.create_index(index=PATENT_EMBEDDING_INDEX,
                        table=PATENT_EMBEDDING_TABLE,
                        index_by_column=PRIMARY_KEY[0])

        # Close database connection
        db.close(save_to=PATENT_EMBEDDING_DATABASE)

        # Delete temporary training data
        model.clean()

    # Test document for checking the quality of Patent2Vec model.
    # Token-only mode is switched on just for this one call, then
    # switched back off.
    patents.set_token_only(True)
    preprocessed_test_document = patents.get_preprocessed_document(
        TEST_DOCUMENT)
    patents.set_token_only(False)

    # Check quality of Patent2Vec model
    # (get_preprocessed_document may return None, hence the guard)
    if preprocessed_test_document is not None:
        log.info("Check quality of Patent2Vec model")
        log.info("Top matches for test document: %s", TEST_DOCUMENT)

        for name, model in models.items():
            embedding = model.infer(preprocessed_test_document)

            top_matches = model.model.docvecs.most_similar(
                positive=[embedding], negative=[], topn=10)
            # Format each match as "<first item>\t\t<second item>"
            top_matches = map(lambda x: x[0] + "\t\t" + str(x[1]),
                              top_matches)

            for top_match in top_matches:
                log.info(top_match)

    # Clean all un-necessary files
    clean(cleanSample=True,
          cleanModel=False,
          cleanDocvecs=True,
          cleanDatabase=False,
          cleanClusters=False,
          filter=[])