def convert_doc_to_vector(self, model: doc2vec.Doc2Vec, token_series: pd.Series, outpath):
    """Infer one vector per document and pickle them as a DataFrame.

    Each entry of ``token_series`` is passed to ``model.infer_vector``; the
    resulting vectors become the rows of a DataFrame that reuses
    ``token_series.index`` as its index, and that DataFrame is written to
    ``outpath`` with ``DataFrame.to_pickle``.

    Args:
        model: Model used to infer the document vectors.
        token_series: Series whose values are the token sequences to embed.
        outpath: Destination handed to ``DataFrame.to_pickle``.

    Returns:
        None. The result is only written to ``outpath``.
    """
    # Convert document tokens to numerical vectors.
    vectors = [model.infer_vector(doc) for doc in token_series]

    if vectors:
        doc_mat = np.vstack(vectors)
    else:
        # np.vstack raises ValueError on an empty sequence, so build the
        # zero-row matrix explicitly. float32 is assumed to match what gensim
        # returns from infer_vector -- TODO confirm against the installed version.
        doc_mat = np.empty((0, model.vector_size), dtype=np.float32)

    # Build a dataframe from the document vectors, keeping the original index.
    df = pd.DataFrame(data=doc_mat, index=token_series.index)
    df.to_pickle(outpath)
def _infer_document_embeddings(model: Doc2Vec, doc_list: List[List[str]]) -> np.ndarray: """ NOTE: Inference is not deterministic therefore representations will vary between calls Returns a 2D array with shape (num_docs, embedding_dimension) """ print('Infering document embeddings..') return np.array([model.infer_vector(doc) for doc in doc_list])
def getAllSentenceEmbeddings(
    model: Doc2Vec,
    taggedDocuments: Sequence[TaggedDocument],
    wordsToEmbed: Path,
) -> Iterable[SentenceEmbeddings]:
    """Yield one ``SentenceEmbeddings`` per target word read from ``wordsToEmbed``.

    Every word returned by ``readTargetWords(wordsToEmbed)`` is lower-cased.
    For each document returned by ``getDocumentsContaining(word,
    taggedDocuments)`` an embedding is inferred from the document's
    ``sentence`` and collected under the document's ``corpusId``.

    Results are yielded as soon as a word has been processed instead of being
    accumulated for all words first, so consumers can stream them and only one
    word's embeddings are held by this function at a time.

    Args:
        model: Model whose ``infer_vector`` produces the sentence embeddings.
        taggedDocuments: Documents searched for each target word.
        wordsToEmbed: Source passed to ``readTargetWords``.

    Yields:
        ``SentenceEmbeddings(word, embeddingsPerCorpus)`` where
        ``embeddingsPerCorpus`` maps a corpus id to the list of embeddings
        inferred for that corpus. A word with no matching documents still
        yields an entry with an empty mapping.
    """
    for word in readTargetWords(wordsToEmbed):
        word = word.lower()

        # Group the sentence embeddings of this target word by corpus id.
        embeddingsPerCorpus = {}
        for corpusDocument in getDocumentsContaining(word, taggedDocuments):
            embedding = model.infer_vector(corpusDocument.sentence)
            embeddingsPerCorpus.setdefault(corpusDocument.corpusId, []).append(embedding)

        yield SentenceEmbeddings(word, embeddingsPerCorpus)
def method_row_vec(method_feat: MethodFeature,
                   source_feat: MethodFeature,
                   pos_feat: PositionFeature,
                   proj_feat: ProjectFeature,
                   model: Doc2Vec):
    """Assemble the flat feature row for one method.

    The row is the concatenation, in this order, of:

    1. the vector inferred by ``model`` from ``method_feat.docs``;
    2. ``source_feat.exception_id`` followed by ``method_feat.package_depth``
       and ``method_feat.param_num`` (note the exception id comes from
       ``source_feat``, the other two from ``method_feat``);
    3. the six position values of ``pos_feat`` (method, class and package
       top/bottom);
    4. ``proj_feat.dependencies_vec``;
    5. ``proj_feat.abstract_vec``;
    6. a single trailing zero.

    Returns:
        The array produced by ``np.concatenate`` over the segments above.
    """
    doc_embedding = model.infer_vector(method_feat.docs)

    scalar_feats = [
        source_feat.exception_id,
        method_feat.package_depth,
        method_feat.param_num,
    ]

    position_feats = [
        pos_feat.method_top,
        pos_feat.method_bottom,
        pos_feat.class_top,
        pos_feat.class_bottom,
        pos_feat.package_top,
        pos_feat.package_bottom,
    ]

    dependencies_part = proj_feat.dependencies_vec
    abstract_part = proj_feat.abstract_vec

    # FIXME: it is unclear why a 0 has to be appended here.
    trailing_zero = np.zeros(shape=(1, ))

    segments = [
        doc_embedding,
        scalar_feats,
        position_feats,
        dependencies_part,
        abstract_part,
        trailing_zero,
    ]
    return np.concatenate(segments)
def make_abstract_vec(project_folder: str,
                      readme_path: Optional[str],
                      model: Doc2Vec) -> np.ndarray:
    """Infer the embedding of a project's abstract.

    The tokens come from ``abstract_to_tokens(project_folder, readme_path)``
    and are passed unchanged to ``model.infer_vector``.

    Args:
        project_folder: Forwarded to ``abstract_to_tokens``.
        readme_path: Forwarded to ``abstract_to_tokens``; may be ``None``.
        model: Model used to infer the vector.

    Returns:
        The vector returned by ``model.infer_vector`` for those tokens.
    """
    return model.infer_vector(abstract_to_tokens(project_folder, readme_path))