示例#1
0
	def __init__(self, X, mX, y, term_idx_store, category_idx_store,
	             metadata_idx_store, raw_texts, unigram_frequency_path=None):
		'''
		Initialize the underlying TermDocMatrix and keep the raw texts.

		Parameters
		----------
		X : csr_matrix
			Term-document matrix.
		mX : csr_matrix
			Metadata-document matrix.
		y : np.array
			Category index array.
		term_idx_store : IndexStore
			Term indices.
		category_idx_store : IndexStore
			Category indices.
		metadata_idx_store : IndexStore
			Document metadata indices.
		raw_texts : np.array or pd.Series
			Raw texts; stored on the instance as ``_raw_texts``.
		unigram_frequency_path : str or None
			Path to term frequency file.
		'''
		# Every argument except raw_texts is forwarded, positionally and in
		# the same order, to the TermDocMatrix initializer.
		TermDocMatrix.__init__(
			self, X, mX, y,
			term_idx_store, category_idx_store, metadata_idx_store,
			unigram_frequency_path)
		self._raw_texts = raw_texts
示例#2
0
    def __init__(self, X, mX, y, term_idx_store, category_idx_store,
                 metadata_idx_store, raw_texts, unigram_frequency_path=None):
        '''
        Initialize the underlying TermDocMatrix and keep the raw texts.

        Parameters
        ----------
        X : csr_matrix
            Term-document matrix.
        mX : csr_matrix
            Metadata-document matrix.
        y : np.array
            Category index array.
        term_idx_store : IndexStore
            Term indices.
        category_idx_store : IndexStore
            Category indices.
        metadata_idx_store : IndexStore
            Document metadata indices.
        raw_texts : np.array or pd.Series
            Raw texts; stored on the instance as ``_raw_texts``.
        unigram_frequency_path : str or None
            Path to term frequency file.
        '''
        # Every argument except raw_texts is forwarded, positionally and in
        # the same order, to the TermDocMatrix initializer.
        TermDocMatrix.__init__(
            self, X, mX, y,
            term_idx_store, category_idx_store, metadata_idx_store,
            unigram_frequency_path,
        )
        self._raw_texts = raw_texts
示例#3
0
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Build a TermDocMatrix of unigram and bigram counts from pre-tokenized text.

    Each document's text has leading/trailing punctuation stripped (from the
    text as a whole, not from each token), is lower-cased, and is split on
    newlines.  Every line is split on whitespace into unigrams; bigrams are
    formed from adjacent unigrams within the same line.

    Parameters
    ----------
    category_text_iter : iterator of (str, str) pairs
        (category name, text) pairs, where the text has one sentence per line
        and whitespace-delimited tokens.

    Returns
    -------
    TermDocMatrix
        The metadata matrix comes from a factory that is never written to,
        and the metadata index store is left empty.
    '''
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    metadata_idx_store = IndexStore()
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    labels = []

    for doc_index, (category, text) in enumerate(category_text_iter):
        labels.append(category_idx_store.getidx(category))
        counts = Counter()
        normalized = text.strip(string.punctuation).lower()
        for line in normalized.split('\n'):
            tokens = line.split()
            pairs = [' '.join(pair) for pair in zip(tokens[:-1], tokens[1:])]
            # Unigrams are indexed before bigrams so term ids are assigned
            # in the same order as tokens are encountered.
            for term in tokens + pairs:
                counts[term_idx_store.getidx(term)] += 1
        for term_index, count in counts.items():
            X_factory[doc_index, term_index] = count

    return TermDocMatrix(
        X=X_factory.get_csr_matrix(),
        mX=mX_factory.get_csr_matrix(),
        y=np.array(labels),
        term_idx_store=term_idx_store,
        metadata_idx_store=metadata_idx_store,
        category_idx_store=category_idx_store,
    )
    def build(self):
        '''
        Construct a TermDocMatrix from the keyword arguments returned by
        ``self._get_build_kwargs()``.

        Returns
        -------
        TermDocMatrix
        '''
        return TermDocMatrix(**self._get_build_kwargs())
 def _apply_pipeline_and_get_build_instance(
         self, X_factory, mX_factory, category_idx_store, df,
         parse_pipeline, term_idx_store, metadata_idx_store, y):
     '''
     Apply ``parse_pipeline.parse`` along axis 1 of ``df``, then assemble a
     TermDocMatrix from the resulting matrices and labels.

     NOTE(review): presumably ``parse_pipeline.parse`` fills ``y`` and the two
     matrix factories as a side effect -- confirm against the pipeline class.

     Returns
     -------
     TermDocMatrix
     '''
     # The pipeline must run before y is converted to an array and before
     # the sparse matrices are built.
     df.apply(parse_pipeline.parse, axis=1)
     labels = np.array(y)
     X, mX = self._build_sparse_matrices(labels, X_factory, mX_factory)
     return TermDocMatrix(X, mX, labels, term_idx_store,
                          category_idx_store, metadata_idx_store)
示例#6
0
 def build(self):
     '''
     Build the corpus object that matches the fields set on this instance.

     Returns
     -------
     ParsedCorpus
         When ``text_df`` and ``parsed_col`` are both set.  If
         ``category_col`` is unset, a 'Category' column is added to
         ``text_df`` from the category values of ``y`` and ``category_col``
         is set to 'Category'.
     CorpusDF
         When ``text_df`` and ``text_col`` are set but ``parsed_col`` is not.
     TermDocMatrix
         In every other case.
     '''
     # Arguments common to all three constructors.
     shared_kwargs = dict(
         X=self.X,
         mX=self.mX,
         y=self.y,
         term_idx_store=self.term_idx_store,
         category_idx_store=self.category_idx_store,
         metadata_idx_store=self.metadata_idx_store,
         unigram_frequency_path=self.unigram_frequency_path)

     if self.text_df is None:
         return TermDocMatrix(**shared_kwargs)

     if self.parsed_col is not None:
         if self.category_col is None:
             category_names = self.category_idx_store.getvalbatch(self.y)
             self.text_df = self.text_df.assign(Category=category_names)
             self.category_col = 'Category'
         return ParsedCorpus(df=self.text_df,
                             parsed_col=self.parsed_col,
                             category_col=self.category_col,
                             **shared_kwargs)

     if self.text_col is not None:
         return CorpusDF(df=self.text_df,
                         text_col=self.text_col,
                         **shared_kwargs)

     # A data frame is present but names neither a parsed nor a text column.
     return TermDocMatrix(**shared_kwargs)
示例#7
0
    def _build_from_category_spacy_doc_iter(self, category_doc_iter):
        '''
        Build a TermDocMatrix from (category, parsed document) pairs.

        Fresh, empty index stores are created for terms, categories and
        metadata and handed to
        ``_get_features_and_labels_from_documents_and_indexes`` together
        with the iterator.

        Parameters
        ----------
        category_doc_iter : iterator of (string category name, spacy.tokens.doc.Doc) pairs

        Returns
        -------
        TermDocMatrix
        '''
        terms = IndexStore()
        categories = IndexStore()
        metadata = IndexStore()
        extract = self._get_features_and_labels_from_documents_and_indexes
        X, mX, y = extract(category_doc_iter, categories, terms, metadata)
        return TermDocMatrix(
            X, mX, y,
            term_idx_store=terms,
            category_idx_store=categories,
            metadata_idx_store=metadata,
        )