def __init__(self,
             data_frame,
             text_col,
             clean_function=lambda x: x,
             nlp=None,
             feats_from_spacy_doc=None,
             verbose=False):
    '''Set up a factory that creates a TermDocMatrix from a pandas data frame.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Frame holding the category of interest and the document text.
        The instance keeps the result of ``data_frame.reset_index()``
        rather than the object that was passed in.
    text_col : str
        Name of the column containing each document's raw text.
    clean_function : function, optional
        Takes a document's text and returns it with invalid characters
        stripped out.  The default returns the text unchanged.
    nlp : function, optional
        Forwarded untouched to ``TermDocMatrixFactory.__init__``.
    feats_from_spacy_doc : FeatsFromSpacyDoc or None
        Forwarded untouched to ``TermDocMatrixFactory.__init__``.
    verbose : boolean, optional
        If true, prints a message every time a document index % 100 is 0.

    See Also
    --------
    TermDocMatrixFactory
    '''
    # The text-processing options are owned by the parent factory, so it is
    # initialised first with exactly those three settings.
    parent_options = dict(
        clean_function=clean_function,
        nlp=nlp,
        feats_from_spacy_doc=feats_from_spacy_doc,
    )
    TermDocMatrixFactory.__init__(self, **parent_options)

    # State specific to the data-frame-backed factory.
    reindexed_frame = data_frame.reset_index()
    self.data_frame = reindexed_frame
    self._text_col = text_col
    self._verbose = verbose
def build_term_doc_matrix():
    '''Build a term-document matrix from the party speech pairs.

    A ``TermDocMatrixFactory`` is configured with the iterator returned by
    ``iter_party_speech_pairs()``, the cleaner returned by
    ``clean_function_factory()`` and ``whitespace_nlp`` as the ``nlp``
    callable.

    Returns
    -------
    The object produced by ``TermDocMatrixFactory.build()``
    (presumably a TermDocMatrix -- confirm against the factory).
    '''
    factory = TermDocMatrixFactory(
        category_text_iter=iter_party_speech_pairs(),
        clean_function=clean_function_factory(),
        nlp=whitespace_nlp,
    )
    return factory.build()