def main():
    """Demo: fit one CountVectorizer, then swap its analyzer and re-check.

    Relies on module-level ``corpus``, ``check``, ``space_analyzer`` and
    ``comma_analyzer``. Reassigning ``vectorizer.analyzer`` works because
    sklearn builds the analyzer lazily from the attribute at (re)fit time.
    """
    # modeling
    vectorizer = CountVectorizer(analyzer=space_analyzer)
    vectorizer.fit(corpus)
    print(vectorizer.get_feature_names())
    print('')

    # Run the same check under each analyzer in turn; the first pass uses
    # the analyzer the vectorizer was constructed with.
    passes = (
        ('[space vectorizer]', space_analyzer),
        ('[comma vectorizer]', comma_analyzer),
        ('[space vectorizer]', space_analyzer),
    )
    for label, tokenizer in passes:
        print(label)
        vectorizer.analyzer = tokenizer
        check(vectorizer)
    return 0
def build_matrix_count(bmt__document_list, input_type='filename',
                       with_analyzer=False, amr_tool=None):
    """Build a float64 term-document count matrix and its vocabulary.

    :param bmt__document_list: documents to vectorize (interpreted per
        ``input_type``: filenames by default).
    :param input_type: CountVectorizer ``input`` mode ('filename', 'file'
        or 'content').
    :param with_analyzer: if True, tokenize with the vectorizer's default
        analyzer and Porter-stem each token; otherwise use ``amr_tool`` to
        read each document as AMR graphs and count graph node labels.
    :param amr_tool: object exposing ``amr_graph_reader`` and
        ``parse_graph``; required when ``with_analyzer`` is False.
    :return: ``(term_document_matrix, vocabulary)`` tuple.
    :raises ValueError: if ``with_analyzer`` is False and ``amr_tool`` is
        None.
    """
    vectorizer = CountVectorizer(input=input_type, dtype=np.float64)
    analyzer = vectorizer.build_analyzer()

    # One stemmer for the whole corpus instead of a fresh instance per
    # document (the original rebuilt PorterStemmer inside every call).
    stemmer = PorterStemmer()

    def stemm(doc):
        # Lazily stem the tokens produced by the default analyzer.
        return (stemmer.stem(word) for word in analyzer(doc))

    def nodes(doc):
        # Flatten the node labels of every AMR graph found in the document.
        graph_str = amr_tool.amr_graph_reader(doc)
        graph_list = amr_tool.parse_graph(graph_str)
        _nodes = []
        for graph in graph_list:
            _nodes.extend(graph.nodes)
        return _nodes

    if with_analyzer:
        vectorizer.analyzer = stemm
    else:
        if amr_tool is None:
            # Fail fast: the original crashed later with an opaque
            # AttributeError inside fit_transform when amr_tool was None.
            raise ValueError(
                'amr_tool is required when with_analyzer is False')
        vectorizer.analyzer = nodes

    term_document_matrix = vectorizer.fit_transform(bmt__document_list)
    vocabulary = vectorizer.vocabulary_
    return term_document_matrix, vocabulary
def generate_bag_of_words(bdtm__document_list, input_type='filename'):
    """Fit a stemming CountVectorizer and return its vocabulary mapping.

    :param bdtm__document_list: documents to fit on (interpreted per
        ``input_type``: filenames by default).
    :param input_type: CountVectorizer ``input`` mode ('filename', 'file'
        or 'content').
    :return: ``vocabulary_`` dict mapping stemmed term -> feature index.
    """
    vectorizer = CountVectorizer(input=input_type)
    analyzer = vectorizer.build_analyzer()

    # One stemmer for the whole corpus instead of a fresh instance per
    # document (the original rebuilt PorterStemmer inside every call).
    stemmer = PorterStemmer()

    def stemm(doc):
        # Stem each token emitted by the default analyzer.
        return (stemmer.stem(word) for word in analyzer(doc))

    vectorizer.analyzer = stemm
    vectorizer.fit(bdtm__document_list)
    vocabulary = vectorizer.vocabulary_
    return vocabulary
def generate_bag_of_words(self, generate_bow__path_list):
    """Fit a stemming CountVectorizer over files and return matrix + vocab.

    :param generate_bow__path_list: list of file paths to vectorize
        (``input='filename'``).
    :return: ``(term_document_matrix, vocabulary)`` tuple, where
        ``vocabulary`` maps stemmed term -> feature index.
    """
    vectorizer = CountVectorizer(input='filename')
    analyzer = vectorizer.build_analyzer()

    # One stemmer for the whole corpus instead of a fresh instance per
    # document (the original rebuilt PorterStemmer inside every call).
    stemmer = PorterStemmer()

    def stemm(doc):
        # Stem each token emitted by the default analyzer.
        return (stemmer.stem(word) for word in analyzer(doc))

    vectorizer.analyzer = stemm
    term_document_matrix = vectorizer.fit_transform(
        generate_bow__path_list)
    vocabulary = vectorizer.vocabulary_
    return term_document_matrix, vocabulary