def vectorize(corpus):
    """Fit a fresh CountVectorizer on ``corpus`` and return the counts.

    Each item of ``corpus`` is first passed through ``set_sentence``.  The
    vectorizer is created locally on every call, ignores decoding errors and
    filters NLTK's French stop-word list extended with ``'les'`` and ``'rt'``.

    Returns the fitted document-term matrix converted with ``.toarray()``.
    """
    cleaned = [set_sentence(entry) for entry in corpus]

    # NLTK's French list plus the two extra tokens this project filters out.
    french_stop_words = stopwords.words('french')
    french_stop_words.extend(['les', 'rt'])

    counter = CountVectorizer(stop_words=french_stop_words,
                              decode_error="ignore")
    return counter.fit_transform(cleaned).toarray()
def vectorize_(corpus):
    """Transform ``corpus`` with the module-level ``vectorizer``.

    Every item of ``corpus`` is passed through ``set_sentence``; the
    resulting list is handed to ``vectorizer.transform`` and that call's
    result is returned as-is.

    ``vectorizer`` is not defined here: it is read from module scope and
    must already exist (and be fitted) when this function is called.
    """
    prepared = [set_sentence(item) for item in corpus]
    return vectorizer.transform(prepared)
def vectorize_(corpus):
    """Return ``vectorizer.transform`` applied to the pre-processed corpus.

    Each sentence of ``corpus`` goes through ``set_sentence`` before the
    whole list is given to the module-level ``vectorizer``.

    NOTE(review): an identical ``vectorize_`` is defined just before this
    one in the file; being the later definition, this is the one that stays
    bound to the name -- confirm whether the repetition is intentional.
    """
    return vectorizer.transform(
        [set_sentence(sentence) for sentence in corpus]
    )
import sys

import numpy as np
from preprocess import set_sentence
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

########################################################
# input_: corpus.txt                                   #
#                                                      #
# example: python corpusvectorizer.py corpus.txt       #
########################################################

# Path of the corpus file, taken from the first command-line argument.
input_ = sys.argv[1]

# Read all lines up front; the context manager closes the file even if
# reading raises.
with open(input_, 'r') as fs:
    lines = fs.readlines()

# One pre-processed sentence per input line.
corpus = [set_sentence(line) for line in lines]

# NLTK's French stop words plus two extra tokens to filter out.
stwf = stopwords.words('french')
stwf.append('les')
stwf.append('rt')

# Kept at module level: ``vectorize_`` in this file reads this name.
vectorizer = CountVectorizer(stop_words=stwf, decode_error="ignore")
X = vectorizer.fit_transform(corpus)
X = X.toarray()

# Parenthesised form prints the same text under Python 2 and Python 3.
print(X)