Example #1
def vectorize(corpus):
    # Normalise each line with set_sentence() before vectorising.
    corpus_ = []
    for line in corpus:
        corpus_.append(set_sentence(line))

    # French stop words, extended with a couple of corpus-specific tokens.
    stwf = stopwords.words('french')
    stwf.append('les')
    stwf.append('rt')

    # Build the bag-of-words matrix and return it as a dense array.
    vectorizer = CountVectorizer(stop_words=stwf, decode_error="ignore")
    X = vectorizer.fit_transform(corpus_)
    return X.toarray()
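A minimal usage sketch, assuming the same imports as the full script at the end of this page (CountVectorizer, NLTK's French stop words) and that set_sentence from the project's preprocess module returns a cleaned string; the two-sentence corpus is made up.

from nltk.corpus import stopwords                            # needs nltk.download('stopwords') once
from sklearn.feature_extraction.text import CountVectorizer
from preprocess import set_sentence                          # assumed project helper

sample = ["RT Les chats aiment le lait", "Les chiens aiment les os"]
X = vectorize(sample)                                        # dense document-term count matrix
print(X.shape)                                               # (2, size_of_kept_vocabulary)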
Example #2
def vectorize_(corpus):
    # Reuses an already-fitted, module-level `vectorizer` (see Example #1);
    # transform() maps new sentences onto the existing vocabulary.
    corpus_ = []
    for sentence in corpus:
        corpus_.append(set_sentence(sentence))
    return vectorizer.transform(corpus_)
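vectorize_() calls transform() on a module-level vectorizer that is never fitted in this snippet. Below is a rough sketch of the fit-once, transform-later split it relies on, with made-up French sentences; this wiring is an assumption, not something shown in the originals.

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()                          # module-level, as vectorize_() expects
vectorizer.fit(["le chat dort", "le chien court"])      # learn the vocabulary once
X_new = vectorizer.transform(["le chat court"])         # map new sentences onto that vocabulary
print(X_new.toarray())                                  # transform() before fit() would raise NotFittedError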
Example #3
import sys

import numpy as np
from preprocess import set_sentence
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

########################################################
#     input_: corpus.txt                                #
#                                                      #
#     example: python corpusvectorizer.py corpus.txt   #
########################################################
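#
# corpus.txt is assumed to hold one raw sentence or tweet per line, e.g.
# (made-up lines):
#     RT @user Les chats aiment le lait
#     Les chiens aiment les os
#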
# Read the corpus file given on the command line and clean each line.
corpus = []
input_ = sys.argv[1]
with open(input_, 'r') as fs:
    for line in fs:
        corpus.append(set_sentence(line))

# French stop words, extended with a couple of corpus-specific tokens.
stwf = stopwords.words('french')
stwf.append('les')
stwf.append('rt')

# Build the dense bag-of-words count matrix for the whole corpus.
vectorizer = CountVectorizer(stop_words=stwf, decode_error="ignore")
X = vectorizer.fit_transform(corpus)
X = X.toarray()

print(X)
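set_sentence() is imported from the project's preprocess module in every example, but its body is not shown here. The stand-in below is only an assumption (lowercase, strip URLs and punctuation) so the examples can be tried end to end.

# Hypothetical stand-in for preprocess.set_sentence(); the real helper may differ.
import re

def set_sentence(line):
    # Lowercase, drop URLs, keep only letters (including French accents), collapse whitespace.
    line = line.lower()
    line = re.sub(r'https?://\S+', ' ', line)
    line = re.sub(r"[^a-zàâçéèêëîïôûùüÿœ']", ' ', line)
    return ' '.join(line.split())

With NLTK's French stop word list installed (nltk.download('stopwords') once), python corpusvectorizer.py corpus.txt prints the dense document-term count matrix.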