import numpy as np
from tqdm import tqdm


class sentence2vector:
    def __init__(self,
                 sentences,
                 method='TF-IDF',
                 vector_size=100,
                 min_count=2):
        import spacy
        try:
            # The bare "fr" shortcut only works if the link was created manually;
            # fall back to the packaged model otherwise.
            self.spacy_nlp = spacy.load("fr")
        except OSError:
            # Install with: python -m spacy download fr_core_news_sm
            self.spacy_nlp = spacy.load("fr_core_news_sm")

        self.unknown_token = '<unk>'
        self.sentences = sentences
        self.method = method
        self.vector_size = vector_size
        self.min_count = min_count

        print('Number of documents:', len(self.sentences))
        print('Method of vectorization:', self.method)
        self.preprocessing()
        self.count_word()
        self.vectorize()

    def preprocessing(self):
        print('Preprocessing sentences...')
        # Tokenize and normalize each sentence in place; the with-block closes
        # the progress bar even if the loop is interrupted.
        with tqdm(self.sentences) as t:
            for i, _ in enumerate(t):
                self.sentences[i] = self.raw_to_tokens(self.sentences[i])

    def normalize_accent(self, string):
        string = string.replace('à', 'a')
        string = string.replace('á', 'a')
        string = string.replace('â', 'a')

        string = string.replace('é', 'e')
        string = string.replace('è', 'e')
        string = string.replace('ê', 'e')
        string = string.replace('ë', 'e')

        string = string.replace('î', 'i')
        string = string.replace('ï', 'i')

        string = string.replace('ö', 'o')
        string = string.replace('ô', 'o')
        string = string.replace('ò', 'o')
        string = string.replace('ó', 'o')

        string = string.replace('ù', 'u')
        string = string.replace('û', 'u')
        string = string.replace('ü', 'u')

        string = string.replace('ç', 'c')

        return string

    def raw_to_tokens(self, raw_string):
        # Lower-case the raw text
        string = raw_string.lower()

        # Normalize the accents
        string = self.normalize_accent(string)

        # Tokenize with spaCy
        doc = self.spacy_nlp(string)

        # Keep only alphabetic tokens that are neither punctuation nor stop words
        tokens = [
            token.orth_ for token in doc
            if not token.is_punct and not token.is_stop and token.orth_.isalpha()
        ]

        return tokens

    def vectorize(self):
        if self.method == 'TF-IDF':
            self.tfidf()
        elif self.method == 'doc2vec':
            self.doc2vec()
        else:
            raise ValueError('Unsupported vectorization method: ' + self.method)

    def tfidf(self):
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.decomposition import PCA
        print('Computing TF-IDF vectors...')
        # Create a TfidfVectorizer object
        self.vectorizer = TfidfVectorizer(min_df=self.min_count)

        # vectorize the text
        x = [" ".join(sentence) for sentence in self.sentences]
        sparse_result = self.vectorizer.fit_transform(x)
        self.vocabulary = self.vectorizer.vocabulary_
        print('Vocabulary size:', len(self.vocabulary))
        self.X = sparse_result.toarray()

        # Reduce the feature dimension of X to vector_size with PCA
        pca = PCA(n_components=self.vector_size, copy=False)
        self.X = pca.fit_transform(self.X)

    def doc2vec(self):
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument

        # Tag each tokenized sentence with its index so its document vector
        # can be retrieved by position after training.
        documents = [
            TaggedDocument(doc, [i]) for i, doc in enumerate(self.sentences)
        ]
        print('Training Doc2Vec model...')
        # gensim needs a positive worker count (workers=-1 would spawn no
        # training threads at all).
        self.vectorizer = Doc2Vec(vector_size=self.vector_size,
                                  window=5,
                                  min_count=self.min_count,
                                  hs=0,
                                  negative=5,
                                  workers=4,
                                  alpha=0.025,
                                  min_alpha=1e-5)
        self.vectorizer.build_vocab(documents)
        # gensim < 4.0 API (wv.vocab was replaced by wv.key_to_index in 4.x).
        self.vocabulary = self.vectorizer.wv.vocab
        print('Vocabulary size:', len(self.vocabulary))
        self.vectorizer.train(documents,
                              total_examples=self.vectorizer.corpus_count,
                              epochs=self.vectorizer.epochs)
        # Document vectors are indexed by tag in docvecs (gensim < 4.0).
        self.X = np.array(
            [self.vectorizer.docvecs[i] for i in range(len(self.sentences))])

    def count_word(self):
        print('Building word2count dict...')
        self.word2count = {}
        # Count word frequencies over the tokenized sentences.
        with tqdm(self.sentences) as t:
            for sentence in t:
                for word in sentence:
                    if word in self.word2count:
                        self.word2count[word] += 1
                    else:
                        self.word2count[word] = 1

    def __getitem__(self, key):
        # Both vectorization methods store the final dense vectors in self.X,
        # so lookups go through it regardless of the method.
        return self.X[key]

    def __len__(self):
        return len(self.sentences)
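

# ---------------------------------------------------------------------------
# Minimal usage sketch for `sentence2vector` (not part of the original class).
# The tiny French corpus and the helper name `_demo_sentence2vector` are
# hypothetical; the sketch assumes the fr_core_news_sm spaCy model and
# scikit-learn are installed. Call `_demo_sentence2vector()` manually to run it.
# ---------------------------------------------------------------------------
def _demo_sentence2vector():
    corpus = [
        "Le chat dort sur le canapé.",
        "Le chien joue dans le jardin.",
        "Le chat et le chien dorment ensemble.",
    ]
    # vector_size must not exceed the number of documents/features for the
    # PCA step, hence the tiny value used here.
    s2v = sentence2vector(corpus, method='TF-IDF', vector_size=2, min_count=1)
    print(len(s2v), 'sentences, first vector:', s2v[0])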


import os
import pickle
import re
import sys

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# NOTE: `platform` (a project helper module providing `filename()`) and
# `Preprocessor` are assumed to be importable from elsewhere in this repository.


class Vectorizer:

    vector_size = 300

    bowargs = {
        "max_features": vector_size,
        "stop_words" : 'english',
        "max_df" : 0.5,
        "min_df" : 0.01
    }

    tfidfargs = {
        "max_df" : 1.0,
        "min_df" : 1,
        "max_features" : vector_size,
        "stop_words" : 'english'
    }

    w2vargs = {
        "size" : vector_size,  # gensim < 4.0 argument name; renamed "vector_size" in 4.x
        "window" : 5,
        "min_count" : 2,
        "sg" : 1,
        "hs" : 0,
        "negative" : 10,
        "workers" : 2,
        "seed" : 34
    }

    supported_methods = { 'word2vec', 'bagofwords', 'tfidf' }


    def __init__(self, method='word2vec'):

        # Normalize the method name: drop underscores, hyphens and spaces.
        self.method = re.sub(r'[_\- ]', '', method)

        if self.method == 'word2vec':
            self.underlying = Word2Vec(**self.w2vargs)
        elif self.method == 'bagofwords':
            self.underlying = CountVectorizer(**self.bowargs)
        elif self.method == 'tfidf':
            self.underlying = TfidfVectorizer(**self.tfidfargs)
        else:
            raise ValueError("'" + self.method + "' is not supported; expected one of " + ', '.join(sorted(self.supported_methods)))


    def vectorize(self, preprocessor, dictionary, save=True):

        if isinstance(preprocessor, list):

            path = platform.filename(preprocessor, ['preprocessed', self.method] + (['augmented'] if dictionary else [])) + '.pkl'

            if not os.path.isfile(path):
                raise ValueError("'" + path + "' is not a file")

            with open(path, 'rb') as file:
                labels, vectors = pickle.load(file)

                print('<LOG>: Loaded', len(vectors), 'vectors from', path, '[' + str(len(list(vectors.values())[0])), 'features each]', file=sys.stderr)

                return dict(zip(vectors.keys(), labels)), vectors

        # Check the type before touching preprocessor.path, so bad input raises
        # the intended ValueError instead of an AttributeError.
        if not isinstance(preprocessor, Preprocessor):
            raise ValueError("'preprocessor' is not an instance of 'Preprocessor'")

        path = '_'.join([preprocessor.path, self.method] + (['augmented'] if dictionary else [])) + '.pkl'

        return self.process(preprocessor, dictionary, path if save else None)


    def process(self, preprocessor, dictionary, path):

        tweets = list(preprocessor.tweets.values())

        if self.method == 'word2vec':

            self.underlying.build_vocab(tweets)

            self.underlying.train(sentences=tweets, total_examples=len(tweets), epochs=20)

            vectors = [None] * len(tweets)

            # Represent each tweet by the mean of its token vectors; tokens
            # missing from the vocabulary get a random vector drawn uniformly
            # from [-1, 1].
            for i, tweet in enumerate(tweets):
                vector = [None] * len(tweet)

                for j, token in enumerate(tweet):
                    if token in self.underlying.wv:
                        vector[j] = self.underlying.wv[token]
                    else:
                        vector[j] = 2.0 * np.random.rand(self.vector_size) - 1.0

                vectors[i] = np.mean(vector, axis=0)

        else:

            concatenated = [' '.join(tweet) for tweet in tweets]

            vectors = self.underlying.fit_transform(concatenated).toarray()

        # Track the value range of the vectors; it also bounds the scaling of
        # the dictionary valences when the vectors are augmented.
        flattened = list(np.asarray(vectors).flatten())

        vmin, vmax = min(flattened), max(flattened)

        if dictionary:

            augmented = [None] * len(vectors)

            for i, valences in enumerate(dictionary.per_tweet(tweets, (vmin, vmax))):
                augmented[i] = np.concatenate((vectors[i], valences))

            vectors = augmented

        print('<LOG>: The', ('augmented ' if dictionary else '') + 'vectors\' values are in the range', '[' + '{0:.4f}'.format(vmin), ',', '{0:.4f}'.format(vmax) + ']', file=sys.stderr)

        vectors = dict(zip(preprocessor.tweets.keys(), vectors))

        if path:
            with open(path, 'wb') as file:

                pickle.dump((list(preprocessor.labels.values()), vectors), file)

                print('<LOG>: Saved', len(vectors), 'vectors to', path, '[' + str(len(list(vectors.values())[0])), 'features each]', file=sys.stderr)

        return preprocessor.labels, vectors
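

# ---------------------------------------------------------------------------
# Minimal usage sketch for `Vectorizer` (not part of the original class).
# A real run goes through `vectorize()` with a `Preprocessor` instance; since
# that class lives elsewhere in the repository, this sketch calls `process()`
# directly with a hypothetical stand-in `_FakePreprocessor` exposing the two
# attributes the method needs (`tweets` and `labels` keyed by tweet id), with
# no sentiment dictionary and no on-disk caching. Call `_demo_vectorizer()`
# manually to run it.
# ---------------------------------------------------------------------------
def _demo_vectorizer():
    class _FakePreprocessor:
        tweets = {
            0: ['really', 'love', 'this', 'movie'],
            1: ['worst', 'film', 'ever', 'made'],
            2: ['the', 'plot', 'was', 'fine'],
        }
        labels = {0: 'positive', 1: 'negative', 2: 'neutral'}

    # 'tf-idf' is normalized to 'tfidf' by the constructor.
    vectorizer = Vectorizer(method='tf-idf')
    labels, vectors = vectorizer.process(_FakePreprocessor(), dictionary=None, path=None)
    print(labels, {key: vec.shape for key, vec in vectors.items()})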