import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer as vectorizer  # assumed alias
from sklearn.metrics.pairwise import sigmoid_kernel


def recommend(search_word):
    # pre_process() (defined elsewhere) returns the movie frame with a 'bow' column.
    movie_df = pre_process()
    tfv = vectorizer(min_df=3, max_features=None, strip_accents='unicode',
                     analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1, 3), stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['bow'])
    # Pairwise sigmoid-kernel similarity between every pair of movies.
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    # Title -> row-position lookup table.
    index = pd.Series(movie_df.index, index=movie_df['original_title']).drop_duplicates()
    try:
        idx = index[search_word]
        sig_scores = sorted(enumerate(sig[idx]), key=lambda x: x[1], reverse=True)
        sig_scores = sig_scores[1:15]  # skip the movie itself, keep 14 neighbours
        movie_indices = [i[0] for i in sig_scores]
        return list(movie_df['original_title'].iloc[movie_indices])
    except KeyError:  # unknown title
        return None
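# A quick usage sketch, assuming pre_process() supplies the movie data; the
# query title 'Avatar' is hypothetical, not taken from the source.
similar = recommend('Avatar')
print(similar if similar is not None else 'title not found')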
def __init__(self, doc_dict):
    # Fit TF-IDF over the document bodies; self.tokens is the class's tokenizer.
    self._tfidf_vectorizer = vectorizer(tokenizer=self.tokens, stop_words='english')
    self._doc_term_matr = self._tfidf_vectorizer.fit_transform(doc_dict.values())
    # Vocabulary size equals the number of columns in the document-term matrix.
    self.vect_length = self._doc_term_matr.shape[1]
    # Row i of the matrix corresponds to the i-th key (document id) of doc_dict.
    self._id_list = list(doc_dict)
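# A possible companion query method (a sketch, not part of the original class):
# it ranks the stored documents against new text by cosine similarity in the
# same TF-IDF space. The name most_similar and the top_n parameter are assumed.
from sklearn.metrics.pairwise import cosine_similarity

def most_similar(self, text, top_n=5):
    query = self._tfidf_vectorizer.transform([text])
    scores = cosine_similarity(query, self._doc_term_matr).ravel()
    best = scores.argsort()[::-1][:top_n]
    return [(self._id_list[i], float(scores[i])) for i in best]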
def process():
    # Cache the corpus, similarity matrix, and title index at module level so
    # later lookups can reuse them without refitting.
    global movie_df, sig, index
    movie_df = pre_process()
    tfv = vectorizer(min_df=3, max_features=None, strip_accents='unicode',
                     analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1, 3), stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['overview'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    index = pd.Series(movie_df.index, index=movie_df['title']).drop_duplicates()
import random
from difflib import SequenceMatcher as sm  # assumed alias


def recommend(search_word):
    movie_df = pre_process()
    tfv = vectorizer(min_df=3, max_features=None, strip_accents='unicode',
                     analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1, 3), stop_words='english')
    tfv_matrix = tfv.fit_transform(movie_df['overview'])
    sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
    index = pd.Series(movie_df.index, index=movie_df['title']).drop_duplicates()
    try:
        # Fuzzy-match the query against every title and keep the closest one.
        query = search_word.lower()
        max_se, name = 0.0, ''
        for t in movie_df['title']:
            ratio = sm(None, query, t.lower()).ratio()
            if ratio > max_se:
                name, max_se = t, ratio
        idx = index[name]
        if isinstance(idx, pd.Series):
            # Several movies share this title: pick one of them at random.
            idx = idx.iloc[random.randint(0, len(idx) - 1)]
        sig_scores = sorted(enumerate(sig[idx]), key=lambda x: x[1], reverse=True)
        sig_scores = sig_scores[1:15]  # skip the movie itself, keep 14 neighbours
        movie_indices = [i[0] for i in sig_scores]
        return list(movie_df['title'].iloc[movie_indices])
    except KeyError:  # empty corpus or failed lookup
        return None
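# For reference, sm above is assumed to be difflib.SequenceMatcher; its
# ratio() (2*M/T, matched characters over total characters) drives the lookup.
from difflib import SequenceMatcher as sm

print(sm(None, 'avatar', 'Avatar'.lower()).ratio())  # 1.0 once both are lowercased
print(sm(None, 'avatar', 'avengers').ratio())        # ~0.43, partial overlap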
print("Getting Contents From Test Files") path = folder + test_folder + pos_folder onlyfiles = listdir(path) pro_test = [open(path + f, 'r').read() for f in listdir(path)] path = folder + test_folder + neg_folder neg_test = [open(path + f, 'r').read() for f in listdir(path)] def bigram(text_file): text = open(text_file, 'r').read() tokens = nltk.word_tokenize(text) return list(nltk.bigrams(tokens)) vect = vectorizer() #vect.set_params(tokenizer=tokenizer.tokenize) # remove English stop words vect.set_params(stop_words='english') # include 1-grams and 2-grams print("Making vocabulary") vect.set_params(ngram_range=(2, 2)) X = vect.fit_transform(pro_train + neg_train) print("Vocabulary Made") X_Train = (X.toarray()) y = [1] * len(pro_train) + [0] * len(neg_train) print("Making Counts for Test Data") X_test = vect.transform(pro_test + neg_test) bayes = MultinomialNB()
def cluster(self):
    # Build TF-IDF vectors for the cleaned documents, then fit the clustering
    # model (`algorithm` is assumed to be KMeans or a compatible estimator).
    self.vector = vectorizer(use_idf=True)
    self.matrix = self.vector.fit_transform(self.clean)
    self.model = algorithm(n_clusters=self.clusters, n_init=1000)
    self.model.fit(self.matrix)
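# A sketch of how the fitted model might be inspected (assumes a KMeans-style
# labels_ attribute; groups() is a hypothetical helper, not in the original class).
from collections import defaultdict

def groups(self):
    by_label = defaultdict(list)
    for doc, label in zip(self.clean, self.model.labels_):
        by_label[label].append(doc)
    return dict(by_label)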
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import (
    TfidfVectorizer as tfidf, CountVectorizer as vectorizer)  # assumed aliases

# NOTE: this snippet begins mid-expression; the head of a word-level pipeline
# was truncated in the source, leaving only its tail:
#   ... lowercase=True))])), ('clf', LinearSVC())])

# Character 3- to 6-gram TF-IDF features feeding a linear SVM.
chars = Pipeline([
    ('features', FeatureUnion([
        ('char', tfidf(analyzer='char', ngram_range=(3, 6), binary=False,
                       max_df=1.0, min_df=2, norm='l2', sublinear_tf=True,
                       use_idf=True, lowercase=True))])),
    ('clf', LinearSVC())])

# Case-sensitive word uni/bigram counts with logistic regression.
simple = Pipeline([
    ('features', vectorizer(lowercase=False, token_pattern=r'\b\w+\b',
                            ngram_range=(1, 2))),
    ('clf', LogisticRegression())])


def neural():
    # Placeholder: no neural model is implemented.
    raise NotImplementedError


# Uniform-random baseline for comparison (note: shadows the stdlib `random`).
random = Pipeline([
    ('features', tfidf()),
    ('clf', DummyClassifier(strategy='uniform', random_state=42))])
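# Smoke test for the pipelines above (the toy data is hypothetical; a real run
# would use an actual labelled corpus).
train_texts = ['good movie', 'bad movie', 'great plot', 'awful plot']
train_labels = [1, 0, 1, 0]
chars.fit(train_texts, train_labels)
print(chars.predict(['good plot']))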