def convert(x):
    # Map a VADER compound score to a 3-class label:
    # 0 = negative, 1 = neutral, 2 = positive
    if x < -0.05:
        return 0
    elif x < 0.05:
        return 1
    else:
        return 2


# Labeling based on returned values
data_df['label_stemmed'] = data_df['sentiment_stemmed'].apply(lambda x: convert(x['compound']))

# Importing HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split

# Hashing vectorization
X = data_df['tweet_stemmed']
hashing_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
hash_stem = hashing_vectorizer.fit_transform(X)
y = data_df['label_stemmed']
# print("Data vectorized")

# Vectorization time
Vectorizing_time = time.time()
print("Vectorizing_time :", Vectorizing_time - start_time)

# Train and test sets formed
hashing_trainset = hash_stem[:319685, :]
hashing_testset = hash_stem[319685:, :]
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
x_train = hashing_trainset[y_train.index]
x_test = hashing_trainset[y_test.index]
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Import HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Get text data: text_data
text_data = combine_text_columns(X_train)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate the HashingVectorizer: hashing_vec
hashing_vec = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit and transform the Hashing Vectorizer
hashed_text = hashing_vec.fit_transform(text_data)

# Create DataFrame and print the head
hashed_df = pd.DataFrame(hashed_text.data)
print(hashed_df.head())

# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Instantiate the winning model pipeline: pl
pl = Pipeline([
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset "
      "using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', alternate_sign=False,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer


def func():
    hv = HashingVectorizer()
    # np.nan is not a string, so fit_transform raises an error on this input
    hv.fit_transform(['hello world', np.nan, 'hello hello'])
from sklearn.feature_extraction.text import HashingVectorizer

# Use a list (not a set) so document order is deterministic
corpus = ['ass', 'bdfs', 'cer', 'dsssdf']
vectorizer = HashingVectorizer(n_features=14)
print(vectorizer.transform(corpus).todense())
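# Follow-up sketch (not in the original snippet): the hashing vectorizer keeps
# no vocabulary, so transform works on unseen text without refitting, and with
# only 14 columns distinct tokens can collide into the same index.
X_new = vectorizer.transform(['bdfs cer'])
print(X_new.shape)         # (1, 14)
print(X_new.nonzero()[1])  # hashed column indices hit by the two tokens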
print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
def __init__(self):
    self.vectorizer = HashingVectorizer(n_features=2**4)
    self.H = {}
    self.model = None
    self.period = 1000
        return self

    def transform(self, X):
        return X.toarray()


# ------- Feature Builder -----------------
def is_list_or_tuple(obj):
    return isinstance(obj, tuple) or isinstance(obj, list)


# Feature model specifications
# For Chinese
fm_spec = {
    'hashing': HashingVectorizer(tokenizer=tokenize_zh),
    'count': Count(ngram_range=(1, 5), min_df=5, max_df=0.9,
                   max_features=4000, tokenizer=tokenize_zh),
    'tfidf': ['count', Tfidf()],
    'tfidf_dense': ['tfidf', SparseToDense()],
    'lsa_200': ['tfidf', SVD(n_components=200)],
    'lsa_500': ['tfidf', SVD(n_components=500)],
    'lsa_1k': ['tfidf', SVD(n_components=1000)],
    'lsa_500_minmax': ['lsa_500', MinMaxScaler()],
    'lsa_1k_minmax': ['lsa_1k', MinMaxScaler()],
    # smaller vocabulary (removed more stop and infrequent words)
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io import ReadFromText, ReadAllFromText
from dsba6155project.constants import Constants
import os
import re
from collections import defaultdict

import numpy as np
from scipy.sparse import vstack
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(n_features=20000,
                               strip_accents='unicode',
                               stop_words="english",
                               norm=None)


class PerformIncrementalPCA(beam.DoFn):
    def process(self, elem):
        ipca = IncrementalPCA(n_components=n_components, batch_size=10)
        X_ipca = ipca.fit_transform(X)


class ReadBooks(beam.DoFn):
    def process(self, elem):
        return ReadFromTextWithFilename(elem)


class Hashing(beam.DoFn):
    def iterdocs(self):
        """Iterate doc by doc, yield a dict."""
        for root, _dirnames, filenames in os.walk(self.data_path):
            for filename in fnmatch.filter(filenames, '*.sgm'):
                path = os.path.join(root, filename)
                parser = ReutersParser()
                for doc in parser.parse(open(path)):
                    yield doc


###############################################################################
# Main
###############################################################################

# Create the hasher and limit the number of features to a reasonable maximum
# (decode_error replaces the old charset_error parameter name)
hasher = HashingVectorizer(decode_error='ignore', n_features=2**18)

# Create an online classifier i.e. supporting `partial_fit()`
classifier = SGDClassifier()

# Create the data_streamer that parses Reuters SGML files and iterates on
# documents as a stream
data_streamer = ReutersStreamReader('reuters').iterdocs()

# Here we propose to learn a binary classification between the positive class
# and all other documents.
all_classes = np.array([0, 1])

# NB: the 'acq' class was chosen as it is more or less evenly distributed in
# the Reuters files. For other datasets, one should take care of creating a
# test set with a realistic portion of positive instances.
positive_class = 'acq'
def initHashVectorization(n_features=2**16):
    return HashingVectorizer(n_features=n_features)
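# Minimal usage sketch (not part of the original snippet; assumes the
# HashingVectorizer import that this helper relies on). The vectorizer is
# stateless, so transform needs no prior fitting.
vec = initHashVectorization(n_features=2**10)
X = vec.transform(["hashing trick", "no vocabulary is stored"])
print(X.shape)  # (2, 1024)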
def get_word_feature(Train, Test):
    # alternate_sign=False replaces the removed non_negative=True option
    hv = HashingVectorizer(n_features=80000, alternate_sign=False)
    vectorizer = make_pipeline(hv, TfidfTransformer())
    train_feature = vectorizer.fit_transform(Train).toarray()
    test_feature = vectorizer.transform(Test)
    return train_feature, test_feature
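# Hypothetical usage sketch with toy inputs (assumes the HashingVectorizer,
# TfidfTransformer and make_pipeline imports the fragment above relies on).
train_docs = ["cheap flights to paris", "win a free phone"]
test_docs = ["free flights"]
train_feature, test_feature = get_word_feature(train_docs, test_docs)
print(train_feature.shape)  # (2, 80000) dense array
print(test_feature.shape)   # (1, 80000) sparse matrix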
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        # (alternate_sign=False replaces the removed non_negative=True option)
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english',
                                   alternate_sign=False, norm=None,
                                   binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       norm='l2', binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
def wordVec():
    dataset = fetch_20newsgroups(subset='all', categories=categories,
                                 shuffle=True, random_state=42)

    print("%d documents" % len(dataset.data))
    print("%d categories" % len(dataset.target_names))
    print()

    labels = dataset.target
    true_k = np.unique(labels).shape[0]

    print("Extracting features from the training dataset "
          "using a sparse vectorizer")
    t0 = time.time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm=None)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           alternate_sign=False, norm='l2')
    else:
        vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                     min_df=2, stop_words='english',
                                     use_idf=opts.use_idf)
    X = vectorizer.fit_transform(dataset.data)

    print("done in %fs" % (time.time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time.time()
        svd = TruncatedSVD(opts.n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        print("done in %fs" % (time.time() - t0))
        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))
        print()

    if opts.minibatch:
        km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000,
                             verbose=opts.verbose)
    else:
        km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100,
                    n_init=1, verbose=opts.verbose)

    print("Clustering sparse data with %s" % km)
    t0 = time.time()
    km.fit(X)
    print("done in %0.3fs" % (time.time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, km.labels_, sample_size=1000))
    print()

    if not opts.use_hashing:
        print("Top terms per cluster:")
        if opts.n_components:
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
        else:
            order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()
    print(string)


# open data-set and divide it
data = pandas.read_csv(
    'C:/Users/egedi/OneDrive/Belgeler/GitHub/ceng-407-408-2019-2020-Spam-SMS-Detection/SpamSmsDetection/spamSmsDetectionMobileApp/api/spam.csv',
    encoding='latin-1')
learn = data[:4400]  # 4400 items
test = data[4400:]   # 1172 items

perform([
    BernoulliNB(),
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    CalibratedClassifierCV(),
    DummyClassifier(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    KNeighborsClassifier()
], [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()], learn, test)
        if x % 20 == 0:
            print('building classifier number %d' % x)
        ensemble.append(naive_bayes_builder(pos_sample, neg_sample, vectorizer))

    votes = np.zeros(len(test_y))
    for clf in ensemble:
        votes += clf.predict(test_x)
    votes = [1 if x > voting_threshhold else 0 for x in votes]
    print(precision_recall_fscore_support(test_y, votes))


# alternate_sign=False replaces the removed non_negative=True option
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2**18,
                               alternate_sign=False, ngram_range=(1, 1))
test_x, test_y = create_test_set(test_pos, test_neg, vectorizer)
create_ensemble(pos_sample=train_pos,
                neg_sample=train_neg,
                number_of_classifiers=500,
                vectorizer=vectorizer,
                voting_threshhold=250,
                test_x=test_x,
                test_y=test_y)

# clf1 = naive_bayes_builder(train_pos, train_neg, vectorizer)
# y_pred = clf1.predict(test_x)
cursor = None
config = DBConfig(root_dir + "/db.ini").read_db_config()
try:
    # Open database connection
    db = MySQLdb.connect(**config)

    # prepare a cursor object using cursor() method
    cursor = db.cursor()

    pd = sql.read_sql(queryGetClassifiedMembers, db)
    data = pd.replace(np.nan, '', regex=True)
    labels = pd.Type
    trainData = data.drop(['Type'], axis=1)

    # alternate_sign=False replaces the removed non_negative=True option
    email_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Email')),
        ('vectorizer', HashingVectorizer(alternate_sign=False))
    ])
    fname_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('FirstName')),
        ('vectorizer', HashingVectorizer(alternate_sign=False))
    ])
    lname_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Surname')),
        ('vectorizer', HashingVectorizer(alternate_sign=False))
    ])
    bio_pipe = Pipeline([
        ('data', DataFrameColumnExtracter('Bio')),
        ('preprocessor', StripHTMLTransformer()),
    'the', 'a', 'an', 'is', 'it', 'this',
])  # 'i', 'so', 'its', 'am', 'are'])

vectorizer = FeatureUnion([
    ('name', Pipeline([
        ('select', ItemSelector('name', start_time=start_time)),
        ('transform', HashingVectorizer(ngram_range=(1, 2),
                                        n_features=2**27,
                                        norm='l2',
                                        lowercase=False,
                                        stop_words=stopwords)),
        ('drop_cols', DropColumnsByDf(min_df=2))
    ])),
    ('category_name', Pipeline([
        ('select', ItemSelector('category_name', start_time=start_time)),
        ('transform', HashingVectorizer(ngram_range=(1, 1),
                                        token_pattern='.+',
                                        tokenizer=split_cat,
                                        n_features=2**27,
                                        norm='l2',
                                        lowercase=False)),
        ('drop_cols', DropColumnsByDf(min_df=2))
    ])),
print("Training :: %d documents - " % (len(data_train.data)))
print("Testing :: %d documents - " % (len(data_test.data)))
print("%d categories" % len(categories))

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("For training : Extracting features from the training data "
      "using a sparse vectorizer")
t0 = time()
if False:  # opts.use_hashing
    # alternate_sign=False replaces the removed non_negative=True option
    vectorizer = HashingVectorizer(stop_words='english',
                                   alternate_sign=False,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 ngram_range=(1, 3),
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs" % (duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()
print(
    "For testing : Extracting features from the test data using the same vectorizer"
    :return: array of strings, array of ints
        array of strings contains the requested number of records
        array of ints contains the corresponding labels
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y


vect = HashingVectorizer(decode_error='ignore',
                         tokenizer=tokenizer,  # Callable tokenizer function
                         n_features=2**21)     # Reduce hash collisions

clf = SGDClassifier(loss='log', random_state=1, max_iter=1, tol=1e-3)
doc_stream = stream_docs(path='movie_data.csv')

# OUT-OF-CORE LEARNING
classes = np.array([0, 1])

# Allocate 45000 records for the training
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

# Allocate 5000 records for the testing
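# Sketch only (not part of the original fragment): one way the remaining
# records might be scored, assuming the stream still holds unread documents
# and get_minibatch / vect / clf are defined as above.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))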
    for col in df.columns:
        print(df[col].dtypes)

    # Return the recipeIds of the 100 most relevant recipes as JSON
    return jsonify(recomandResult=responseData)


# Lazy loading to shorten the time spent loading the model
if __name__ == '__main__':
    # Ingredients the model can currently recognize
    labels = [
        'chilli', 'egg', 'pork meat', 'potato', 'pa', 'onion', 'carrot',
        'cucumber'
    ]

    # Declare the vectorizer
    vectorize = HashingVectorizer()

    # Database integration
    # Create the DB connection
    engine = create_engine(
        'mysql://*****:*****@localhost:3307/testDB?charset=utf8',
        convert_unicode=True, encoding='UTF-8')
    # engine = create_engine('mysql://*****:*****@sts.c2yt44rkrmcp.us-east-2.rds.amazonaws.com:3306/finalproject?charset=utf8', convert_unicode=True, encoding='UTF-8')
    conn = engine.connect()

    # Read the recipe table
    data = pd.read_sql_table('recipe', conn)
    # Replace missing values with 0
    data = data.fillna(0)
    # Convert int32/int64 columns to float so they can be returned as JSON
    data["id"] = data['id'].astype("float")
reviews_wvs_test = yelp.to_word_level_idx(test_reviews, global_gb, WORDS_PER_TEXT)

# -- testing data save
np.save('Yelp_useful_test_fulltext_glove_300_X.npy', reviews_wvs_test)
np.save('Yelp_useful_test_fulltext_glove_300_y.npy', test_labels)

reviews_wvs_test = yelp.to_word_level_idx(test_reviews, yelp_gb, WORDS_PER_TEXT)

# -- testing data save
np.save('Yelp_useful_test_fulltext_Yelp_glove_300_X.npy', reviews_wvs_test)
np.save('Yelp_useful_test_fulltext_Yelp_glove_300_y.npy', test_labels)

del reviews_wvs_test

log('Hashing BOW features, might be used by some NN models')
hv = HashingVectorizer(n_features=BOW_HASH_DIMENSION)  # Int: maybe try without normalization
train_bow_hash = hv.transform(train_reviews)
test_bow_hash = hv.transform(test_reviews)

np.save('Yelp_useful_train_hashbow.npy', train_bow_hash.todense())
np.save('Yelp_useful_test_hashbow.npy', test_bow_hash.todense())

##################################
### YELP FUNNY
##################################

log('Saving "funny" votes data')
(train_reviews, train_labels, test_reviews, test_labels) = \
    yelp.get_data(YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST)

reviews_wvs_train = yelp.to_word_level_idx(train_reviews, global_gb, WORDS_PER_TEXT)

# -- training data save
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))


def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|p)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
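# Hypothetical usage sketch (not from the original file): hash one raw review
# into a sparse feature row; assumes the stopwords pickle path above exists.
example = ["This movie was <br /> surprisingly good :)"]
X = vect.transform(example)
print(X.shape)  # (1, 2097152) -- one row, 2**21 hashed columns
print(X.nnz)    # number of non-zero hashed features for this document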
def main():
    start_time = time.time()

    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    handle_missing_inplace(merge)
    print('[{}] Finished handling missing values'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Finished cutting'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Finished converting to categorical'.format(time.time() - start_time))

    merge.item_description = merge.item_description.str.lower()
    normalize_desc(merge)
    print('[{}] Finished normalizing'.format(time.time() - start_time))

    # handle_no_description(merge)
    # print('[{}] Finished copying names to missing desc'.format(time.time() - start_time))

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name = cv.fit_transform(merge['name'])
    print('[{}] Finished count vectorizing `name`'.format(time.time() - start_time))

    cv = CountVectorizer()
    X_category = cv.fit_transform(merge['category_name'])
    print('[{}] Finished count vectorizing `category_name`'.format(time.time() - start_time))

    # FF
    from nltk.corpus import stopwords
    interesting_words = [
        'new', 'perfect', 'fit', 'used',  # 'super', 'cute',
        'excellent', 'great', 'retail', '[rm]', 'never used', 'bundle',  # 'diamond', 'ruby', 'platinum', 'gold',
        'set', 'case', 'unused', 'unopened', 'sealed'
    ]
    X_intcol = pd.DataFrame()
    for word in interesting_words:
        X_intcol[word] = merge['item_description'].apply(lambda x: word in x)

    X_des = merge['item_description'].apply(lambda x: len(x)).astype('float32')
    X_des = X_des[:, np.newaxis]
    scaler = MaxAbsScaler()
    X_des = scaler.fit_transform(X_des)

    ignore_words = [
        'cant', 'ask', 'size', 'inch', 'inches', 'already', 'inside', 'easy'
    ]
    stop = stopwords.words('english') + ignore_words  # FF

    # FF version 5: n_features=2**18 --> n_features=2**17
    hv = HashingVectorizer(input='content', stop_words=stop,
                           n_features=2**17, lowercase=False)
    X_description = hv.transform(merge['item_description'])
    print('[{}] Finished hash vectorizing `item_description`'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Finished label binarizing `brand_name`'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Finished getting dummies on `item_condition_id` and `shipping`'.
          format(time.time() - start_time))

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category,
                           X_name, X_intcol, X_des)).tocsr()
    print('[{}] Finished creating sparse merge'.format(time.time() - start_time))

    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]

    # def rmsle(y, y0):
    #     assert len(y) == len(y0)
    #     return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y0), 2)))

    modelR1 = Ridge(solver="sag", fit_intercept=True, random_state=2,
                    alpha=4, tol=0.0006, max_iter=800)
    modelR1.fit(X, y)
    print('[{}] Finished training ridge sag'.format(time.time() - start_time))
    predsR = modelR1.predict(X=X_test)
    print('[{}] Finished predicting ridge sag'.format(time.time() - start_time))

    modelR2 = Ridge(solver="sag", fit_intercept=True, random_state=145, alpha=0.4)
    modelR2.fit(X, y)
    print('[{}] Finished training ridge lsqrt'.format(time.time() - start_time))
    predsR2 = modelR2.predict(X=X_test)
    print('[{}] Finished predicting ridge lsqrt'.format(time.time() - start_time))

    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.01,
                                                          random_state=144)
    d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
    d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
    watchlist = [d_train, d_valid]

    params = {
        'learning_rate': 0.56,
        'application': 'regression',
        'max_depth': 5,
        'num_leaves': 40,
        'verbosity': -1,
        'metric': 'RMSE',
        'feature_fraction': 0.8,  # changed from 0.6
        'nthread': 4
    }
    params2 = {
        'learning_rate': 0.85,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 110,
        'verbosity': -1,
        'metric': 'RMSE',
        'nthread': 4
    }

    modelL1 = lgb.train(params, train_set=d_train, num_boost_round=8000,
                        valid_sets=watchlist, early_stopping_rounds=None,
                        verbose_eval=500)
    predsL = modelL1.predict(X_test)
    print('[{}] Finished predicting lgb 1'.format(time.time() - start_time))

    train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X, y, test_size=0.01,
                                                              random_state=101)
    d_train2 = lgb.Dataset(train_X2, label=train_y2, max_bin=8192)
    d_valid2 = lgb.Dataset(valid_X2, label=valid_y2, max_bin=8192)
    watchlist2 = [d_train2, d_valid2]

    modelL2 = lgb.train(params2, train_set=d_train2, num_boost_round=3200,
                        valid_sets=watchlist2, early_stopping_rounds=None,
                        verbose_eval=500)
    predsL2 = modelL2.predict(X_test)
    print('[{}] Finished predicting lgb 2'.format(time.time() - start_time))

    preds = predsR2 * 0.2 + predsR * 0.1 + predsL * 0.5 + predsL2 * 0.2
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_lgbm_ridge_8.csv", index=False)
def vectorize_hash(mails):
    vec = HashingVectorizer(n_features=2**10)
    data = vec.fit_transform(mails)
    return (vec, data)
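# Hypothetical usage of vectorize_hash on two toy messages (assumes the
# HashingVectorizer import of the enclosing module). The helper returns the
# stateless vectorizer together with a 1024-column sparse matrix.
vec, data = vectorize_hash(["free prize, click now", "meeting at 10am tomorrow"])
print(data.shape)  # (2, 1024) -- n_features=2**10 hashed columns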
# y_train = [['New York'], ['New York'], ['New York'], ['New York'], ['New York'], ['New York'],
#            ['London'], ['London'], ['London'], ['London'], ['London'], ['London'],
#            ['New York', 'London'], ['New York', 'London']]
y_train = mlb.fit_transform(y_train)

# print(y_train)
print("classes", list(mlb.classes_))
print(len(list(mlb.classes_)))
print(len(X_train))
# print("-----Binarize y_train----------")
# print(y_train)

# Pipeline (vectorization, tf-idf weighting and classifier)
# ppl = Pipeline([
#     ('vectorizer', HashingVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', OneVsRestClassifier(LinearSVC()))])

ppl = Pipeline([('vectorizer', HashingVectorizer()),
                ('clf', OneVsRestClassifier(LinearSVC()))])
ppl.fit(X_train, y_train)

# # Test
# X_test = np.array(['nice day in nyc',
#                    'welcome to london',
#                    'hello welcome to new york. enjoy it here and london too'])
# target_names = ['New York', 'London']  # index --> names

y_predict = ppl.predict(X_test)
labels_predicted = mlb.inverse_transform(y_predict)
print(labels_predicted)
# print("\tInverting binary encoding......")
# print("=============y_predicted============")
# print(labels_predicted)
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')


def pre_processing(row):
    first_process = re.sub(combined_pat, '', row)
    second_process = re.sub(www_pat, '', first_process)
    third_process = second_process.lower()
    fourth_process = neg_pattern.sub(lambda x: negations_dic[x.group()],
                                     third_process)
    result = re.sub(r'[^A-Za-z ]', '', fourth_process)
    return result.strip()


df.ws_content = df.ws_content.apply(pre_processing)
print("Preprocessed")
print(df.shape)

vectorizer = HashingVectorizer(stop_words="english", ngram_range=(2, 5),
                               n_features=75000)
text_vector = vectorizer.fit_transform(df.ws_content)
print("text hashed")
print("text vector:", text_vector.shape)

tfifd_vectorizer = TfidfTransformer()
tfifd_vector = tfifd_vectorizer.fit_transform(text_vector)

scaler = MinMaxScaler()
int_values = df[["time_since_col", "stock_price_col"]]
def __init__(self, MAX_LEN, vec='hash', aug=True): self.arrGood = [ 'absolutely', 'abundant', 'accept', 'acclaimed', 'accomplishment', 'achievement', 'action', 'active', 'activist', 'acumen', 'adjust', 'admire', 'adopt', 'adorable', 'adored', 'adventure', 'affirmation', 'affirmative', 'affluent', 'agree', 'airy', 'alive', 'alliance', 'ally', 'alter', 'amaze', 'amity', 'animated', 'answer', 'appreciation', 'approve', 'aptitude', 'artistic', 'assertive', 'astonish', 'astounding', 'astute', 'attractive', 'authentic', 'basic', 'beaming', 'beautiful', 'believe', 'benefactor', 'benefit', 'bighearted', 'blessed', 'bliss', 'bloom', 'bountiful', 'bounty', 'brave', 'bright', 'brilliant', 'bubbly', 'bunch', 'burgeon', 'calm', 'care', 'celebrate', 'certain', 'change', 'character', 'charitable', 'charming', 'cheer', 'cherish', 'clarity', 'classy', 'clean', 'clever', 'closeness', 'commend', 'companionship', 'complete', 'comradeship', 'confident', 'connect', 'connected', 'constant', 'content', 'conviction', 'copious', 'core', 'coupled', 'courageous', 'creative', 'cuddle', 'cultivate', 'cure', 'curious', 'cute', 'dazzling', 'delight', 'direct', 'discover', 'distinguished', 'divine', 'donate', 'eager', 'earnest', 'easy', 'ecstasy', 'effervescent', 'efficient', 'effortless', 'electrifying', 'elegance', 'embrace', 'encompassing', 'encourage', 'endorse', 'energized', 'energy', 'enjoy', 'enormous', 'enthuse', 'enthusiastic', 'entirely', 'essence', 'established', 'esteem', 'everyday', 'everyone', 'excited', 'exciting', 'exhilarating', 'expand', 'explore', 'express', 'exquisite', 'exultant', 'faith', 'familiar', 'family', 'famous', 'feat', 'fit', 'flourish', 'fortunate', 'fortune', 'freedom', 'fresh', 'friendship', 'full', 'funny', 'gather', 'generous', 'genius', 'genuine', 'give', 'glad', 'glow', 'good', 'gorgeous', 'grace', 'graceful', 'gratitude', 'green', 'grin', 'group', 'grow', 'handsome', 'happy', 'harmony', 'healed', 'healing', 'healthful', 'healthy', 'heart', 'hearty', 'heavenly', 'helpful', 'here', 'hold', 'holy', 'honest', 'honor', 'hug', 'idea', 'ideal', 'imaginative', 'increase', 'incredible', 'independent', 'ingenious', 'innate', 'innovate', 'inspire', 'instantaneous', 'instinct', 'intellectual', 'intelligence', 'intuitive', 'inventive', 'joined', 'jovial', 'joy', 'jubilation', 'keen', 'key', 'kind', 'kiss', 'knowledge', 'laugh', 'leader', 'learn', 'legendary', 'light', 'lively', 'love', 'loveliness', 'lucidity', 'lucrative', 'luminous', 'maintain', 'marvelous', 'master', 'meaningful', 'meditate', 'mend', 'metamorphosis', 'mind-blowing', 'miracle', 'mission', 'modify', 'motivate', 'moving', 'natural', 'nature', 'nourish', 'nourished', 'novel', 'now', 'nurture', 'nutritious', 'one', 'open', 'openhanded', 'optimistic', 'paradise', 'party', 'peace', 'perfect', 'phenomenon', 'pleasure', 'plenteous', 'plentiful', 'plenty', 'plethora', 'poise', 'polish', 'popular', 'positive', 'powerful', 'prepared', 'pretty', 'principle', 'productive', 'project', 'prominent', 'prosperous', 'protect', 'proud', 'purpose', 'quest', 'quick', 'quiet', 'ready', 'recognize', 'refinement', 'refresh', 'rejoice', 'rejuvenate', 'relax', 'reliance', 'rely', 'remarkable', 'renew', 'renowned', 'replenish', 'resolution', 'resound', 'resources', 'respect', 'restore', 'revere', 'revolutionize', 'rewarding', 'rich', 'robust', 'rousing', 'safe', 'secure', 'see', 'sensation', 'serenity', 'shift', 'shine', 'show', 'silence', 'simple', 'sincerity', 'smart', 'smile', 'smooth', 'solution', 'soul', 'sparkling', 'spirit', 'spirited', 
'spiritual', 'splendid', 'spontaneous', 'still', 'stir', 'strong', 'style', 'success', 'sunny', 'support', 'sure', 'surprise', 'sustain', 'synchronized', 'team', 'thankful', 'therapeutic', 'thorough', 'thrilled', 'thrive', 'today', 'together', 'tranquil', 'transform', 'triumph', 'trust', 'truth', 'unity', 'unusual', 'unwavering', 'upbeat', 'value', 'vary', 'venerate', 'venture', 'very', 'vibrant', 'victory', 'vigorous', 'vision', 'visualize', 'vital', 'vivacious', 'voyage', 'wealthy', 'welcome', 'well', 'whole', 'wholesome', 'willing', 'wonder', 'wonderful', 'wondrous', 'xanadu', 'yes', 'yippee', 'young', 'youth', 'youthful', 'zeal', 'zest', 'zing', 'zip' ] self.arrBad = [ 'acrotomophilia', 'anal', 'anilingus', 'anus', 'arsehole', 'ass', 'asshole', 'assmunch', 'autoerotic', 'babeland', 'bangbros', 'bareback', 'barenaked', 'bastardo', 'bastinado', 'bbw', 'bdsm', 'bestiality', 'bimbos', 'birdlock', 'bitch', 'blumpkin', 'bollocks', 'bondage', 'boner', 'boob', 'boobs', 'bukkake', 'bulldyke', 'bunghole', 'busty', 'butt', 'buttcheeks', 'butthole', 'camgirl', 'camslut', 'camwhore', 'carpetmuncher', 'circlejerk', 'c**t', 'clitoris', 'clusterfuck', 'c**k', 'cocks', 'coprolagnia', 'coprophilia', 'cornhole', 'cum', 'cumming', 'cunnilingus', 'c**t', 'darkie', 'daterape', 'deepthroat', 'dick', 'd***o', 'doggiestyle', 'doggystyle', 'dolcett', 'domination', 'dominatrix', 'dommes', 'ecchi', 'e*********n', 'erotic', 'erotism', 'escort', 'eunuch', 'f****t', 'fecal', 'felch', 'f******o', 'feltch', 'femdom', 'figging', 'fingering', 'fisting', 'footjob', 'frotting', 'f**k', 'f*****g', 'f*********r', 'futanari', 'gay', 'genitals', 'goatcx', 'g****e', 'gokkun', 'goodpoop', 'goregasm', 'grope', 'guro', 'handjob', 'hardcore', 'hentai', 'homoerotic', 'honkey', 'hooker', 'kill', 'murder', 'fat', 'humping', 'incest', 'intercourse', 'jack', 'jerk', 'jigaboo', 'jiggaboo', 'jiggerboo', 'j**z', 'juggs', 'kike', 'kinbaku', 'kinkster', 'kinky', 'knobbing', 'lolita', 'lovemaking', 'm********e', 'm**********r', 'muffdiving', 'nambla', 'nawashi', 'negro', 'neonazi', 'n***a', 'nigger', 'nimphomania', 'nipple', 'nipples', 'nude', 'nudity', 'nympho', 'nymphomania', 'octopussy', 'omorashi', 'o****m', 'orgy', 'paedophile', 'panties', 'panty', 'pedobear', 'pedophile', 'pegging', 'penis', 'pissing', 'pisspig', 'playboy', 'ponyplay', 'poof', 'poopchute', 'p**n', 'porno', 'pornography', 'pthc', 'pubes', 'pussy', 'queaf', 'raghead', 'rape', 'raping', 'rapist', 'rectum', 'cowgirl', 'rimjob', 'rimming', 'sadism', 'scat', 'schlong', 'scissoring', 's***n', 'sex', 'sexo', 'sexy', 'beaver', 'pussy', 'shemale', 'shibari', 'shit', 'shota', 'shrimping', 'slanteye', 's**t', 'smut', 'snatch', 'snowballing', 'sodomize', 'sodomy', 'spic', 'spooge', 'strapon', 'strappado', 'strip', 'suck', 'sucks', 'suicide', 'sultry', 'swastika', 'swinger', 'threesome', 'throating', 'tit', 't**s', 'titties', 'titty', 'topless', 'tosser', 'towelhead', 'tranny', 'tribadism', 'tubgirl', 'tushy', 'twat', 'twink', 'twinkie', 'undressing', 'upskirt', 'urophilia', 'v****a', 'vibrator', 'vorarephilia', 'voyeur', 'vulva', 'wank', 'wetback', 'xx', 'xxx', 'yaoi', 'yiffy', ] self.CHARS_TO_REMOVE = '!¡"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—' self.MAX_LEN = MAX_LEN self.STOP_WORDS = list(stopwords.words('english')) self.num_partitions = psutil.cpu_count() * 8 self.num_cores = psutil.cpu_count() # self.stemmer = PorterStemmer() self.aug = aug if vec == 'hash': self.vectorizer = HashingVectorizer(n_features=self.MAX_LEN, analyzer='word', lowercase=False) 
else: self.vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.3, strip_accents='unicode', use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=self.MAX_LEN, lowercase=False)
X = np.array(["numpy", "scipy", "sklearn"])
vectorizer = TfidfVectorizer(dtype=vectorizer_dtype)

warning_msg_match = "'dtype' should be used."
warning_cls = UserWarning
expected_warning_cls = warning_cls if warning_expected else None
with pytest.warns(expected_warning_cls, match=warning_msg_match) as record:
    X_idf = vectorizer.fit_transform(X)
if expected_warning_cls is None:
    relevant_warnings = [w for w in record
                         if isinstance(w, warning_cls)]
    assert len(relevant_warnings) == 0
assert X_idf.dtype == output_dtype


@pytest.mark.parametrize("vec", [
    HashingVectorizer(ngram_range=(2, 1)),
    CountVectorizer(ngram_range=(2, 1)),
    TfidfVectorizer(ngram_range=(2, 1))
])
def test_vectorizers_invalid_ngram_range(vec):
    # vectorizers could be initialized with invalid ngram range
    # test for raising error message
    invalid_range = vec.ngram_range
    message = ("Invalid value for ngram_range=%s "
               "lower boundary larger than the upper boundary."
               % str(invalid_range))
    if isinstance(vec, HashingVectorizer) and IS_PYPY:
        pytest.xfail(reason='HashingVectorizer is not supported on PyPy')

    assert_raise_message(ValueError, message, vec.fit, ["good news everyone"])
    assert_raise_message(ValueError, message, vec.fit_transform,
def __init__(self, m):
    self.fh = FeatureHasher(n_features=m, input_type='string')
    self.hv = HashingVectorizer(n_features=m)
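# Standalone sketch (hypothetical, not from the original class) contrasting the
# two hashers configured above: FeatureHasher expects pre-tokenized input,
# while HashingVectorizer works on raw document strings.
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer

m = 2**10
fh = FeatureHasher(n_features=m, input_type='string')
hv = HashingVectorizer(n_features=m)

X_fh = fh.transform([['hello', 'world'], ['hello']])  # lists of tokens
X_hv = hv.transform(['hello world', 'hello'])         # raw documents
print(X_fh.shape, X_hv.shape)  # both (2, 1024)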