def predict():
    """Get the current sound data and return the predicted class
    plus the per-class probabilities."""
    # `sound`, `model`, and `preprocessor` are defined elsewhere in the app.
    y = sound.get_data()
    y1 = [0]
    _yy = [0, 0, 0, 0, 0, 0]
    if y:
        x = preprocessor(list(y))
        y1 = model.predict(x).tolist()[0]          # predicted class
        _yy = model.predict_proba(x).tolist()[0]   # class probabilities
    y2, y3, y4, y5, y6, y7 = _yy
    return jsonify(points=[y1, y2, y3, y4, y5, y6, y7])
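# The handler above returns jsonify(...), so it is presumably a Flask view.
# Below is a minimal sketch of how it might be wired to a route; the route
# path, app object, and debug flag are assumptions, not from the original.
from flask import Flask, jsonify

app = Flask(__name__)

# Hypothetical wiring: expose predict() at /predict.
app.add_url_rule('/predict', 'predict', predict)

if __name__ == '__main__':
    app.run(debug=True)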
def main():
    cfg = Config()

    data_dir = '/kaggle/input/lish-moa'
    save_path = './'
    load_path = '../input/model-resnet-tensorflow'
    runty = 'eval'
    assert runty == 'traineval' or runty == 'eval', \
        "Run type is wrong. Should be 'traineval' or 'eval'"

    train_features = pd.read_csv(os.path.join(data_dir, 'train_features.csv'))
    train_targets_scored = pd.read_csv(
        os.path.join(data_dir, 'train_targets_scored.csv'))
    train_targets_nonscored = pd.read_csv(
        os.path.join(data_dir, 'train_targets_nonscored.csv'))
    test_features = pd.read_csv(os.path.join(data_dir, 'test_features.csv'))
    sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))

    train_targets_scored = train_targets_scored.drop(['sig_id'], axis=1)
    train_targets_nonscored = train_targets_nonscored.drop(['sig_id'], axis=1)

    # Drop control-vehicle rows from the training set; they carry no MoA labels.
    non_ctl_idx = train_features.loc[
        train_features['cp_type'] != 'ctl_vehicle'].index.to_list()
    train_features = train_features.drop(
        ['sig_id', 'cp_type', 'cp_time', 'cp_dose'], axis=1)
    train_features = train_features.iloc[non_ctl_idx]
    train_targets_scored = train_targets_scored.iloc[non_ctl_idx]
    train_targets_nonscored = train_targets_nonscored.iloc[non_ctl_idx]
    test_features = test_features.drop(['sig_id', 'cp_dose', 'cp_time'], axis=1)

    # Boolean masks for gene-expression ('g-') and cell-viability ('c-') columns.
    gs = train_features.columns.str.startswith('g-')
    cs = train_features.columns.str.startswith('c-')

    # Read the main predictors.
    with open('../input/src-resnet-tensorflow/main_predictors.json') as f:
        tmp = json.load(f)
        preds = tmp['start_predictors']

    oof = tf.constant(0.0)
    predictions = np.zeros(
        (test_features.shape[0], train_targets_scored.shape[1]))

    for seed in cfg.seeds:
        mskf = MultilabelStratifiedKFold(n_splits=cfg.nfolds, shuffle=True,
                                         random_state=seed)
        for f, (t_idx, v_idx) in enumerate(
                mskf.split(X=train_features, y=train_targets_scored)):
            x_train, x_valid = preprocessor(train_features.iloc[t_idx].values,
                                            train_features.iloc[v_idx].values,
                                            gs, cs)
            _, data_test = preprocessor(
                train_features.iloc[t_idx].values,
                test_features.drop('cp_type', axis=1).values, gs, cs)
            x_train_2, x_valid_2 = \
                preprocessor_2(train_features.iloc[t_idx][preds].values,
                               train_features.iloc[v_idx][preds].values)
            _, data_test_2 = preprocessor_2(
                train_features.iloc[t_idx][preds].values,
                test_features[preds].values)

            y_train_sc = train_targets_scored.iloc[t_idx].values
            y_train_ns = train_targets_nonscored.iloc[t_idx].values
            y_valid_sc = train_targets_scored.iloc[v_idx].values
            y_valid_ns = train_targets_nonscored.iloc[v_idx].values

            n_features = x_train.shape[1]
            n_features_2 = x_train_2.shape[1]

            trte = train_test(x_train=x_train, x_valid=x_valid,
                              data_test=data_test, x_train_2=x_train_2,
                              x_valid_2=x_valid_2, data_test_2=data_test_2,
                              y_train_sc=y_train_sc, y_train_ns=y_train_ns,
                              y_valid_sc=y_valid_sc, y_valid_ns=y_valid_ns,
                              save_path=save_path, load_path=load_path,
                              fold=f, runty=runty)

            y_val, predictions_ = trte.run_k_fold(seed)
            # Average the out-of-fold loss and the test predictions over
            # every seed/fold combination.
            oof += logloss(tf.constant(y_valid_sc, dtype=tf.float32),
                           tf.constant(y_val, dtype=tf.float32)) / (
                cfg.nfolds * len(cfg.seeds))
            predictions += predictions_ / (cfg.nfolds * len(cfg.seeds))

    print("CV log_loss: ", oof)

    target_cols = train_targets_scored.columns
    sub.iloc[:, 1:] = predictions
    # Control-vehicle samples have no MoA by definition; zero them out.
    sub.loc[test_features['cp_type'] == 'ctl_vehicle', sub.columns[1:]] = 0

    # clip the submission
    # sub_c = sub_clip(sub, test_features)
    # sub_c.to_csv('submission.csv', index=False)

    sub.to_csv('submission_resnet.csv', index=False)
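# The CV loop above accumulates logloss(...) over tf tensors. A minimal
# sketch of such a helper, assuming it mirrors the competition metric
# (mean column-wise binary cross-entropy); the 1e-15 clipping bounds are
# an assumption, not taken from the original source.
import tensorflow as tf

def logloss(y_true, y_pred):
    # Clip predictions away from 0 and 1 for numerical stability, then
    # average binary cross-entropy over all samples and labels.
    p = tf.clip_by_value(y_pred, 1e-15, 1.0 - 1e-15)
    return -tf.reduce_mean(y_true * tf.math.log(p)
                           + (1.0 - y_true) * tf.math.log(1.0 - p))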
# This file creates the 'pipe' NLP model and saves it as model.joblib

# Import libraries
import pandas as pd
import joblib
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from utils import preprocessor

tfidf = TfidfVectorizer()
classifier = LinearSVC()

if __name__ == "__main__":
    # You may need to change the following to your location of sentiments.csv.
    df = pd.read_csv('DATA/sentiments.csv')

    # Chain text cleaning, TF-IDF vectorization, and a linear SVM.
    pipe = make_pipeline(preprocessor(), tfidf, classifier)
    pipe.fit(df['text'], df['sentiment'])

    # Persist the fitted pipeline, then reload it to verify the round trip.
    joblib.dump(pipe, 'model.joblib')
    newpipe = joblib.load('model.joblib')
    print('sentiment of "awesome place" is',
          newpipe.predict(pd.Series(['awesome place']))[0])
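# make_pipeline(preprocessor(), ...) implies utils.preprocessor is a factory
# returning a scikit-learn transformer that runs before TfidfVectorizer.
# A minimal sketch of what it could look like; the cleaning rules (strip
# HTML tags, lowercase) are assumptions, not taken from utils.
import re
from sklearn.preprocessing import FunctionTransformer

def _clean(texts):
    # Strip HTML tags and lowercase each document.
    return [re.sub(r'<[^>]*>', '', t).lower() for t in texts]

def preprocessor():
    return FunctionTransformer(_clean)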
import pandas as pd
import utils
from nltk.corpus import stopwords

df = pd.read_csv('./movie_data.csv')

# Show the last 50 characters of the first review, raw and preprocessed.
print(df.loc[0, 'review'][-50:])
print(utils.preprocessor(df.loc[0, 'review'][-50:]))

# Apply the preprocessor to every review in the DataFrame.
df['review'] = df['review'].apply(utils.preprocessor)

stop = stopwords.words('english')
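# `stop` is built above but not yet used; a small sketch of the usual next
# step, filtering stop words out of a tokenized review. The whitespace
# tokenizer below is a hypothetical helper, not from the original.
def tokenizer(text):
    return text.split()

tokens = [w for w in tokenizer(df.loc[0, 'review']) if w not in stop]
print(tokens[:10])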
sql = """ select `id`, `published`, `title`, `description` from zero_day19 where attackType = 'dos' and platform = 'windows' and published BETWEEN '2012-01-01' AND '{}-{}-{}' order by `id`; """.format(str(year), month[id], day[id]) cur.execute(sql) results = cur.fetchall() for row in results: postid = [row[0] for row in results] postdatetime = [row[1] for row in results] threadtitle = [row[2] for row in results] postcontent = [row[3] for row in results] store = [] for i in range(len(postid)): # post = [postid[i], str(postdatetime[i]), utils.preprocessor(str(threadtitle[i] + postcontent[i]))] post = [postid[i], str(postdatetime[i]), utils.preprocessor(str(threadtitle[i]))] store.append(post) # string = "'{0}', '{1}', '{2}'".format(str(post[0]), str(post[1]), str(post[2])) # allopenscedges.write("%s\n" % string) pickle.dump(store,preprocessedposts) preprocessedposts.close() # output pre-processed text into a notepad file (one record per line, comma seperated)