def build_model():
    """Train an ``NLPModel`` on the FITARA documents and pickle the result.

    Reads ``extract_combined.csv`` and ``labels.csv`` (relative paths, so they
    are resolved against the current working directory; unzip the data files
    there before calling), joins them on ``document_name``, cleans the
    ``text`` column, label-encodes ``is_fitara``, fits the model's vectorizer
    and classifier, and finally pickles both via the model's own helpers.

    Returns:
        None. The outputs are whatever ``model.pickle_clf()`` and
        ``model.pickle_vectorizer()`` write.
    """
    model = NLPModel()

    df_extract_combined = pd.read_csv('extract_combined.csv')
    df_labels = pd.read_csv('labels.csv')
    df_final = pd.merge(df_extract_combined, df_labels, on='document_name')

    # Take an explicit copy of the two columns we need. Writing into a bare
    # column subset (and especially element-wise via df['text'][i] = ...) is
    # chained assignment: it raises SettingWithCopyWarning and is not
    # guaranteed to update the frame at all under pandas copy-on-write.
    df_text_data = df_final[['text', 'is_fitara']].copy()

    # Replace every character that is not an ASCII letter with a space.
    df_text_data['text'] = df_text_data['text'].map(
        lambda text: re.sub('[^a-zA-Z]', ' ', text)
    )
    df_text_data['text'] = df_text_data['text'].apply(applyLemmatizer)
    # Stop-word removal is currently disabled:
    # df_text_data['text'] = df_text_data['text'].apply(stopwords)

    le = LabelEncoder()
    df_text_data['is_fitara'] = le.fit_transform(df_text_data['is_fitara'])

    model.vectorizer_fit(df_text_data.loc[:, 'text'])
    X = model.vectorizer_transform(df_text_data.loc[:, 'text'])
    y = df_text_data.loc[:, 'is_fitara']

    # Only the training partition is used here; the held-out part is dropped.
    X_train, _, y_train, _ = train_test_split(X, y)
    model.train(X_train, y_train)

    model.pickle_clf()
    model.pickle_vectorizer()
def build_model():
    """Train an ``NLPModel`` on the sentiment phrases and pickle the result.

    Reads the tab-separated file ``../sentiment_data/train.tsv`` (relative to
    the current working directory), keeps only the rows whose ``Sentiment`` is
    0 or 4, derives a ``Binary`` label (0 for ``Sentiment == 0``, 1 otherwise),
    fits the model's vectorizer and classifier on the ``Phrase`` column,
    pickles both, and plots a ROC curve on the held-out split.

    Returns:
        None. Progress messages are printed to stdout.
    """
    model = NLPModel()

    with open('../sentiment_data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    # .copy() makes the filtered frame independent of `data`, so adding the
    # 'Binary' column below does not trigger SettingWithCopyWarning.
    pos_neg = data[(data['Sentiment'] == 0) | (data['Sentiment'] == 4)].copy()

    # Vectorised equivalent of the row-wise "0 if Sentiment == 0 else 1".
    pos_neg['Binary'] = (pos_neg['Sentiment'] != 0).astype('int64')

    model.vectorizer_fit(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(pos_neg.loc[:, 'Phrase'])
    print('Vectorizer transform complete')
    y = pos_neg.loc[:, 'Binary']

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()

    model.plot_roc(X_test, y_test)
def build_model():
    """Train an ``NLPModel`` on the labelled FITARA documents and pickle it.

    Reads ``extract_combined.csv`` and ``labels.csv`` from the current working
    directory, merges them on their shared column(s) (``pd.merge`` defaults),
    converts the ``is_fitara`` labels from ``'Yes'``/``'No'`` to 1/0, fits the
    model's vectorizer and classifier on the ``text`` column, and pickles both.

    Returns:
        None. Progress messages are printed to stdout.

    Raises:
        KeyError: if ``is_fitara`` contains a value other than ``'Yes'`` or
            ``'No'``.
    """
    model = NLPModel()

    data = pd.read_csv('extract_combined.csv')
    # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed
    # in 2.0; switch to on_bad_lines='skip' once the minimum supported pandas
    # version allows it.
    data2 = pd.read_csv('labels.csv', error_bad_lines=False)
    merged = pd.merge(data, data2)

    yn = {'Yes': 1, 'No': 0}
    # Plain dict lookup on purpose: an unexpected label raises instead of
    # silently becoming NaN.
    merged['is_fitara'] = [yn[label] for label in merged['is_fitara']]

    # Features and labels must come from the same frame. Using the unmerged
    # `data` for X gives a different row count/order than `y` whenever the
    # merge drops or duplicates documents.
    texts = merged.loc[:, 'text']

    model.vectorizer_fit(texts)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(texts)
    print('Vectorizer transform complete')
    y = merged.loc[:, 'is_fitara']

    # Only the training partition is used here; the held-out part is dropped.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()
def build_model():
    """Train an ``NLPModel`` on the sentiment phrases and pickle the result.

    Reads the tab-separated file ``./data/train.tsv`` (relative to the current
    working directory), keeps only the rows whose ``Sentiment`` is 0 or 4,
    derives a ``Binary`` label (0 for ``Sentiment == 0``, 1 otherwise), fits
    the model's vectorizer and classifier on the ``Phrase`` column, and
    pickles both.

    Returns:
        None. The column index and progress messages are printed to stdout.
    """
    model = NLPModel()

    with open('./data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')
    print(data.columns)

    # .copy() makes the filtered frame independent of `data`, so adding the
    # 'Binary' column below does not trigger SettingWithCopyWarning.
    keep = (data['Sentiment'] == 0) | (data['Sentiment'] == 4)
    pos_neg = data[keep].copy()
    pos_neg['Binary'] = np.where(pos_neg['Sentiment'] == 0, 0, 1)

    phrases = pos_neg.loc[:, 'Phrase']

    model.vectorizer_fit(phrases)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')
    y = pos_neg.loc[:, 'Binary']

    # Only the training partition is used here; the held-out part is dropped.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()