예제 #1
0
def build_model():
    """Fit the NLP model on the phrase data, pickle it, and plot its ROC.

    Reads ``lib/data/train.tsv`` (tab-separated) and keeps only the rows
    whose ``Sentiment`` is 0 or 4.  A ``Binary`` label is derived from them
    (0 stays 0, 4 becomes 1).  The ``Phrase`` column is used to fit the
    model's vectorizer and is then transformed into the feature matrix.
    After a train/test split the model is trained on the training part,
    its classifier and vectorizer are pickled, and the held-out part is
    handed to ``plot_roc``.

    A progress message is printed after each stage.  Nothing is returned.
    """
    model = NLPModel()

    with open('lib/data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    # Explicit copy: a column is added below, and writing into a filtered
    # slice of ``data`` triggers pandas' SettingWithCopyWarning.
    keep = (data['Sentiment'] == 0) | (data['Sentiment'] == 4)
    pos_neg = data[keep].copy()
    # Only 0 and 4 remain after the filter, so "not 0" is exactly the 4s.
    pos_neg['Binary'] = (pos_neg['Sentiment'] != 0).astype(int)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')
    y = pos_neg.loc[:, 'Binary']

    # NOTE: no random_state is given, so the split differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()

    # Evaluate on the held-out features together with their own labels.
    model.plot_roc(X_test, y_test)
예제 #2
0
def build_model():
    """Fit the NLP model on the phrase data, pickle it, and plot its ROC.

    Reads ``lib/data/train.tsv`` (tab-separated) and keeps only the rows
    whose ``Sentiment`` is 0 or 4.  A ``Binary`` label is derived from them
    (0 stays 0, 4 becomes 1).  The ``Phrase`` column is used to fit the
    model's vectorizer and is then transformed into the feature matrix.
    After a train/test split the model is trained on the training part,
    its classifier and vectorizer are pickled, and the held-out part is
    handed to ``plot_roc`` with ``size_x=12, size_y=12``.

    A progress message is printed after each stage.  Nothing is returned.
    """
    model = NLPModel()

    with open('lib/data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    # Explicit copy: a column is added below, and writing into a filtered
    # slice of ``data`` triggers pandas' SettingWithCopyWarning.
    keep = (data['Sentiment'] == 0) | (data['Sentiment'] == 4)
    pos_neg = data[keep].copy()
    # Only 0 and 4 remain after the filter, so "not 0" is exactly the 4s.
    pos_neg['Binary'] = (pos_neg['Sentiment'] != 0).astype(int)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')
    y = pos_neg.loc[:, 'Binary']

    # NOTE: no random_state is given, so the split differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()

    model.plot_roc(X_test, y_test, size_x=12, size_y=12)
예제 #3
0
def build_model():
    """Build and pickle two models: a sentiment classifier and a diamond
    price regressor.

    Sentiment classifier: reads ``lib/data/train.tsv`` (tab-separated),
    keeps the rows whose ``Sentiment`` is 0 or 4, derives a ``Binary``
    label (0 stays 0, 4 becomes 1), fits and applies the model's vectorizer
    on the ``Phrase`` column, trains on the training part of a train/test
    split, and pickles the classifier and the vectorizer.

    Diamond regressor: loads seaborn's ``diamonds`` dataset, takes the
    training part of a split made with ``random_state=0``, replaces the
    ``cut`` names with the ranks 1-5, trains ``DiamondPredictor`` on
    ``carat`` and ``cut`` to predict ``price``, and pickles the model.

    Progress messages are printed along the way.  Nothing is returned.
    """
    # --- sentiment classifier and vectorizer ---
    model = NLPModel()
    train_data_path = 'lib/data/train.tsv'
    with open(train_data_path) as f:
        data = pd.read_csv(f, sep='\t')

    # Explicit copy: a column is added below, and writing into a filtered
    # slice of ``data`` triggers pandas' SettingWithCopyWarning.
    keep = (data['Sentiment'] == 0) | (data['Sentiment'] == 4)
    pos_neg = data[keep].copy()
    # Only 0 and 4 remain after the filter, so "not 0" is exactly the 4s.
    pos_neg['Binary'] = (pos_neg['Sentiment'] != 0).astype(int)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')

    y = pos_neg.loc[:, 'Binary']
    # Only the training part is used here; the held-out part is discarded.
    # NOTE: no random_state is given, so this split differs between runs.
    X_train, _, y_train, _ = train_test_split(X, y)
    model.train(X_train, y_train)

    model.pickle_clf()
    model.pickle_vectorizer()
    print('Sentiment Classifier Built')

    # --- diamond price predictor ---
    model_two = DiamondPredictor()
    df = sns.load_dataset('diamonds')
    # random_state=0 keeps the training rows the same on every run; the
    # held-out rows are not needed for building the model.
    train, _ = train_test_split(df.copy(), random_state=0)
    cut_ranks = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
    # assign() returns a new frame, so the split result is never mutated.
    train = train.assign(cut=train['cut'].map(cut_ranks))
    features = ['carat', 'cut']
    target = 'price'
    model_two.train(train[features], train[target])
    model_two.pickle_model()
    print('Diamond Regressor Built')
예제 #4
0
def build_model():
    """Train the NLP model on the labelled document texts and pickle it.

    Reads ``extract_combined.csv`` and ``labels.csv`` and joins them on
    ``document_name``.  Every character of ``text`` that is not an ASCII
    letter is replaced with a space, then ``applyLemmatizer`` is applied to
    each text.  ``is_fitara`` is encoded with ``LabelEncoder``.  The model's
    vectorizer is fitted and applied on the cleaned text, the model is
    trained on the training part of a train/test split, and its classifier
    and vectorizer are pickled.

    The data files must be unzipped before this runs; both paths are
    relative, so they resolve against the current working directory.
    Nothing is returned.
    """
    model = NLPModel()

    df_extract_combined = pd.read_csv('extract_combined.csv')
    df_labels = pd.read_csv('labels.csv')

    df_final = pd.merge(df_extract_combined, df_labels, on='document_name')
    # Explicit copy: the columns are rewritten below, and writing into a
    # column subset of ``df_final`` is unreliable (SettingWithCopyWarning,
    # and a silent no-op for chained assignment under Copy-on-Write).
    df_text_data = df_final[['text', 'is_fitara']].copy()

    # Whole-column assignment instead of per-cell chained indexing.
    non_letters = re.compile('[^a-zA-Z]')
    df_text_data['text'] = [
        non_letters.sub(' ', text) for text in df_text_data['text']
    ]
    df_text_data['text'] = df_text_data['text'].apply(applyLemmatizer)

    le = LabelEncoder()
    df_text_data['is_fitara'] = le.fit_transform(df_text_data['is_fitara'])

    texts = df_text_data.loc[:, 'text']
    model.vectorizer_fit(texts)

    X = model.vectorizer_transform(texts)
    y = df_text_data.loc[:, 'is_fitara']

    # Only the training part is used here; the held-out part is discarded.
    # NOTE: no random_state is given, so the split differs between runs.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)

    model.pickle_clf()
    model.pickle_vectorizer()
예제 #5
0
def build_model():
    """Train the NLP model on the labelled document texts and pickle it.

    Reads ``extract_combined.csv`` and ``labels.csv`` and merges them on
    their common columns.  The ``is_fitara`` labels are mapped from
    ``'Yes'``/``'No'`` to 1/0.  The model's vectorizer is fitted and applied
    on the merged ``text`` column, the model is trained on the training
    part of a train/test split, and its classifier and vectorizer are
    pickled.

    A progress message is printed after each stage.  Nothing is returned.

    Raises:
        KeyError: if ``is_fitara`` holds a value other than ``'Yes'`` or
            ``'No'``.
    """
    model = NLPModel()

    data = pd.read_csv('extract_combined.csv')
    # NOTE(review): ``error_bad_lines`` was deprecated in pandas 1.3 and
    # removed in 2.0; ``on_bad_lines='skip'`` is its replacement.  Switch
    # once the minimum supported pandas version is confirmed.
    data2 = pd.read_csv('labels.csv', error_bad_lines=False)
    merged = pd.merge(data, data2)
    yn = {'Yes': 1, 'No': 0}

    # Strict lookup on purpose: an unexpected label raises KeyError rather
    # than silently becoming NaN.
    merged['is_fitara'] = [yn[label] for label in merged['is_fitara']]

    # Features and labels must come from the same frame.  The merge can
    # drop, repeat, or reorder rows relative to ``data``, so taking the
    # text from ``data`` would misalign X and y.
    texts = merged.loc[:, 'text']
    model.vectorizer_fit(texts)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(texts)
    print('Vectorizer transform complete')
    y = merged.loc[:, 'is_fitara']

    # Only the training part is used here; the held-out part is discarded.
    # NOTE: no random_state is given, so the split differs between runs.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()
예제 #6
0
def build_model():
    """Fit the NLP model on the phrase data and pickle it.

    Reads ``./data/train.tsv`` (tab-separated), prints its column names,
    and keeps only the rows whose ``Sentiment`` is 0 or 4.  A ``Binary``
    label is derived from them (0 stays 0, 4 becomes 1).  The ``Phrase``
    column is used to fit the model's vectorizer and is then transformed
    into the feature matrix.  The model is trained on the training part of
    a train/test split, and its classifier and vectorizer are pickled.

    A progress message is printed after each stage.  Nothing is returned.
    """
    model = NLPModel()
    with open('./data/train.tsv') as f:
        data = pd.read_csv(f, sep='\t')

    print(data.columns)

    # Explicit copy: a column is added below, and writing into a filtered
    # slice of ``data`` triggers pandas' SettingWithCopyWarning.
    keep = (data['Sentiment'] == 0) | (data['Sentiment'] == 4)
    pos_neg = data[keep].copy()

    pos_neg['Binary'] = np.where(pos_neg['Sentiment'] == 0, 0, 1)

    phrases = pos_neg.loc[:, 'Phrase']
    model.vectorizer_fit(phrases)
    print('Vectorizer fit complete')

    X = model.vectorizer_transform(phrases)
    print('Vectorizer transform complete')
    y = pos_neg.loc[:, 'Binary']

    # Only the training part is used here; the held-out part is discarded.
    # NOTE: no random_state is given, so the split differs between runs.
    X_train, _, y_train, _ = train_test_split(X, y)

    model.train(X_train, y_train)
    print('Model training complete')

    model.pickle_clf()
    model.pickle_vectorizer()