def LR():
    """Train a logistic-regression classifier on bag-of-words features from
    'train.csv', predict labels for 'test.csv', and write the predictions
    to 'test_lr.csv'.

    NOTE(review): a later `def LR()` in this file redefines (shadows) this
    function at import time -- confirm which definition is intended to win.
    """
    # Read training data.
    train_tweet_id2text, train_tweet_id2issue, train_tweet_id2author_label, \
        train_tweet_id2label = ReadFile('train.csv')

    # Build the vocabulary from training text and combine per-tweet feature
    # vectors (bag-of-words + author + issue; exact layout is defined by
    # combine_vec elsewhere in the project).
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label,
                            train_tweet_id2issue, bow=True)

    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))

    n_class = len(set(labels))
    print('dataset shape:', np.shape(data))
    n_sample, n_feature = np.shape(data)

    model = LogRegression(n_feature, n_class, lrate=0.8, verbose=True)
    model.fit(data, labels, max_iter=500)

    # Read test data.
    test_tweet_id2text, test_tweet_id2issue, test_tweet_id2author_label, \
        test_tweet_id2label = ReadFile('test.csv')

    # Vectorize test tweets with the *training* vocabulary and predict a
    # label for each tweet id.
    test_data_dict = combine_vec(word2index, test_tweet_id2text,
                                 test_tweet_id2author_label,
                                 test_tweet_id2issue, bow=True)
    for tweet_id in test_tweet_id2text:
        test_x = test_data_dict[tweet_id]
        test_tweet_id2label[tweet_id] = model.predict(test_x)

    # Save predicted labels in 'test_lr.csv'.
    SaveFile(test_tweet_id2text, test_tweet_id2issue,
             test_tweet_id2author_label, test_tweet_id2label, 'test_lr.csv')
def NN():
    """Learning-rate sweep: trains one model per rate in `lrates`, collects
    the per-iteration training loss, and plots the curves to
    'train_loss_nn.pdf'.

    NOTE(review): despite the name, this trains LogRegression models, not
    NeuralNet -- possibly a copy-paste slip from the LR sweep; confirm.
    It is also shadowed by a later `def NN()` in this file, so this version
    is unreachable by name unless that is removed or renamed.
    """
    # Read training data
    train_tweet_id2text, train_tweet_id2issue, train_tweet_id2author_label, train_tweet_id2label = ReadFile(
        'train.csv')
    # Vocabulary + dense (non-bag-of-words) feature vectors per tweet id.
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label, train_tweet_id2issue,
                            bow=False)
    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))
    data = np.array(data)
    labels = np.array(labels)
    n_class = len(set(labels))
    n_sample, n_feature = np.shape(data)
    print(np.shape(data))
    # One model per learning rate; fit() presumably returns the loss history
    # (a sequence) since its length is printed -- verify against the model class.
    lrates = [0.2, 0.4, 0.9]
    all_loss = []
    for r in lrates:
        model = LogRegression(n_feature, n_class, lrate=r, verbose=True)
        train_loss = model.fit(data, labels, max_iter=500)
        print(len(train_loss))
        all_loss.append(train_loss)
    file_name = 'train_loss_nn.pdf'
    plot_lr(lrates, all_loss, file_name)
def NN():
    """Train a NeuralNet classifier on features built from 'train.csv',
    predict a label for every tweet in 'test.csv', and save the results
    to 'test_nn.csv'.
    """
    # Load the training split.
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    # Vocabulary plus per-tweet dense feature vectors keyed by tweet id.
    word2index = get_vocab(train_tweet_id2text)
    feature_by_id = combine_vec(word2index, train_tweet_id2text,
                                train_tweet_id2author_label,
                                train_tweet_id2issue, bow=False)

    data = np.array([feature_by_id[tid] for tid in feature_by_id])
    labels = np.array([int(train_tweet_id2label[tid]) for tid in feature_by_id])

    n_class = len(set(labels))
    n_sample, n_feature = np.shape(data)
    print(np.shape(data))

    model = NeuralNet(n_feature, n_class, lrate=0.9, verbose=True)
    model.fit(data, labels, max_iter=800)

    # Load the test split.
    (test_tweet_id2text, test_tweet_id2issue,
     test_tweet_id2author_label, test_tweet_id2label) = ReadFile('test.csv')

    # Vectorize test tweets with the training vocabulary, then predict.
    test_features = combine_vec(word2index, test_tweet_id2text,
                                test_tweet_id2author_label,
                                test_tweet_id2issue, bow=False)
    for tid in test_tweet_id2text:
        test_tweet_id2label[tid] = model.predict(test_features[tid])

    # Persist predicted labels to 'test_nn.csv'.
    SaveFile(test_tweet_id2text, test_tweet_id2issue,
             test_tweet_id2author_label, test_tweet_id2label, 'test_nn.csv')
def cv_NN(kfold):
    """Run k-fold cross-validation for the NeuralNet classifier on
    'train.csv', printing per-fold accuracy and the mean accuracy.

    Args:
        kfold: number of contiguous folds to split the training data into.
    """
    # Read training data.
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    # Vocabulary + dense feature vectors per tweet id.
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label, train_tweet_id2issue,
                            bow=False)
    data = np.array([data_dict[tid] for tid in data_dict])
    labels = np.array([int(train_tweet_id2label[tid]) for tid in data_dict])

    n_class = len(set(labels))
    # Computed once (the original recomputed this line twice).
    n_sample, n_feature = np.shape(data)

    print('Cross validation for Neural network')
    fold_size = int(np.ceil(n_sample / kfold))
    print('Fold size:', fold_size)

    accuracy = []
    for k in range(kfold):
        tstart = k * fold_size
        tend = min(n_sample, tstart + fold_size)
        # Contiguous slice [tstart, tend) is the held-out fold; everything
        # else trains.  Array slicing replaces the original O(n) filtering
        # comprehensions with identical fold membership.
        test_x = data[tstart:tend]
        test_y = labels[tstart:tend]
        training_x = np.concatenate((data[:tstart], data[tend:]))
        training_y = np.concatenate((labels[:tstart], labels[tend:]))

        model = NeuralNet(n_feature, n_class, lrate=0.9, verbose=False)
        model.fit(training_x, training_y, max_iter=500)
        accuracy.append(model.score(test_x, test_y))
        print('Fold', k, 'accuracy', accuracy[-1])
    print('Mean accuracy', np.mean(accuracy))
def LR():
    """Learning-rate sweep for logistic regression: trains one model per
    rate in `lrates` on bag-of-words features from 'train.csv', collects the
    training-loss history, and plots the curves to 'train_loss_lr.pdf'.

    NOTE(review): this redefinition shadows the earlier `def LR()` (the
    train/predict/save version) -- confirm which one should keep the name.
    """
    # Read training data.
    (train_tweet_id2text, train_tweet_id2issue,
     train_tweet_id2author_label, train_tweet_id2label) = ReadFile('train.csv')

    # Vocabulary + bag-of-words feature vectors per tweet id.
    word2index = get_vocab(train_tweet_id2text)
    data_dict = combine_vec(word2index, train_tweet_id2text,
                            train_tweet_id2author_label, train_tweet_id2issue,
                            bow=True)

    data = []
    labels = []
    for k in data_dict:
        data.append(data_dict[k])
        labels.append(int(train_tweet_id2label[k]))

    n_class = len(set(labels))
    print(np.shape(data))
    n_sample, n_feature = np.shape(data)

    # One model per learning rate; fit() returns the per-iteration training
    # loss, which is collected for plotting.
    lrates = [0.2, 0.5, 0.8]
    all_loss = []
    for r in lrates:
        model = LogRegression(n_feature, n_class, lrate=r, verbose=True)
        train_loss = model.fit(data, labels, max_iter=200)
        print(len(train_loss))
        all_loss.append(train_loss)

    file_name = 'train_loss_lr.pdf'
    plot_lr(lrates, all_loss, file_name)