def main():
    preImages, preLabels = preproc()
    images = preImages[:dataSize]
    labels = preLabels[:dataSize]
    labels = initLabels(labels)
    images = np.array(images)
    images = np.asmatrix(images)
    W = np.random.rand(numOfLabels, pictureSize)
    W = np.asmatrix(W)
    print("Gradient Test: ")
    GradientTest(images, labels, W)  # must be called at the end
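# A minimal sketch of what a gradient test like GradientTest typically does
# (an assumption: the project's real implementation may differ). It checks an
# analytic gradient against the first-order Taylor expansion
#   f(W + eps*D) - f(W) ~= eps * <grad f(W), D>
# for a random direction D; the Taylor residual should shrink like eps**2.
def gradient_test_sketch(f, grad_f, W, num_steps=8):
    D = np.random.rand(*W.shape)
    D /= np.linalg.norm(D)
    g = np.sum(np.multiply(grad_f(W), D))  # directional derivative <grad, D>
    f0 = f(W)
    for i in range(num_steps):
        eps = 0.5 ** i
        linear_err = abs(f(W + eps * D) - f0)
        taylor_err = abs(f(W + eps * D) - f0 - eps * g)
        print(eps, linear_err, taylor_err)  # taylor_err should decay ~ eps**2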
def find_data(usr_txt):
    _from = None
    _to = None
    parsed = preproc(usr_txt)
    locs = [find_loc([], usr_txt, parsed_sent) for parsed_sent in parsed]
    for sent_locs in locs:
        if len(sent_locs) > 0:
            for loc in sent_locs:
                loc_rel = loc_relation(loc, parsed[0])
                if loc_rel == 'from':
                    _from = loc[1]
                elif loc_rel == 'to':
                    _to = loc[1]
    return _from, _to, find_date(usr_txt)
def prepare_elmo_query(query, batcher, sentence_character_ids, elmo_sentence_input):
    """
    Gets the vector of a query.
    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    :return: vector of the query
    """
    query = preproc(query)
    q = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = crop_vec(
            get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                             elmo_sentence_input)[0],
            q[0])
    return vector
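# A hedged usage sketch (assumption: the ELMo model objects `batcher`,
# `sentence_character_ids` and `elmo_sentence_input` are built elsewhere in
# the project; they are not loaded here). It embeds two queries and compares
# them by cosine similarity with plain numpy.
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# v1 = prepare_elmo_query("first query", batcher, sentence_character_ids, elmo_sentence_input)
# v2 = prepare_elmo_query("second query", batcher, sentence_character_ids, elmo_sentence_input)
# print(cosine(v1, v2))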
def __init__(self, arg):
    # deserialise the saved fingerprints of the corpus documents
    fingerprints = {}
    with open('fingerprints.pkl', 'rb') as f:
        fingerprints = pickle.load(f)
    # open the query file and perform preprocessing
    name = arg
    with open(name, "r") as f:
        sr = f.read()
    final_doc = preproc(sr)
    # set the parameters for Rabin-Karp hashing
    k = 5
    base = 10
    # set the window size for the winnowing algorithm
    window = 10
    # split the cleaned document into k-grams and find their hash values
    sr = ''.join(final_doc)
    hash_val = get_hash(sr, k, base)
    # select only a few of the hash values using the winnowing technique
    fingerprint = winnow(hash_val, window)
    # count the fingerprints that match between the query document and each
    # corpus document
    no_of_matches = {}
    s1 = set(row[0] for row in fingerprint)
    with open('compare1.txt', 'wt') as out:
        for doc, v in fingerprints.items():  # renamed from `k` to avoid shadowing the k-gram size
            s2 = set(row[0] for row in v)
            no_of_matches[doc] = len(s1.intersection(s2))
        sorted_matches = sorted(no_of_matches.items(),
                                key=operator.itemgetter(1), reverse=True)
        # write out the five best-matching documents
        for no, (doc, matches) in enumerate(sorted_matches):
            if no < 5:
                out.write(doc + " : " + str(matches) + "\n")
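# A minimal sketch of the winnowing selection step (assumption: the project's
# winnow() may differ in tie-breaking and in its row layout). For every window
# of `window` consecutive k-gram hashes, keep the rightmost minimum as a
# fingerprint, recorded as a (hash, position) pair; row[0] above is the hash.
def winnow_sketch(hashes, window):
    fingerprints = []
    for start in range(len(hashes) - window + 1):
        block = hashes[start:start + window]
        # rightmost minimum: scan the window from the end
        offset = len(block) - 1 - block[::-1].index(min(block))
        pick = (block[offset], start + offset)
        if not fingerprints or fingerprints[-1] != pick:
            fingerprints.append(pick)
    return fingerprints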
def main():
    testImgs, testLabels = loadTest()
    preImages, preLabels = preproc()
    images = preImages[:dataSize]
    labels = preLabels[:dataSize]
    labels = initLabels(labels)
    images = np.array(images)
    images = np.asmatrix(images)
    print("preproc done")
    W = np.random.rand(numOfLabels, pictureSize)
    # W = np.zeros((numOfLabels, pictureSize))
    W = np.asmatrix(W)
    E = calcE(W, images, labels)
    print("RANDOM CLASSIFICATION RESULTS:")
    classifyImages(W, testImgs, testLabels)
    print("----------------------------")
    print("first E = ", E)
    for t in range(int(TOTALDATASIZE / dataSize)):
        print("learning block ", t)
        images = preImages[t * dataSize:(t + 1) * dataSize]
        labels = preLabels[t * dataSize:(t + 1) * dataSize]
        labels = initLabels(labels)
        images = np.array(images)
        images = np.asmatrix(images)
        for k in range(numOfIterations):
            s = np.random.randint(dataSize, size=(numOfBatches, batchSize))
            for j in range(numOfBatches):
                gradient = calcMBgradient(W, s[j], images, labels)
                alpha = 0.5  # can be changed to 1 / (k + 1)
                W = updateWeights(W, alpha, gradient)
            if k % printModulo == 0:
                E = calcE(W, images, labels)
                print("k= ", k, "new E = ", E)
        E = calcE(W, images, labels)
        print("Last E = ", E)
    print("Done")
    print("TRAINING DATA TEST:")
    classifyImages(W, preImages[:dataSize], preLabels[:dataSize])
    print("TESTING DATA TEST:")
    classifyImages(W, testImgs, testLabels)
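# A hedged sketch of the SGD step that updateWeights presumably performs
# (an assumption: the real helper may clip or regularise). With the decaying
# step size mentioned above, alpha = 1 / (k + 1), this is plain stochastic
# gradient descent on the weight matrix.
def update_weights_sketch(W, alpha, gradient):
    return W - alpha * gradient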
""" [optional - Efficiency] Compare the traning time and test time of the milestone 1, 2, and 3 methods (plus task 4 - if your team did it). Use the average (or mode) runtime over 10 re-runs and perform a suitable statistical test to assess whether one of those performs significantly better than the others w.r.t. efficieny of training and test time. """ #%% 1. Data reading and preprocessing import pandas as pd from preprocessing import preproc, setUsedData # Read data # Remember to set path labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv") trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon = preproc( labeledData) usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData( 'batch', trainX, trainY, testX, testY) del (trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon) #%% 6. Try Different CLassifiers # Set Parameters maxIter = 1000 tolerance = 1e-3 # Import models import sklearn.linear_model as lm from sklearn.svm import SVC svm = SVC(kernel='rbf') from sklearn.svm import LinearSVC
def main(arg):  # the query file name is passed in as arg
    # deserialise the saved document vectors
    doc_vectors = {}
    with open('doc_vectors.pkl', 'rb') as f:
        doc_vectors = pickle.load(f)
    # deserialise the saved terms and the corresponding inverted index
    all_terms = {}
    with open('all_terms.pkl', 'rb') as f:
        all_terms = pickle.load(f)
    # deserialise the saved inverse document frequencies
    term_df = []
    with open('term_df.pkl', 'rb') as f:
        term_df = pickle.load(f)
    # deserialise the saved documents mapped to their IDs
    with open('doc_id.pkl', 'rb') as f:
        doc_id = pickle.load(f)
    # open the query file and perform preprocessing
    name = arg
    with open(name, "r") as f:
        sr = f.read()
    final_doc = preproc(sr)
    # get all the terms of the query document with their frequencies
    mp = Counter(final_doc)
    # initialise the document vector for the query document with all zeroes
    q_vector = [0] * len(all_terms)
    idx = 0
    for i in sorted(all_terms.keys()):
        if i in final_doc:
            # fill the vector with the frequencies of the query's terms
            q_vector[idx] += mp[i]
        idx += 1
    q_vector = np.log(np.add(1, q_vector))      # lg(1 + tf)
    q_vector = np.multiply(q_vector, term_df)   # lg(1 + tf) * lg(N / df)
    q_vector /= np.linalg.norm(q_vector)        # normalise the document vector
    # dot product with the document vectors of the corpus documents
    dot_scores = {}
    for i in doc_vectors:
        dot_scores[i] = np.dot(q_vector, doc_vectors[i])
    # sort the docid : dot_product dictionary by the dot-product values
    sorted_dot_scores = sorted(dot_scores.items(),
                               key=operator.itemgetter(1), reverse=True)
    # print the most-similar-document ranking
    for docid, score in sorted_dot_scores:
        for k, v in doc_id.items():
            if v == docid:
                name = k
        name = name.split(".")[0]
        print(name)
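# A tiny worked example of the weighting above (values are illustrative).
# With tf = [2, 0, 1] and idf = [1.0, 2.0, 0.5]:
#   log(1 + tf)       -> [log 3, 0, log 2]
#   log(1 + tf) * idf -> [1.0 * log 3, 0, 0.5 * log 2]
# followed by L2 normalisation, exactly as computed for q_vector above.
import numpy as np
tf = np.array([2.0, 0.0, 1.0])
idf = np.array([1.0, 2.0, 0.5])
w = np.log(1 + tf) * idf
w /= np.linalg.norm(w)
print(w)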
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from util import gini_xgb
from preprocessing import preproc

####################### Data Preprocessing #####################
# Importing the training dataset
dataset_train = pd.read_csv('train.csv')
# Importing the test dataset
dataset_test = pd.read_csv('test.csv')
# preprocessing both sets
X_train, y_train = preproc(dataset_train, mode="train", oneHot=False)
X_test, y_test = preproc(dataset_test, mode="test", oneHot=False)

####################### Training #####################
K = 5  # number of folds
kf = KFold(n_splits=K, random_state=42, shuffle=True)

# KFold cross-validation
results = []
i = 0
for train_index, test_index in kf.split(X_train):
    train_X, valid_X = X_train[train_index], X_train[test_index]
    train_y, valid_y = y_train[train_index], y_train[test_index]
    weights = np.zeros(len(y_train))
    weights[y_train == 0] = 1
    weights[y_train == 1] = 1
    print(weights, np.mean(weights))
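    # A hedged sketch of how each fold could continue (assumptions: the
    # hyperparameter values are placeholders, and gini_xgb from util follows
    # the xgboost custom-eval signature feval(preds, dtrain)).
    import xgboost as xgb

    fold_params = {'objective': 'binary:logistic', 'eta': 0.05, 'max_depth': 5}
    dtrain = xgb.DMatrix(train_X, label=train_y)
    dvalid = xgb.DMatrix(valid_X, label=valid_y)
    model = xgb.train(fold_params, dtrain, num_boost_round=500,
                      evals=[(dvalid, 'valid')], feval=gini_xgb,
                      maximize=True, early_stopping_rounds=50)
    results.append(model.best_score)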
def query_preprocessing(query, model):
    query_preprocessed = preproc(query)
    return query_preprocessed
# -*- coding: utf-8 -*-
"""
[optional - Neural Network] Train and run a Neural Network. Evaluate the
predictions using 10-fold cross-validation and a suitable error measure.
"""
import pandas as pd
from preprocessing import preproc, setUsedData

dataset = pd.read_csv("../data/kaggle_forest_cover_train.csv")
trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon = preproc(
    dataset)
trainBatchSize = 1000
testBatchSize = 300
usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData(
    'batch', trainX, trainY, testX, testY, trainBatchSize, testBatchSize)

# 2. neural network
# http://scikit-learn.org/stable/modules/neural_networks_supervised.html
from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.model_selection import ShuffleSplit, cross_val_score

left = 2
right = 20
i = 1
acc = np.zeros((right - left + 1), dtype=int)
# for node in range(left, right, 20):
#     for layer in range(left, right):
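# A hedged sketch of the sweep the commented-out loops gesture at
# (assumption: the hidden-layer sizes and max_iter are placeholders). It
# scores an MLPClassifier with 10-fold cross-validation, as the docstring asks.
for layer in range(left, right):
    mlp = MLPClassifier(hidden_layer_sizes=(layer,), max_iter=500)
    scores = cross_val_score(mlp, usedTrainX, usedTrainY, cv=10)
    print(layer, scores.mean())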
import pandas as pd
from keras.models import Sequential
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from util import cross_entropy, gini_normalized
from parameters import parameters, batch_size, epochs, layers, activation_functions, loss, alpha
from preprocessing import preproc

# Part 1 - Data Preprocessing
# Importing the train dataset
dataset_train = pd.read_csv('train.csv')
# Importing the test dataset
dataset_test = pd.read_csv('test.csv')
# preprocessing the train dataset
X_train, y_train, scaler = preproc(dataset_train, 'train', oneHot=True, scale=True)
# preprocessing the test dataset
X_test, y_test = preproc(dataset_test, 'test', oneHot=True, scale=True, scaler=scaler)

# Part 2 - Now let's make the ANN!
# Implement KFold cross-validation
class_weight = {0: 1., 1: alpha}
K = 5
kf = KFold(n_splits=K, random_state=42, shuffle=True)

# training with KFold cross-validation
i = 0
results = []
for train_index, test_index in kf.split(X_train):
    # Initialising the ANN
    classifier = Sequential()
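    # A hedged sketch of how the fold body could continue (assumption: the
    # actual layer sizes come from the imported `layers` and
    # `activation_functions`; the units and activations here are placeholders).
    from keras.layers import Dense
    classifier.add(Dense(units=32, activation='relu', input_dim=X_train.shape[1]))
    classifier.add(Dense(units=1, activation='sigmoid'))
    classifier.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    classifier.fit(X_train[train_index], y_train[train_index],
                   batch_size=batch_size, epochs=epochs,
                   class_weight=class_weight)
    preds = classifier.predict(X_train[test_index])
    results.append(gini_normalized(y_train[test_index], preds))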
    'mnist',
    'nist',
    'pendigits',
    'satimage',
    'usps',
]

# %% Main Program
if __name__ == '__main__':
    # iterate over each dataset
    for db in datasets:
        dataset = loader.db_load(db)
        dataset = preprocessing.preproc(dataset)
        # use every algorithm for testing
        for alg in algorithms:
            algorithm = algs.get_algorithm(alg, dataset)
            transformed_db = algorithm.transform(dataset)
            histories = []
            # train the neural network for the configured number of simulations
            for sim_num in range(simulation_num):
                algorithm = loader.alg_load(algorithm, dataset)
                history = compile_fit(algorithm, transformed_db)
                if not sim_num % 10:
                    algorithm.save_weights(db, sim_num)
                    plot_history(history, alg, db, sim_num)
                histories.append(history.history)
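            # A hedged sketch of aggregating the collected histories
            # (assumption: each Keras-style history dict carries a
            # 'val_accuracy' list; the key name may differ per setup).
            import numpy as np
            mean_val_acc = np.mean([h['val_accuracy'][-1] for h in histories])
            print(db, alg, 'mean final val_accuracy:', mean_val_acc)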
training["transaction_date"] = training.transaction_date.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0) training["registration_init_time"] = training.registration_init_time.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x) training["registration_init_time"] = training.registration_init_time.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0) testing["membership_expire_date"] = testing.membership_expire_date.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x) testing["membership_expire_date"] = testing.membership_expire_date.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0) testing["transaction_date"] = testing.transaction_date.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x) testing["transaction_date"] = testing.transaction_date.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0) testing["registration_init_time"] = testing.registration_init_time.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x) testing["registration_init_time"] = testing.registration_init_time.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0) print("preprocessing") X_train, y_train = preproc(training, mode='train', oneHot=False) X_test, y_test = preproc(testing, mode="test", oneHot=False) # parameters params = { 'task': 'train', 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': {'binary_logloss'}, 'num_leaves': 31, 'learning_rate': 0.05, 'feature_fraction': 0.9, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'verbose': 0 }
# exporting facebook comments with https://socialfy.pw/facebook-export-comments
# place the csv file in the original inputs
# eclipse test
import time
import preprocessing as pre
#import getbands as gb

startTime = time.time()
pre.preproc()
pre.getbands()
endTime = time.time()
print("Finished in " + str('{:.3f}'.format(endTime - startTime)) + " seconds.")
def flight(usr_txt, _from, _to, _date, tries):
    def find_data(usr_txt):
        _from = None
        _to = None
        parsed = preproc(usr_txt)
        locs = [find_loc([], usr_txt, parsed_sent) for parsed_sent in parsed]
        for sent_locs in locs:
            if len(sent_locs) > 0:
                for loc in sent_locs:
                    loc_rel = loc_relation(loc, parsed[0])
                    if loc_rel == 'from':
                        _from = loc[1]
                    elif loc_rel == 'to':
                        _to = loc[1]
        return _from, _to, find_date(usr_txt)

    if not _from and not _to and not _date:
        if tries > 3:
            return "Sorry, I couldn't understand."
        else:
            _from, _to, _date = find_data(usr_txt)
    if not _from:
        respond("Where are you flying from?")
        usr_txt = raw_input(usr)
        parsed = preproc(usr_txt)
        if len(parsed[0]) <= 3 and isloc(usr_txt, 0, parsed[0],
                                         parsed[0][0][0], parsed[0][0][1]):
            _from = usr_txt
        else:
            if not _date:
                _from, _to, _date = find_data(usr_txt)
            else:
                _from, _to, _ = find_data(usr_txt)
        return flight(usr_txt, _from, _to, _date, tries + 1)
    elif not _to:
        respond("Where are you flying to?")
        usr_txt = raw_input(usr)
        parsed = preproc(usr_txt)
        # made consistent with the _from branch: check the first parsed
        # sentence, not the number of sentences
        if len(parsed[0]) <= 3 and isloc(usr_txt, 0, parsed[0],
                                         parsed[0][0][0], parsed[0][0][1]):
            _to = usr_txt
        else:
            if not _date:
                _from, _to, _date = find_data(usr_txt)
            else:
                _from, _to, _ = find_data(usr_txt)
        return flight(usr_txt, _from, _to, _date, tries + 1)
    elif _from.lower() == _to.lower():
        respond("The departure and destination cities can't be the same.")
        usr_txt = raw_input(usr)
        if not _date:
            _from, _to, _date = find_data(usr_txt)
        else:
            _from, _to, _ = find_data(usr_txt)
        return flight(usr_txt, _from, _to, _date, tries + 1)
    elif not _date:
        respond("When do you want to go?")
        usr_txt = raw_input(usr)
        _date = find_date(usr_txt)
        return flight(usr_txt, _from, _to, _date, tries + 1)
    elif not later_date(_date):
        respond("Sorry, the date has to be in the future.")
        usr_txt = raw_input(usr)
        _date = find_date(usr_txt)
        return flight(usr_txt, _from, _to, _date, tries + 1)
    return "OK, looking for a %s-%s flight on %s." % (_from, _to, _date)
# NOTE: the def line and token loop below are reconstructed from the call
# sites (find_loc(locs, raw, parsed)); the original fragment begins mid-function.
def find_loc(locs, raw, parsed):
    for ind, (lemma, pos) in enumerate(parsed):
        if isloc(raw, ind, parsed, lemma, pos):
            name = NER_recursion(ind, parsed, lemma, pos)
            if locs != []:
                locs.append((locs[-1][0] + len(locs[-1][1].split()) + ind, name))
            else:
                locs.append((ind, name))
            return find_loc(locs, raw, parsed[ind + len(name.split()):])
    return locs

def loc_relation(loc, parsed):
    prev_w = ''
    next_w = ''
    if loc[0] > 0:  # was `>= 0`, which would wrap to parsed[-1] for index 0
        prev_w = parsed[loc[0] - 1][0]
    if loc[0] + 1 != len(parsed):
        next_w = parsed[loc[0] + 1][0]
    if prev_w in ['from', 'leave', 'leaving'] or next_w == '-':
        return 'from'
    elif prev_w in ['to', 'into', 'towards', '-']:
        return 'to'

if __name__ == '__main__':
    # Test your stuff.
    s1 = 'I am flying from Lviv to New York'
    parsed = preproc(s1)
    locs = [find_loc([], s1, parsed_sent) for parsed_sent in parsed]
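    # A hedged continuation of the test (assumption: for the sentence above,
    # loc_relation should label Lviv as 'from' and New York as 'to').
    for sent_locs in locs:
        for loc in sent_locs:
            print(loc[1], '->', loc_relation(loc, parsed[0]))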