Example #1
def main():

    preImages, preLabels = preproc()
    images = preImages[:dataSize]
    labels = preLabels[:dataSize]
    labels = initLabels(labels)
    images = np.array(images)
    images = np.asmatrix(images)
    W = np.random.rand(numOfLabels, pictureSize)
    W = np.asmatrix(W)

    print("Gradient Test: ")
    GradientTest(images, labels, W)  # must be called at the end
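GradientTest is project-specific, but gradient tests of this kind normally compare the analytic gradient against central finite differences at a few random entries of W. A minimal generic sketch, assuming hypothetical loss(W, images, labels) and grad(W, images, labels) callables (these names are not from this project):

import numpy as np

def gradient_check(loss, grad, W, images, labels, eps=1e-5, num_checks=10):
    """Compare the analytic gradient to central finite differences at random entries."""
    G = grad(W, images, labels)
    rng = np.random.default_rng(0)
    for _ in range(num_checks):
        i, j = rng.integers(W.shape[0]), rng.integers(W.shape[1])
        old = W[i, j]
        W[i, j] = old + eps
        loss_plus = loss(W, images, labels)
        W[i, j] = old - eps
        loss_minus = loss(W, images, labels)
        W[i, j] = old  # restore the entry
        numeric = (loss_plus - loss_minus) / (2 * eps)
        rel_err = abs(numeric - G[i, j]) / max(abs(numeric) + abs(G[i, j]), 1e-12)
        print("analytic %.6f  numeric %.6f  rel. error %.2e" % (G[i, j], numeric, rel_err))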
Example #2
def find_data(usr_txt):
    _from = None
    _to = None
    parsed = preproc(usr_txt)
    locs = [find_loc([], usr_txt, parsed_sent) for parsed_sent in parsed]
    for sent_locs in locs:
        if len(sent_locs) > 0:
            for loc in sent_locs:
                loc_rel = loc_relation(loc, parsed[0])
                if loc_rel == 'from':
                    _from = loc[1]
                elif loc_rel == 'to':
                    _to = loc[1]
    return _from, _to, find_date(usr_txt)
Example #3
def prepare_elmo_query(query, batcher, sentence_character_ids,
                       elmo_sentence_input):
    """ 
    Gets vector of query

    :param query: str
    :param batcher, sentence_character_ids, elmo_sentence_input: ELMo model
    
    :return: vector of query
    """
    query = preproc(query)
    q = [tokenize(query)]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        vector = crop_vec(
            get_elmo_vectors(sess, q, batcher, sentence_character_ids,
                             elmo_sentence_input)[0], q[0])
    return vector
Example #4
    def __init__(self, arg):

        # deserialise the saved fingerprints of the corpus documents
        fingerprints = {}
        with open('fingerprints.pkl', 'rb') as f:
            fingerprints = pickle.load(f)

        # open the query file and perform preprocessing
        name = arg
        with open(name, "r") as f:
            sr = f.read()
        final_doc = preproc(sr)

        # set the parameters for Rabin-Karp hashing
        k = 5
        base = 10

        # set the window for the winnowing algorithm
        window = 10

        # split the cleaned document into k-grams and find their hash values
        sr = ''.join(final_doc)
        hash_val = get_hash(sr, k, base)

        # select only a few of the hash values using the winnowing technique
        fingerprint = winnow(hash_val, window)

        # count the fingerprints that match between the query document and each
        # corpus document
        no_of_matches = {}
        s1 = set(row[0] for row in fingerprint)
        for doc, fp in fingerprints.items():
            s2 = set(row[0] for row in fp)
            no_of_matches[doc] = len(s1.intersection(s2))

        sorted_matches = sorted(no_of_matches.items(),
                                key=operator.itemgetter(1), reverse=True)

        # write the five best-matching documents to compare1.txt
        with open('compare1.txt', 'wt') as out:
            for doc, matches in sorted_matches[:5]:
                out.write(doc + " : " + str(matches) + "\n")
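get_hash and winnow are project helpers; the underlying idea (hash every k-gram, then keep only the minimum hash in each sliding window) is standard document winnowing. A minimal self-contained sketch of both steps, using a plain polynomial hash rather than the project's exact implementation:

def kgram_hashes(text, k, base):
    """Hash every k-gram of text with a simple polynomial hash."""
    hashes = []
    for i in range(len(text) - k + 1):
        h = 0
        for ch in text[i:i + k]:
            h = h * base + ord(ch)
        hashes.append(h)
    return hashes

def winnow_min(hashes, window):
    """Keep one (hash, position) fingerprint per window: the minimum hash."""
    fingerprint = []
    for i in range(len(hashes) - window + 1):
        block = hashes[i:i + window]
        j = block.index(min(block))
        if not fingerprint or fingerprint[-1] != (block[j], i + j):
            fingerprint.append((block[j], i + j))
    return fingerprint

print(winnow_min(kgram_hashes("the quick brown fox", k=5, base=10), window=10))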
Example #5
File: main.py Project: rotembeh/SGD
def main():
    testImgs, testLabels = loadTest()
    preImages, preLabels = preproc()
    images = preImages[:dataSize]
    labels = preLabels[:dataSize]
    labels = initLabels(labels)
    images = np.array(images)
    images = np.asmatrix(images)
    print("preproc done")
    W = np.random.rand(numOfLabels, pictureSize)
    #  W = np.zeros((numOfLabels,pictureSize))
    W = np.asmatrix(W)
    E = calcE(W, images, labels)
    print("RANDOM CLASIFYING RESAULTS:")
    classifyImages(W, testImgs, testLabels)
    print("----------------------------")
    print("first E = ", E)

    for t in range(int(TOTALDATASIZE / dataSize)):
        print("learning block ", t)
        images = preImages[t * dataSize:(t + 1) * dataSize]
        labels = preLabels[t * dataSize:(t + 1) * dataSize]
        labels = initLabels(labels)
        images = np.array(images)
        images = np.asmatrix(images)
        for k in range(numOfIterations):
            s = np.random.randint(dataSize, size=(numOfBatches, batchSize))
            for j in range(numOfBatches):
                gradient = calcMBgradient(W, s[j], images, labels)
                alpha = 0.5  # can be changed to 1/(k+1)
                W = updateWeights(W, alpha, gradient)
            if k % printModulo == 0:
                E = calcE(W, images, labels)
                print("k= ", k, "new E = ", E)

    E = calcE(W, images, labels)
    print("Last E = ", E)
    print("Done")
    print("TRANING DATA TEST:")
    classifyImages(W, preImages[:dataSize], preLabels[:dataSize])
    print("TESTING DATA TEST:")
    classifyImages(W, testImgs, testLabels)
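calcMBgradient and updateWeights are project functions; the mini-batch SGD step they implement is, in its plainest form, W ← W − α·gradient, optionally with a decaying step size such as the 1/(k+1) suggested in the comment above. A minimal self-contained sketch under that assumption (the constant gradient below is only a stand-in):

import numpy as np

def sgd_step(W, gradient, alpha):
    """Plain gradient step: move W against the mini-batch gradient."""
    return W - alpha * gradient

W_demo = np.zeros((3, 4))
for k in range(5):
    alpha = 1.0 / (k + 1)                  # decaying step size
    fake_gradient = np.ones_like(W_demo)   # stand-in for calcMBgradient(...)
    W_demo = sgd_step(W_demo, fake_gradient, alpha)
print(W_demo[0, 0])                        # -(1 + 1/2 + 1/3 + 1/4 + 1/5)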
Example #6
"""
[optional - Efficiency] 
Compare the traning time and test time of the milestone 1, 2, and 3 methods 
    (plus task 4 - if your team did it). 
Use the average (or mode) runtime over 10 re-runs 
    and perform a suitable statistical test to assess whether one of those performs significantly better 
    than the others w.r.t. efficieny of training and test time. 
"""

#%% 1. Data reading and preprocessing
import pandas as pd
from preprocessing import preproc, setUsedData
# Read data
# Remember to set path
labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv")
trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon = preproc(
    labeledData)
usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData(
    'batch', trainX, trainY, testX, testY)

del (trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon)

#%% 6. Try Different Classifiers
# Set Parameters
maxIter = 1000
tolerance = 1e-3

# Import models
import sklearn.linear_model as lm
from sklearn.svm import SVC
svm = SVC(kernel='rbf')
from sklearn.svm import LinearSVC
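The docstring above asks for the average runtime over 10 re-runs plus a suitable statistical test; a sketch of how that could be done with the objects already defined in this snippet (scipy is an extra dependency here, and the two models below are just examples of the milestone methods to compare):

import time
import numpy as np
from scipy.stats import wilcoxon

def time_fit(model, X, y, repeats=10):
    """Return the per-run training times in seconds."""
    times = []
    for _ in range(repeats):
        start = time.perf_counter()
        model.fit(X, y)
        times.append(time.perf_counter() - start)
    return np.array(times)

times_lr = time_fit(lm.LogisticRegression(max_iter=maxIter, tol=tolerance),
                    usedTrainX, usedTrainY)
times_svc = time_fit(LinearSVC(max_iter=maxIter, tol=tolerance),
                     usedTrainX, usedTrainY)
print("mean logistic regression: %.3fs, mean linear SVC: %.3fs"
      % (times_lr.mean(), times_svc.mean()))
stat, p = wilcoxon(times_lr, times_svc)   # paired, non-parametric test
print("Wilcoxon signed-rank p-value:", p)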
Example #7
def main():
    doc_vectors = {}
    with open('doc_vectors.pkl', 'rb') as f:
        doc_vectors = pickle.load(f)

    # deserialise the saved terms and the corresponding inverted index
    all_terms = {}
    with open('all_terms.pkl', 'rb') as f:
        all_terms = pickle.load(f)

    # deserialise the saved inverse document frequencies
    term_df = []
    with open('term_df.pkl', 'rb') as f:
        term_df = pickle.load(f)

    #deserialise the saved documents mapped to their IDs.
    with open('doc_id.pkl', 'rb') as f:
        doc_id = pickle.load(f)

    # open the query file and perform preprocessing (arg is expected to hold its path)
    name = arg
    with open(name, "r") as f:
        sr = f.read()
    final_doc = preproc(sr)

    # get all the terms of the query document with their respective frequencies
    mp = Counter(final_doc)

    #initialise the document vector for the query document, with all zeroes
    q_vector = [0] * len(all_terms)
    idx = 0
    for i in sorted(all_terms.keys()):
        if i in final_doc:
            # update the document vector with the frequencies of all the terms
            q_vector[idx] += mp[i]
        idx += 1

    q_vector = np.log(np.add(1, q_vector))  #lg(1+tf)
    q_vector = np.multiply(q_vector, term_df)  #lg(1+tf)*lg(N/df)
    q_vector /= np.linalg.norm(q_vector)  #normalize the document-vector

    #perform dot-product multiplication with the document vectors of the corpus documents
    dot_scores = {}
    for i in doc_vectors:
        score = np.dot(q_vector, doc_vectors[i])
        dot_scores[i] = score

    #sort the dictionary of docid : dot_prod according to the dot-product values.
    sorted_dot_scores = sorted(dot_scores.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

    # print the most-similar-document ranking
    for docid, score in sorted_dot_scores:
        for k, v in doc_id.items():
            if v == docid:
                name = k
        name = name.split(".")[0]
        print(name)
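The weighting above is the classic log(1+tf) * log(N/df) scheme with length normalisation, so the final dot products are cosine similarities. A compact self-contained sketch of the same computation on a toy vocabulary (all names and numbers below are illustrative):

import numpy as np
from collections import Counter

vocab = ["apple", "banana", "cherry"]
idf = np.log([10 / 4, 10 / 2, 10 / 7])            # log(N/df) for N = 10 documents

def tfidf_vector(tokens):
    tf = Counter(tokens)
    vec = np.log1p([tf[t] for t in vocab]) * idf  # log(1+tf) * log(N/df)
    norm = np.linalg.norm(vec)
    return vec / norm if norm else vec

query_vec = tfidf_vector(["apple", "apple", "cherry"])
doc_vec = tfidf_vector(["apple", "banana", "banana", "cherry"])
print("cosine similarity:", float(np.dot(query_vec, doc_vec)))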
Example #8
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

from util import gini_xgb
from preprocessing import preproc

####################### Data Preprocessing #####################
# Importing the training dataset
dataset_train = pd.read_csv('train.csv')

# Importing the test dataset
dataset_test = pd.read_csv('test.csv')

# preprocessing both sets
X_train, y_train = preproc(dataset_train, mode="train", oneHot=False)
X_test, y_test = preproc(dataset_test, mode="test", oneHot=False)

####################### Training #####################
K = 5  # number of folds
kf = KFold(n_splits=K, random_state=42, shuffle=True)
# KFold Cross Validation
results = []
i = 0
for train_index, test_index in kf.split(X_train):
    train_X, valid_X = X_train[train_index], X_train[test_index]
    train_y, valid_y = y_train[train_index], y_train[test_index]
    weights = np.zeros(len(y_train))
    weights[y_train == 0] = 1
    weights[y_train == 1] = 1
    print(weights, np.mean(weights))
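The excerpt stops right after printing the class weights; a plausible continuation of the fold body is sketched below (indented to continue the loop). It assumes gini_xgb follows XGBoost's usual (preds, DMatrix) custom-metric convention; this is a sketch, not the project's actual training code:

    # sketch of an assumed continuation: train and score an XGBoost booster on this fold
    import xgboost as xgb                # would normally sit at the top of the file
    params = {"objective": "binary:logistic", "eta": 0.05, "max_depth": 5}
    dtrain = xgb.DMatrix(train_X, label=train_y, weight=weights[train_index])
    dvalid = xgb.DMatrix(valid_X, label=valid_y)
    booster = xgb.train(params, dtrain, num_boost_round=200,
                        evals=[(dvalid, "valid")])
    preds = booster.predict(dvalid)
    results.append(gini_xgb(preds, dvalid))  # assumed (preds, DMatrix) signature
    i += 1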
Example #9
def query_preprocessing(query, model):
    query_preprocessed = preproc(query)
    return query_preprocessed
Example #10
# -*- coding: utf-8 -*-
"""
[optional - Neural Network] Train and run a Neural Network.
Evaluate the predictions using 10-fold cross-validation and a suitable error measure.
"""

import pandas as pd
from preprocessing import preproc, setUsedData

dataset = pd.read_csv("../data/kaggle_forest_cover_train.csv")
trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon = preproc(
    dataset)
trainBatchSize = 1000
testBatchSize = 300

usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData(
    'batch', trainX, trainY, testX, testY, trainBatchSize, testBatchSize)

# 2. neural network
# http://scikit-learn.org/stable/modules/neural_networks_supervised.html

from sklearn.neural_network import MLPClassifier
import numpy as np
from sklearn.model_selection import ShuffleSplit, cross_val_score

left = 2
right = 20
i = 1
acc = np.zeros((right - left + 1), dtype=int)
# for node in range(left,right,20):
# for layer in range(left,right):
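The commented-out loops above suggest a grid over node and layer counts; for a single configuration, the 10-fold evaluation the docstring asks for can be sketched with the imports already present (the hidden-layer size is only an example value):

clf = MLPClassifier(hidden_layer_sizes=(20,), max_iter=500, random_state=42)
scores = cross_val_score(clf, usedTrainX, usedTrainY, cv=10)
print("10-fold accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))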
Example #11
import pandas as pd                    # needed below but missing from this excerpt
from keras.models import Sequential    # used in the fold loop; tensorflow.keras works too
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

from util import cross_entropy, gini_normalized
from parameters import parameters, batch_size, epochs, layers, activation_functions, loss, alpha
from preprocessing import preproc

# Part 1 - Data Preprocessing
# Importing the train dataset
dataset_train = pd.read_csv('train.csv')

# Importing the test dataset
dataset_test = pd.read_csv('test.csv')

# preprocessing train dataset
X_train, y_train, scaler = preproc(dataset_train, 'train', oneHot=True, scale=True)

# preprocessing test dataset
X_test, y_test = preproc(dataset_test, 'test', oneHot=True, scale=True, scaler=scaler)

# Part 2 - Now let's make the ANN!
# Implement KFold cross validation
class_weight = {0: 1., 1: alpha}
K = 5
kf = KFold(n_splits=K, random_state=42, shuffle=True)
# training with KFold Cross Validation
i = 0
results = []
for train_index, test_index in kf.split(X_train):
    # Initialising the ANN
    classifier = Sequential()
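The listing breaks off right after the Sequential() call; a plausible shape for the rest of the fold body is sketched below as a helper (the layer sizes are placeholders, and loss, batch_size, epochs and class_weight are the values imported above). This is a sketch under those assumptions, not the project's actual network:

from keras.layers import Dense

def fit_fold(train_X, train_y, valid_X, valid_y):
    """Build, train and evaluate one ANN for a single fold."""
    classifier = Sequential()
    classifier.add(Dense(64, activation='relu', input_dim=train_X.shape[1]))
    classifier.add(Dense(1, activation='sigmoid'))
    classifier.compile(optimizer='adam', loss=loss, metrics=['accuracy'])
    classifier.fit(train_X, train_y, batch_size=batch_size, epochs=epochs,
                   class_weight=class_weight, verbose=0)
    return classifier.evaluate(valid_X, valid_y, verbose=0)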
Example #12
File: main.py Project: vakili73/CodeV1
datasets = [
    'mnist',
    'nist',
    'pendigits',
    'satimage',
    'usps',
]

# %% Main Program


if __name__ == '__main__':

    # iterate over each dataset
    for db in datasets:
        dataset = loader.db_load(db)
        dataset = preprocessing.preproc(dataset)

        # use all algorithm for testing
        for alg in algorithms:
            algorithm = algs.get_algorithm(alg, dataset)
            transformed_db = algorithm.transform(dataset)

            histories = []
            # train neural network with the num of simulation
            for sim_num in range(simulation_num):
                algorithm = loader.alg_load(algorithm, dataset)
                history = compile_fit(algorithm, transformed_db)
                if not sim_num % 10:
                    algorithm.save_weights(db, sim_num)
                    plot_history(history, alg, db, sim_num)
                histories.append(history.history)
Example #13
training["transaction_date"] = training.transaction_date.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0)

training["registration_init_time"] = training.registration_init_time.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x)
training["registration_init_time"] = training.registration_init_time.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0)

testing["membership_expire_date"] = testing.membership_expire_date.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x)
testing["membership_expire_date"] = testing.membership_expire_date.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0)

testing["transaction_date"] = testing.transaction_date.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x)
testing["transaction_date"] = testing.transaction_date.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0)

testing["registration_init_time"] = testing.registration_init_time.apply(lambda x: datetime.strptime(str(x), "%Y-%m-%d").date() if pd.notnull(x) else x)
testing["registration_init_time"] = testing.registration_init_time.apply(lambda x: time.mktime(x.timetuple()) if pd.notnull(x) else 0.0)

print("preprocessing")
X_train, y_train = preproc(training, mode='train', oneHot=False)
X_test, y_test = preproc(testing, mode="test", oneHot=False)

# parameters
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'binary_logloss'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
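The params dict above is a standard LightGBM training configuration; a minimal sketch of how such a dict is typically consumed (num_boost_round is just an example value, and X_train/y_train/X_test/y_test are the arrays produced by preproc above):

import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
booster = lgb.train(params, lgb_train, num_boost_round=200,
                    valid_sets=[lgb_valid])
predictions = booster.predict(X_test)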
Example #14
# exporting facebook comments with https://socialfy.pw/facebook-export-comments
# place csv-file in original inputs
# eclipse test

import time
import preprocessing as pre
#import getbands as gb

startTime = time.time()

pre.preproc()
pre.getbands()

endTime = time.time()
print("Finished in " + str('{:.3f}'.format(endTime - startTime)) + " seconds.")
Example #15
def flight(usr_txt, _from, _to, _date, tries):
    def find_data(usr_txt):
        _from = None
        _to = None
        parsed = preproc(usr_txt)
        locs = [find_loc([], usr_txt, parsed_sent) for parsed_sent in parsed]
        for sent_locs in locs:
            if len(sent_locs) > 0:
                for loc in sent_locs:
                    loc_rel = loc_relation(loc, parsed[0])
                    if loc_rel == 'from':
                        _from = loc[1]
                    elif loc_rel == 'to':
                        _to = loc[1]
        return _from, _to, find_date(usr_txt)

    if not _from and not _to and not _date:
        if tries > 3:
            return "Sorry, I couldn't understand."
        else:
            _from, _to, _date = find_data(usr_txt)

    if not _from:
        respond("Where are you flying from?")
        usr_txt = raw_input(usr)
        parsed = preproc(usr_txt)

        if len(parsed[0]) <= 3 and isloc(usr_txt, 0, parsed[0],
                                         parsed[0][0][0], parsed[0][0][1]):
            _from = usr_txt

        else:
            if not _date:
                _from, _to, _date = find_data(usr_txt)
            else:
                _from, _to, _ = find_data(usr_txt)

        return flight(usr_txt, _from, _to, _date, tries + 1)

    elif not _to:
        respond("Where are you flying to?")
        usr_txt = raw_input(usr)
        parsed = preproc(usr_txt)

        if len(parsed[0]) <= 3 and isloc(usr_txt, 0, parsed[0], parsed[0][0][0],
                                         parsed[0][0][1]):
                                      parsed[0][0][1]):
            _to = usr_txt

        else:
            if not _date:
                _from, _to, _date = find_data(usr_txt)
            else:
                _from, _to, _ = find_data(usr_txt)

        return flight(usr_txt, _from, _to, _date, tries + 1)

    elif _from.lower() == _to.lower():
        respond("The departure and destination cities can't be the same.")
        usr_txt = raw_input(usr)

        if not _date:
            _from, _to, _date = find_data(usr_txt)
        else:
            _from, _to, _ = find_data(usr_txt)

        return flight(usr_txt, _from, _to, _date, tries + 1)

    elif not _date:
        respond("When do you want to go?")
        usr_txt = raw_input(usr)
        _date = find_date(usr_txt)
        return flight(usr_txt, _from, _to, _date, tries + 1)

    elif not later_date(_date):
        respond("Sorry, the date has to be in the future.")
        usr_txt = raw_input(usr)
        _date = find_date(usr_txt)
        return flight(usr_txt, _from, _to, _date, tries + 1)

    return "OK, looking for a %s-%s flight on %s." % (_from, _to, _date)
Example #16
        if isloc(raw, ind, parsed, lemma, pos):
            name = NER_recursion(ind, parsed, lemma, pos)
            if locs != []:
                locs.append(( locs[-1][0] + len(locs[-1][1].split()) + ind, name))
            else:
                locs.append((ind, name))
            return find_loc(locs, raw, parsed[ind+len(name.split()):])
    return locs


def loc_relation(loc, parsed):
    prev_w = ''
    next_w = ''
    if loc[0] > 0:
        prev_w = parsed[loc[0]-1][0]
    if loc[0]+1 != len(parsed):
        next_w = parsed[loc[0]+1][0]

    if prev_w in ['from', 'leave', 'leaving'] or next_w == '-':
        return 'from'
    elif prev_w in ['to', 'into', 'towards', '-']:
        return 'to'


if __name__ == '__main__':
    # Test your stuff.

    s1 = 'I am flying from Lviv to New York'
    parsed = preproc(s1)
    locs = [find_loc([], s1, parsed_sent) for parsed_sent in parsed]
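A possible continuation of the self-test, assuming each token produced by preproc is a (word, POS, ...) tuple as the indexing above suggests, would print the detected relation for every location found:

    for sent_locs in locs:
        for loc in sent_locs:
            print(loc[1], '->', loc_relation(loc, parsed[0]))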