Example #1
File: run.py Project: anubrata/fc-aaai18
def get_processed_data():
    import pandas as pd
    import features
    import data_pp  # project-local preprocessing module (assumed importable)
    (train_data, X_train, val_data, X_val, test_data, X_test) = features.get_data()
    all_data = pd.concat([train_data, val_data, test_data], ignore_index=True)
    data = data_pp.process_data(all_data)
    clean_body_labels(data)  # defined elsewhere in this project
    
    train_data_pp = data[: len(train_data)]
    val_data_pp = data[len(train_data): len(train_data) + len(val_data)]
    test_data_pp = data[len(train_data) + len(val_data): ]
    
    return (train_data_pp, X_train, val_data_pp, X_val, test_data_pp, X_test)
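A minimal usage sketch; the classifier choice and the 'Stance' label column are assumptions for illustration, not part of the project:

# Hypothetical usage: the X_* matrices line up row-for-row with the frames.
from sklearn.linear_model import LogisticRegression

train_pp, X_train, val_pp, X_val, test_pp, X_test = get_processed_data()
clf = LogisticRegression()
clf.fit(X_train, train_pp['Stance'])        # 'Stance' is a hypothetical column name
print(clf.score(X_val, val_pp['Stance']))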
Example #2
def predict_class(img, model):
    # Relies on module-level imports: pickle, cv2, numpy as np,
    # keras.models.model_from_json, plus the project modules proba, fea and mp.
    # display_image(img, " Prediction Module")
    # Load ANN
    with open('MLPClassifier.pkl', 'rb') as f:
        clf1 = pickle.load(f)

    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)

    #  Load CNN
    with open('CNNmodelFinal.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(
        r"F:\CODING\ProjectLatex\draft\models\.014-0.783.hdf5")  # raw string keeps the backslashes literal
    loaded_model.compile(loss='categorical_crossentropy',
                         optimizer='adadelta',
                         metrics=['accuracy'])

    if model == "cnn":
        img = cv2.resize(img, (128, 128))
        # img = pre.filter_image(img)
        # img = pre.otsu_thresh(img)
        # print(img)
        # Wrap the single image as a one-element batch for the network.
        immatrix = []
        immatrix.append(img)
        inp = np.asarray(immatrix)
        Output = proba.prob(img)
        inp = inp.reshape(inp.shape[0], 128, 128, 1)
        inp = inp.astype('float32')
        inp /= 255
        # print(inp)
        output = loaded_model.predict_classes(inp)
        # print(output)
        z = mp.list[int(output[0])]
        # output = proba(Output, z)
        return Output
    else:
        x = fea.get_data(img)
        temp = []
        temp.append(x)
        temp = scaler.transform(temp)
        y = clf1.predict(temp)
        y = mp.list[int(y[0])]
        if y in mp.small:
            y = y.lower()
        # print(y + ' Got Predicted')
        return y
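A short invocation sketch; the image file name is a placeholder, and the function expects a grayscale OpenCV image:

# Hypothetical usage of predict_class().
img = cv2.imread('char_sample.png', 0)
print(predict_class(img, 'cnn'))   # CNN branch
print(predict_class(img, 'ann'))   # MLP branch (any value other than 'cnn')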
Example #3
for files in os.listdir(path):
    real_value = pos
    print(files)
    print(real_value)
    if pos < 95:
        pos = pos + 1
        continue
    path2 = os.listdir(path + files + '/')
    y = (len(path2))
    cnt = 0
    boundary = (y * 90) / 100  # first 90% of each class goes to training
    for image_name in path2:
        val = path + files + '/' + image_name
        img = cv2.imread(val, 0)
        cnt = cnt + 1
        new_list = get_data(img)
        if new_list == -1:
            continue
        # print (new_list)
        if cnt > boundary:
            for i in new_list:
                test_in.write(str(i))
                test_in.write(" ")
            test_in.write('\n')
            test_out.write(str(real_value))
            test_out.write('\n')
        else:
            for i in new_list:
                train_in.write(str(i))
                train_in.write(" ")
            train_in.write('\n')
            train_out.write(str(real_value))
            train_out.write('\n')
Example #4
File: checking.py Project: Sarthak2119/OCR
def decode_vector(vec):
    # Map a one-hot prediction to its character:
    # classes 1-10 -> '0'-'9', 11-36 -> 'A'-'Z', 37-62 -> 'a'-'z'.
    cnt = 0
    for i in vec:
        cnt = cnt + 1
        if i == 1:
            break
    if cnt < 11:
        return chr(48 + cnt - 1)
    elif cnt < 37:
        return chr(65 + cnt - 11)
    else:
        return chr(97 + cnt - 37)

def image_to_text(img_path):
    img = cv2.imread(img_path, 0)
    # img = cv2.resize(img, (50, 50), interpolation=cv2.INTER_CUBIC)

    with open('KNNClassifier.pkl', 'rb') as f:
        clf2 = pickle.load(f)
    with open('ExtraTreesClassifier.pkl', 'rb') as f:
        clf1 = pickle.load(f)
    with open('MPLClassifier4.pkl', 'rb') as f:
        clf3 = pickle.load(f)
    with open('scaler4.pkl', 'rb') as f:
        scaler = pickle.load(f)
    list_chars = template.run(img)

    # print(len(list_chars))
    # print(len(list_chars[0]))

    ret_str = ""
    for word_list in list_chars:
        chars = []
        for char_img in word_list:
            datas = get_data(char_img)
            # util.display_image(char_img)
            chars.append(datas)

        chars = scaler.transform(chars)
        out_vec1 = clf1.predict(chars)
        out_vec2 = clf2.predict(chars)
        out_vec3 = clf3.predict(chars)

        x1 = ""
        for vec in out_vec1:
            cnt = 0
            for i in vec:
                cnt = cnt + 1
                if i == 1:
                    break
            val = ""
            if cnt < 11:
                cnt = cnt - 1
                val = chr(48 + cnt)
            elif cnt > 10 and cnt < 37:
                cnt = cnt - 11
                val = chr(65 + cnt)
            else:
                cnt -= 37
                val = chr(97 + cnt)
            x1 = x1 + val

        x2 = ""
        for vec in out_vec2:
            cnt = 0
            for i in vec:
                cnt = cnt + 1
                if i == 1:
                    break
            val = ""
            if cnt < 11:
                cnt = cnt - 1
                val = chr(48 + cnt)
            elif cnt > 10 and cnt < 37:
                cnt = cnt - 11
                val = chr(65 + cnt)
            else:
                cnt -= 37
                val = chr(97 + cnt)
            x2 = x2 + val

        x3 = ""
        for vec in out_vec3:
            cnt = 0
            for i in vec:
                cnt = cnt + 1
                if i == 1:
                    break
            val = ""
            if cnt < 11:
                cnt = cnt - 1
                val = chr(48 + cnt)
            elif cnt > 10 and cnt < 37:
                cnt = cnt - 11
                val = chr(65 + cnt)
            else:
                cnt -= 37
                val = chr(97 + cnt)
            x3 = x3 + val

        finalx = ""
        for i in range(0, len(x1)):
            l = []
            if x1[i] != 'z':
                l.append(x1[i])
            if x2[i] != 'z':
                l.append(x2[i])
            if x3[i] != 'z':
                l.append(x3[i])
            if len(l) == 0:
                finalx += 'z'
            else:
                finalx += l[0]
        ret_str += finalx
        ret_str += " "
    return ret_str
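A short usage sketch; the page image path is a placeholder. Note that the voting loop above simply keeps the first of the three predictions that is not 'z':

# Hypothetical usage with a scanned page image.
text = image_to_text('scanned_page.png')
print(text)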
Example #5
# Author:   Sebastian Law
# Date:     30-Mar-2016
# Revised:  30-Mar-2016

import features
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve

data = features.get_data()
train = data.loc[data['Survived'].notnull()]
X = train.values[:, 2:]
y = train.values[:, 1]

# forest = RandomForestClassifier(n_estimators=1000,
#                                 max_depth=8,
#                                 criterion='entropy',
#                                 min_samples_split=5,
#                                 max_features=6)
forest = RandomForestClassifier(n_estimators=1000,
                                max_depth=9,
                                criterion='entropy',
                                min_samples_split=10,
                                max_features=6)

train_sizes, train_scores, test_scores = learning_curve(
    forest,
    X,
    y,
    cv=10)
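The snippet is cut off before the plotting step; a minimal sketch of how the returned score arrays are typically drawn (labels and styling are my own, not the author's):

# Hypothetical continuation: average scores across CV folds and plot them.
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_mean, 'o-', label='training score')
plt.plot(train_sizes, test_mean, 'o-', label='cross-validation score')
plt.xlabel('training examples')
plt.ylabel('score')
plt.legend(loc='best')
plt.show()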
Example #6

from operator import itemgetter
import numpy as np
import features
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV  # pre-0.18 scikit-learn layout, matching grid_scores below

def report(grid_scores, n_top=6):
    params = None
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Rank: {0}".format(i + 1))
        print("Mean score: {0:.4f} (std: {1:.4f})".format(
              score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters:", score.parameters)
        print("")
        if params is None:
            params = score.parameters
    return params

data = features.get_data()
train = data.loc[data['Survived'].notnull()]
X = train.values[:, 2:]
y = train.values[:, 1]

sqrtfeat = np.sqrt(X.shape[1]).astype(int)

grid_test = {"n_estimators"      : [1000, 2000, 3000, 4000, 5000],
             "criterion"         : ["gini", "entropy"],
             "max_features"      : [sqrtfeat, sqrtfeat+1, sqrtfeat+2, sqrtfeat+3],
             "max_depth"         : [5, 7, 9, 11, 13],
             "min_samples_split" : [2, 4, 6, 8, 10]}

forest = RandomForestClassifier(oob_score=True)

grid_search = GridSearchCV(forest, grid_test, n_jobs=-1, cv=10)
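The snippet stops before the grid is fitted; a plausible continuation under the pre-0.18 scikit-learn API assumed above:

# Hypothetical continuation: run the search and report the top settings.
grid_search.fit(X, y)
best_params = report(grid_search.grid_scores_)  # grid_scores_ is the pre-0.18 attribute
print(best_params)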
Example #7
# Code to train the algorithms
# Written by Rajat Arora 2013A7PS104P
from features import get_data, sentiment_score, add_sentiment_score
from vector import get_word_features, vectorize, get_words, naive_bayes_vector
import cPickle
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from maxent import Maxent
import numpy
from naive_bayes import NaiveBayesClassifier
from svm_classifier import SVM

data = get_data('shortdatabase.csv')
word_features = get_word_features(data['tweet'])
word_features = sorted(word_features)
word_vector = vectorize(word_features, data['tweet'], data['sentiment'])

vector = []
labels = []
for example in word_vector:
    vector = vector + [example[0]]
    labels = labels + [example[1]]
print "Stage 1: Word Polarity"
print "training bayesian network"

words = get_words("features.txt")
bayes_vector = naive_bayes_vector(words, data['tweet'], data['sentiment'])
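The listing ends as stage 1 begins; a minimal sketch of fitting one of the imported classifiers on the vectors built above, assuming they are numeric feature rows:

# Hypothetical continuation: train a Bernoulli naive Bayes on the stage-1 vectors.
stage1_clf = BernoulliNB()
stage1_clf.fit(numpy.array(vector), numpy.array(labels))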
Example #8
def run():
    f = 0
    test_data = []
    output = []
    train_data = []
    predicted_output = []
    x = 0
    y = 0
    cwd = os.getcwd()
    train_in = 'train_input6.txt'
    test_in = 'test_input6.txt'
    train_out = 'train_out6.txt'
    test_out = 'test_out6.txt'
    train_in = open(train_in, 'w')
    train_out = open(train_out, 'w')
    test_in = open(test_in, 'w')
    test_out = open(test_out, 'w')

    path = cwd+'/Fnt6/'
    for files in os.listdir(path):
        character_value = int(files[6:])  # numeric suffix of the class directory name
        # print (files)
        #cnt += 1
        # print (character_value)
        real_value = 0
        # print(files)
        if character_value > 0 and character_value < 11:
            real_value = str(character_value-1)
        elif character_value > 10 and character_value < 37:
            character_value = character_value - 10
            real_value = chr(65 + character_value - 1)
        else:
            character_value = character_value - 36
            real_value = chr(97 + character_value - 1)
        # print(real_value)

        path2 = os.listdir(path + files + '/')
        num_images = len(path2)  # was assigned to y, clobbering the train-sample counter
        cnt = 0
        boundary = (num_images * 90) / 100  # first 90% of each class goes to training
        for image_name in path2:

            val = path + files + '/' + image_name
            #print(val)
            img = cv2.imread(val, 0)
            cnt = cnt + 1
            new_list = get_data(img)
            # print (new_list)
            if cnt > boundary:
                test_data.append(new_list)
                x = x + 1
                output.append(real_value)
                for i in new_list:
                    test_in.write(str(i))
                    test_in.write(" ")

                test_in.write('\n')
                test_out.write(str(real_value))
                test_out.write('\n')
            else:
                train_data.append(new_list)
                y = y + 1
                predicted_output.append(real_value)
                for i in new_list:
                    train_in.write(str(i))
                    train_in.write(" ")
                # train_in.write(str(new_list))
                train_in.write('\n')
                train_out.write(str(real_value))
                train_out.write('\n')

    train_out.close()
    train_in.close()
    test_out.close()
    test_in.close()
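A small sketch of reading the dumped files back for training, assuming one whitespace-separated feature row per line as written above:

# Hypothetical follow-up: load the text dumps produced by run().
import numpy as np
X_train = np.loadtxt('train_input6.txt')
y_train = np.loadtxt('train_out6.txt', dtype=str)
X_test = np.loadtxt('test_input6.txt')
y_test = np.loadtxt('test_out6.txt', dtype=str)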
Example #9
def predict(in_fname,
            lin_n_cv_iters,
            n_cv_iters,
            regularizations,
            n_labs,
            age_index,
            gender_index,
            out_fname,
            nn_out_fname=None,
            verbose=False,
            emb_fnames=None):

    if verbose:
        print "loading data"

    X_train, Y_train, X_validation, Y_validation, X_test, Y_test = features.get_data(
        in_fname)

    emb_data_list = [None]
    emb_fname_list = ['']
    if emb_fnames is not None:
        for emb_fname in emb_fnames:
            emb_data_list.append(emb.get_emb_data(emb_fname))
            emb_fname_list.append(emb_fname)

    if verbose:
        print "training, validating and testing models"

    results = []

    for e, emb_data in enumerate(emb_data_list):
        if verbose:
            print str(e)

        if verbose:
            print "-->L2"

        model = models.L2(X_train, Y_train, X_validation, Y_validation, X_test,
                          Y_test, n_labs, emb_data)
        if lin_n_cv_iters == -1:
            params = [[False, True], regularizations]
        else:
            params = [['sample', False, True],
                      ['uniform', regularizations[0], regularizations[-1]]]

        model.crossvalidate(params=params,
                            param_names=['fit_intercept', 'C'],
                            n_cv_iters=lin_n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if verbose:
            print "-->L1"

        model = models.L1(X_train, Y_train, X_validation, Y_validation, X_test,
                          Y_test, n_labs, age_index, gender_index, emb_data)
        if lin_n_cv_iters == -1:
            params = [[False, True], regularizations]
        else:
            params = [['sample', False, True],
                      ['uniform', regularizations[0], regularizations[-1]]]
        model.crossvalidate(params=params,
                            param_names=['fit_intercept', 'C'],
                            n_cv_iters=lin_n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if verbose:
            print "-->RandomForest"

        model = models.RandomForest(X_train, Y_train, X_validation,
                                    Y_validation, X_test, Y_test, emb_data)
        if n_cv_iters == -1:
            params = [[1, 10, 20], [1, 3, 10],
                      ['sqrt_n_features', 'n_features'],
                      [1, 3, 10], [1, 3, 10], [True, False],
                      ['gini', 'entropy']]
        else:
            params = [['randint', 1, 20], ['randint', 1, 10],
                      ['sample', 'sqrt_n_features', 'n_features'],
                      ['randint', 1, 10], ['randint', 1, 10],
                      ['sample', True, False], ['sample', 'gini', 'entropy']]
        param_names = [
            'n_estimators', 'max_depth', 'max_features', 'min_samples_split',
            'min_samples_leaf', 'bootstrap', 'criterion'
        ]
        model.crossvalidate(params=params,
                            param_names=param_names,
                            n_cv_iters=n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if emb_data is not None:
            if verbose:
                print "-->Only embeddings"

            model = models.L(emb_data[0], Y_train, emb_data[1], Y_validation,
                             emb_data[2], Y_test, None)
            if lin_n_cv_iters == -1:
                params = [['l1', 'l2'], [False, True], regularizations]
            else:
                params = [['sample', 'l1', 'l2'], ['sample', False, True],
                          ['uniform', regularizations[0], regularizations[-1]]]

            model.crossvalidate(params=params,
                                param_names=['penalty', 'fit_intercept', 'C'],
                                n_cv_iters=lin_n_cv_iters)
            model.test()
            s = model.summarize()
            s['emb_fname'] = emb_fname_list[e]
            results.append(s)

    with open(out_fname, 'w') as fout:
        fout.write(yaml.dump(results))

    if nn_out_fname is not None:
        best_model = nn.evaluate(nn_out_fname,
                                 n_cv_iters,
                                 20,
                                 X_train,
                                 Y_train,
                                 X_validation,
                                 Y_validation,
                                 X_test,
                                 Y_test,
                                 45,
                                 models=['cnn2'],
                                 random_seed=345,
                                 verbose=verbose)
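A hypothetical invocation; every file name and grid value below is a placeholder. Passing -1 for the iteration counts selects the explicit parameter grids rather than random sampling, as the branches above show:

# Hypothetical call to predict(); values are illustrative only.
predict(in_fname='data/features.pkl',
        lin_n_cv_iters=-1,
        n_cv_iters=-1,
        regularizations=[0.001, 0.01, 0.1, 1.0, 10.0],
        n_labs=20,
        age_index=0,
        gender_index=1,
        out_fname='results.yaml',
        verbose=True)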