Example #1
    def classify(self, features, classifier=None):
        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = MultinomialNB()

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()
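
For reference, FeatureUnion expects the features argument to be a list of (name, transformer) pairs. A minimal runnable sketch of such a list (the feature names and vectorizer settings below are illustrative assumptions, not taken from the examples):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion

# Each entry is a (name, transformer) pair; FeatureUnion concatenates their outputs.
features = [
    ('word_tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
    ('char_ngrams', CountVectorizer(analyzer='char', ngram_range=(2, 4))),
]

# Same shape as built inside classify() above.
feature_union = ('feats', FeatureUnion(features))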
Example #2
    def classify(self, features, classifier=None):
        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = DecisionTreeClassifier(min_samples_leaf=2,
                                                max_depth=50)

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()
Example #3
    def classify(self, features, classifier=None):
        self.tokenize()

        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development

        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]

        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]

        self.X_test = self.X[development_test_split:]

        if self.data.avoid_skewness:
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]

            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)

        ##CHANGE OPTIONS HERE
        self.model = Sequential()
        self.model.add(
            Dense(512,
                  input_shape=(self.feature_length, self.feature_dimensions)))
        self.model.add(Flatten())
        self.model.add(Dense(6))
        self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.2))
        # # self.model.add(Dense(128))
        # self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.1))
        # self.model.add(Dense(6, input_dim=self.feature_length,)))
        # self.model.add(Activation('sigmoid'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=50,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()
Example #4
    def classify(self, features, classifier=None):
        self.tokenize()

        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development

        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]

        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]

        self.X_test = self.X[development_test_split:]

        if self.data.avoid_skewness:
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]

            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)

        self.word_embeddings_layer, self.word_embeddings_index = readWordEmbeddings(
            self.data.languages, self.data.response_variable)
        if self.word_embeddings_layer is None:
            self.createWordEmbeddings()

        self.printDataInformation()

        ##CHANGE OPTIONS HERE
        self.model = Sequential()
        self.model.add(self.word_embeddings_layer)
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(self.word_embeddings_dim))
        self.model.add(Dense(self.Y.shape[1], activation='sigmoid'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=5,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()
Example #5
    def classify(self, features, classifier=None):

        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = SGDClassifier(loss='hinge',
                                       random_state=42,
                                       max_iter=50,
                                       tol=None)

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])
        print(self.classifier)

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()
Example #6
    def __init__(self, k, method, data, features, new_classifier,
                 print_details, show_fitting):
        self.k = k
        self.kf = KFold(n_splits=self.k)

        self.print_details = print_details
        self.show_fitting = show_fitting

        self.printer = Printer(str(self.k) + '-Fold validation')

        self.method = method
        self.data = data
        self.features = features
        self.new_classifier = new_classifier

        self.validation()
Example #7
  def classify(self, features, classifier=None):
  
    feature_union = ('feats', FeatureUnion(
      features
    ))

    if classifier is None:
      classifier = SGDClassifier(loss='hinge', random_state=42, max_iter=50, tol=None)
      
    self.classifier = Pipeline([
      feature_union,
      ('classifier', classifier)
    ])

    self.printer = Printer('Model Fitting', self.show_fitting)

    #self.X_train, X_none, self.Y_train, Y_none = train_test_split(self.X_train, self.Y_train, test_size=0.2, random_state=42)
    #self.printer.labelDistribution(self.Y_train, '80%')

    self.classifier.fit(self.X_train, self.Y_train)  
    self.printer.duration()
Example #8
class DecisionTree:
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []

    Y_predicted = []

    labels = []

    features = []

    def __init__(self, data, predict_method, show_fitting):

        self.X_train = data.X_train
        self.Y_train = data.Y_train

        self.X_development = data.X_development
        self.Y_development = data.Y_development

        self.X_test = data.X_test

        self.labels = data.labels

        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = DecisionTreeClassifier(min_samples_leaf=2,
                                                max_depth=50)

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)

        self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)
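
To make explicit what classify() assembles here, a standalone sketch of the same pipeline shape using plain scikit-learn (the toy texts and the single TfidfVectorizer feature are illustrative assumptions):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.tree import DecisionTreeClassifier

# Toy training data, assumed only for this sketch.
X_train = ['good movie', 'bad movie', 'great film', 'terrible film']
Y_train = ['pos', 'neg', 'pos', 'neg']

pipeline = Pipeline([
    ('feats', FeatureUnion([('tfidf', TfidfVectorizer())])),
    ('classifier', DecisionTreeClassifier(min_samples_leaf=2, max_depth=50)),
])
pipeline.fit(X_train, Y_train)
print(pipeline.predict(['awesome film']))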
Example #9
class NaiveBayes:
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []

    Y_predicted = []

    labels = []

    features = []

    def __init__(self, data, show_fitting):

        self.X_train = data.X_train
        self.Y_train = data.Y_train

        self.X_development = data.X_development
        self.Y_development = data.Y_development

        self.X_test = data.X_test

        self.labels = data.labels

        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):
        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = MultinomialNB()

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)

        self.accuracy, self.precision, self.recall, self.f1score = metrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)
Example #10
class Baseline:

  X_train = []
  Y_train = []
  X_development = []
  Y_development = []
  X_test = []

  labels = []

  features = []

  def __init__(self, data, predict_method, show_fitting):
    self.X_train = data.X_train
    self.Y_train = data.Y_train

    self.X_development = data.X_development
    self.Y_development = data.Y_development

    self.X_test = data.X_test

    self.labels = data.labels

    self.predict_method = predict_method
    self.show_fitting = show_fitting

    self.classifier = Classifier()

  def classify(self, features, classifier=None):
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.classifier.fit(self.X_train, self.Y_train)  
    self.printer.duration()

  def evaluate(self):
    if self.X_development:
      self.Y_development_predicted = self.classifier.predict(self.X_development)
    if self.X_test:
      self.Y_test_predicted = self.classifier.predict(self.X_test)

    self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(self.Y_development, self.Y_development_predicted, self.labels)

  def printBasicEvaluation(self):    
    self.printer.evaluation(self.accuracy, self.precision, self.recall, self.f1score, "Basic Evaluation")

  def printClassEvaluation(self):
    self.printer.classEvaluation(self.Y_development, self.Y_development_predicted, self.labels)
Example #11
    def validation(self):
        i = 0
        for train_index, test_index in self.kf.split(
                self.data.X[:(self.data.amount_train +
                              self.data.amount_development)]):
            i += 1

            if self.print_details >= 4:
                n_printer = Printer(str(self.k) + '-Fold, Run: ' + str(i))
            X_train, X_development = list(np.array(
                self.data.X)[train_index]), list(
                    np.array(self.data.X)[test_index])
            Y_train, Y_development = list(np.array(
                self.data.Y)[train_index]), list(
                    np.array(self.data.Y)[test_index])
            self.data.initialize(X_train, Y_train, X_development,
                                 Y_development)

            classifier = selectClassifier(self.method, self.data,
                                          self.predict_method,
                                          self.show_fitting)
            classifier.classify(self.features, self.new_classifier)
            classifier.evaluate()

            self.accuracy.append(classifier.accuracy)
            self.precision.append(classifier.precision)
            self.recall.append(classifier.recall)
            self.f1score.append(classifier.f1score)

            if self.print_details >= 5:
                classifier.printBasicEvaluation()

            if self.print_details >= 6:
                classifier.printClassEvaluation()

            if self.print_details >= 7:
                n_printer.confusionMatrix(classifier.Y_development,
                                          classifier.Y_development_predicted,
                                          self.data.labels)
            # writeResults(options.args.predict_languages, classifier.Y_development, classifier.Y_development_predicted, 'development')

            if self.print_details >= 4:
                n_printer.duration()
Example #12
options = Options('System parameters', con)

#Step 5: Read all custom arguments/options
options.add(name='predict_languages', _type=str, _default='esdi', _help='specify which language you want to predict')

#Step 6: Parse arguments
options.parse()

#Use random seed
random.seed(options.args.random_seed)

#Custom function to read language from input
options.args.predict_languages = languages(options.args.predict_languages)

#Print system
printer = Printer('System')
printer.system(options.args_dict)

#Step 7: Create data with default arguments
data = Data(options.args.avoid_skewness, options.args.data_folder, options.args.predict_label, options.args.data_method)

#Step 8: Add all datasources and transform them to row(Y, X) format
#Custom, should be self-made!

#Step 8.1: Add the files or folders the data is preserved in (only if available)
data.file_train = 'impression_data.csv'

#Custom function
data.languages = options.args.predict_languages

#Load data into a file
Example #13
class SVM:
    X_train = []
    Y_train = []
    X_development = []
    Y_development = []
    X_test = []

    Y_predicted = []

    labels = []

    features = []

    def __init__(self, data, predict_method, show_fitting):

        self.X_train = data.X_train
        self.Y_train = data.Y_train

        self.X_development = data.X_development
        self.Y_development = data.Y_development
        self.X_test = data.X_test

        self.labels = data.labels

        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def classify(self, features, classifier=None):

        feature_union = ('feats', FeatureUnion(features))

        if classifier is None:
            classifier = SGDClassifier(loss='hinge',
                                       random_state=42,
                                       max_iter=50,
                                       tol=None)

        self.classifier = Pipeline([feature_union, ('classifier', classifier)])
        print(self.classifier)

        self.printer = Printer('Model Fitting', self.show_fitting)
        self.classifier.fit(self.X_train, self.Y_train)
        self.printer.duration()

    def evaluate(self):
        if self.X_development:
            self.Y_development_predicted = self.classifier.predict(
                self.X_development)
            print(self.X_development)
            #print(self.classifier.predict_proba(self.X_development))
            #print(self.Y_development[:20], self.Y_development_predicted[:20])
        if self.X_test:
            self.Y_test_predicted = self.classifier.predict(self.X_test)

        if self.predict_method == 'classification':
            self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(
                self.Y_development, self.Y_development_predicted, self.labels)

        elif self.predict_method == 'regression':
            # self.Y_development_predicted = self.classifier.score(self.X_development, self.Y_development)
            # print(self.Y_development_predicted)
            self.mean_abs_err, self.mean_squ_err, self.r2score, self.kl_divergence = regressionMetrics(
                self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        if self.predict_method == 'classification':
            self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                    self.f1score, "Classification Evaluation")
        elif self.predict_method == 'regression':
            self.printer.regressionEvaluation(self.mean_abs_err,
                                              self.mean_squ_err, self.r2score,
                                              self.kl_divergence,
                                              "Regression Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)
Example #14
options.add(name='predict_languages',
            _type=str,
            _default='esdi',
            _help='specify which language you want to predict')

#Step 6: Parse arguments
options.parse()

#Use random seed
random.seed(options.args.random_seed)

#Custom function to read language from input
options.args.predict_languages = languages(options.args.predict_languages)

#Print system
printer = Printer('System')
printer.system(options.args_dict)

#Step 7: Create data with default arguments
data = Data(options.args.avoid_skewness, options.args.data_folder,
            options.args.predict_label, options.args.data_method)

#Step 8: Add all datasources and transform them to row(Y, X) format
#Custom, should be self-made!

#Step 8.1: Add the files or folders the data is preserved in (only if available)
if options.args.predict_languages:
    data.file_train = options.args.data_folder + 'training/'
    # data.file_development = 'eng-trial.pickle'
    # data.file_test = 'eng-test.pickle'
Example #15
class NeuralNetwork:
    word_embeddings_file = 'data/word_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
    word_embeddings_dim = 200
    word_embeddings_layer = None
    word_embeddings_index = {}

    labels = []
    labels_dict = {}
    labels_dict_rev = {}

    Y = []

    def __init__(self, data, show_fitting):
        self.data = data

        self.X = self.data.X
        self.labels = self.data.labels

        for i, label in enumerate(self.labels):
            self.labels_dict[label] = i
            self.labels_dict_rev[i] = label

        self.Y = []
        for label in self.data.Y:
            self.Y.append(self.labels_dict[label])

        self.show_fitting = show_fitting

    def tokenize(self):
        self.X_tokenized = TextTokenizer.tokenizeTweets(self.X)  #all tweets!
        self.tokenizer = Tokenizer(split="|")
        self.tokenizer.fit_on_texts(self.X_tokenized)
        self.sequences = self.tokenizer.texts_to_sequences(self.X_tokenized)
        self.X = pad_sequences(self.sequences)
        self.Y = to_categorical(self.Y)

    def classify(self, features, classifier=None):
        self.tokenize()

        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development

        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]

        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]

        self.X_test = self.X[development_test_split:]

        if self.data.avoid_skewness:
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]

            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)

        self.word_embeddings_layer, self.word_embeddings_index = readWordEmbeddings(
            self.data.languages, self.data.response_variable)
        if self.word_embeddings_layer is None:
            self.createWordEmbeddings()

        self.printDataInformation()

        ##CHANGE OPTIONS HERE
        self.model = Sequential()
        self.model.add(self.word_embeddings_layer)
        self.model.add(Dropout(0.2))
        self.model.add(LSTM(self.word_embeddings_dim))
        self.model.add(Dense(self.Y.shape[1], activation='sigmoid'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=5,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()

    def evaluate(self):
        self.Y_development_predicted = self.model.predict(self.X_development)

        self.Y_development_predicted = np.argmax(self.Y_development_predicted,
                                                 axis=1)
        self.Y_development_predicted = [
            self.labels_dict_rev[int(i)]
            for i in list(self.Y_development_predicted)
        ]

        self.Y_development = np.argmax(self.Y_development, axis=1)
        self.Y_development = [
            self.labels_dict_rev[int(i)] for i in list(self.Y_development)
        ]

        self.accuracy, self.precision, self.recall, self.f1score = metrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)

    def printDataInformation(self):

        print('\n~~~Neural Network Distribution~~~\n')
        print('Found {} unique tokens.'.format(len(self.tokenizer.word_index)))
        print('Shape of data tensor: {}'.format(self.X.shape))
        print('Shape of label tensor: {}\n'.format(self.Y.shape))

        if len(self.word_embeddings_index) > 0:
            print('Found {} word vectors.'.format(
                len(self.word_embeddings_index)))

    def createWordEmbeddings(self):
        self.word_embeddings_index = {}
        f = open(self.word_embeddings_file, encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.word_embeddings_index[word] = coefs
        f.close()

        self.word_embeddings_matrix = np.zeros(
            (len(self.tokenizer.word_index) + 1, self.word_embeddings_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = self.word_embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.word_embeddings_matrix[i] = embedding_vector

        self.word_embeddings_layer = Embedding(
            len(self.tokenizer.word_index) + 1,
            self.word_embeddings_dim,
            mask_zero=True,
            weights=[self.word_embeddings_matrix],
            trainable=True)

        writeWordEmbeddings(self.word_embeddings_layer,
                            self.word_embeddings_index, self.data.languages,
                            self.data.response_variable)
Example #16
con = Constants()

#Step 4: Get options and read all system arguments
options = Options('System parameters', con)

#Step 5: Read all custom arguments/options
options.add(name='predict_languages', _type=str, _default='english', _help='specify which language you want to predict')

#Step 6: Parse arguments
options.parse()

#Use random seed
random.seed(options.args.random_seed)

#Print system
printer = Printer('System')
printer.system(options.args_dict)

#Step 7: Create data with default arguments
data = Data(options.args.avoid_skewness, options.args.data_folder, options.args.predict_label, options.args.data_method)

#Step 8: Add all datasources and transform them to row(Y, X) format
#Custom, should be self-made!

#Step 8.1: Add the files or folders the data is preserved in (only if available)
if options.args.predict_languages == 'english':
  data.file_train = 'eng-train.pickle'
  data.file_development = 'eng-trial.pickle'
  data.file_test = 'eng-test.pickle'
else: 
  data.file_train = 'es-train.pickle'
Example #17
  def classify(self, features, classifier=None):
    self.printer = Printer('Model Fitting', self.show_fitting)
    self.classifier.fit(self.X_train, self.Y_train)
    self.printer.duration()
Example #18
class NeuralNetwork:
    word_embeddings_file = 'data/word_embeddings/glove.twitter.27B/glove.twitter.27B.200d.txt'
    word_embeddings_dim = 200
    word_embeddings_layer = None
    word_embeddings_index = {}

    labels = []
    labels_dict = {}
    labels_dict_rev = {}

    Y = []

    def __init__(self, data, predict_method, show_fitting):
        self.data = data

        self.X = self.data.X
        self.labels = self.data.labels

        for i, label in enumerate(self.labels):
            self.labels_dict[label] = i
            self.labels_dict_rev[i] = label

        self.Y = []
        for label in self.data.Y:
            self.Y.append(self.labels_dict[label])

        self.predict_method = predict_method
        self.show_fitting = show_fitting

    def tokenize(self):
        xy_section = []
        xy_area = []
        xy_element = []
        xy = []

        for x in self.X:
            xy_section.append(x['xy_section'])
            #xy_area.append(text_to_word_sequence('|'.join(x['xy_area']), split='|'))
            xy_area.append(x['xy_area'])
            xy_element.append(x['xy_element'])
            xy.append(x['xy'])

        #self.X_tokenized = TextTokenizer.tokenizeTweets(self.X) #all tweets!
        #print(xy_area)

        # self.X_tokenized = xy_area
        # vectorizer = TfidfVectorizer(tokenizer=TextTokenizer.tokenized, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1)
        # self.X = vectorizer.fit_transform(self.X_tokenized)
        # self.input_length = len(vectorizer.get_feature_names())
        #print(self.X)
        self.X = sequence.pad_sequences(xy)
        self.feature_length = len(self.X[0])
        self.feature_dimensions = len(self.X[0][0])
        print(self.feature_dimensions)
        self.input_length = len(self.X)
        self.Y = to_categorical(self.Y)

    def classify(self, features, classifier=None):
        self.tokenize()

        train_development_split = self.data.amount_train
        development_test_split = self.data.amount_train + self.data.amount_development

        self.X_train = self.X[:train_development_split]
        self.Y_train = self.Y[:train_development_split]

        self.X_development = self.X[
            train_development_split:development_test_split]
        self.Y_development = self.Y[
            train_development_split:development_test_split]

        self.X_test = self.X[development_test_split:]

        if self.data.avoid_skewness:
            Y_train = np.argmax(self.Y_train, axis=1)
            Y_train = [self.labels_dict_rev[int(i)] for i in list(Y_train)]

            self.X_train, self.Y_train = unskewedTrain(self.X_train,
                                                       self.Y_train, Y_train)
            self.X_train = np.array(self.X_train)
            self.Y_train = np.array(self.Y_train)

        ##CHANGE OPTIONS HERE
        self.model = Sequential()
        self.model.add(
            Dense(512,
                  input_shape=(self.feature_length, self.feature_dimensions)))
        self.model.add(Flatten())
        self.model.add(Dense(6))
        self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.2))
        # # self.model.add(Dense(128))
        # self.model.add(Activation('relu'))
        # self.model.add(Dropout(0.1))
        # self.model.add(Dense(6, input_dim=self.feature_length,)))
        # self.model.add(Activation('sigmoid'))

        self.model.compile(loss='categorical_crossentropy',
                           optimizer='adam',
                           metrics=['accuracy'])

        # Train the model
        self.printer = Printer('Model Fitting', self.show_fitting)
        self.model.fit(self.X_train,
                       self.Y_train,
                       epochs=50,
                       batch_size=128,
                       validation_split=0.2)
        self.printer.duration()

    def evaluate(self):
        self.Y_development_predicted = self.model.predict(self.X_development)

        self.Y_development_predicted = np.argmax(self.Y_development_predicted,
                                                 axis=1)
        self.Y_development_predicted = [
            self.labels_dict_rev[int(i)]
            for i in list(self.Y_development_predicted)
        ]

        self.Y_development = np.argmax(self.Y_development, axis=1)
        self.Y_development = [
            self.labels_dict_rev[int(i)] for i in list(self.Y_development)
        ]

        self.accuracy, self.precision, self.recall, self.f1score = classificationMetrics(
            self.Y_development, self.Y_development_predicted, self.labels)

    def printBasicEvaluation(self):
        self.printer.evaluation(self.accuracy, self.precision, self.recall,
                                self.f1score, "Basic Evaluation")

    def printClassEvaluation(self):
        self.printer.classEvaluation(self.Y_development,
                                     self.Y_development_predicted, self.labels)

    def printDataInformation(self):

        print('\n~~~Neural Network Distribution~~~\n')
        print('Found {} unique tokens.'.format(len(self.tokenizer.word_index)))
        print('Shape of data tensor: {}'.format(self.X.shape))
        print('Shape of label tensor: {}\n'.format(self.Y.shape))

        if len(self.word_embeddings_index) > 0:
            print('Found {} word vectors.'.format(
                len(self.word_embeddings_index)))

    def createWordEmbeddings(self):
        self.word_embeddings_index = {}
        f = open(self.word_embeddings_file, encoding="utf8")
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.word_embeddings_index[word] = coefs
        f.close()

        self.word_embeddings_matrix = np.zeros(
            (len(self.tokenizer.word_index) + 1, self.word_embeddings_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = self.word_embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.word_embeddings_matrix[i] = embedding_vector

        self.word_embeddings_layer = Embedding(
            len(self.tokenizer.word_index) + 1,
            self.word_embeddings_dim,
            mask_zero=True,
            weights=[self.word_embeddings_matrix],
            trainable=True)

        writeWordEmbeddings(self.word_embeddings_layer,
                            self.word_embeddings_index, self.data.languages,
                            self.data.response_variable)
Example #19
class KFoldValidation:
    accuracy = []
    precision = []
    recall = []
    f1score = []

    def __init__(self, k, method, data, features, predict_method,
                 new_classifier, print_details, show_fitting):
        self.k = k
        self.kf = KFold(n_splits=self.k)

        self.print_details = print_details
        self.show_fitting = show_fitting

        self.printer = Printer(str(self.k) + '-Fold validation')

        self.method = method
        self.data = data
        self.features = features
        self.predict_method = predict_method
        self.new_classifier = new_classifier

        self.validation()

    def validation(self):
        i = 0
        for train_index, test_index in self.kf.split(
                self.data.X[:(self.data.amount_train +
                              self.data.amount_development)]):
            i += 1

            if self.print_details >= 4:
                n_printer = Printer(str(self.k) + '-Fold, Run: ' + str(i))
            X_train, X_development = list(np.array(
                self.data.X)[train_index]), list(
                    np.array(self.data.X)[test_index])
            Y_train, Y_development = list(np.array(
                self.data.Y)[train_index]), list(
                    np.array(self.data.Y)[test_index])
            self.data.initialize(X_train, Y_train, X_development,
                                 Y_development)

            classifier = selectClassifier(self.method, self.data,
                                          self.predict_method,
                                          self.show_fitting)
            classifier.classify(self.features, self.new_classifier)
            classifier.evaluate()

            self.accuracy.append(classifier.accuracy)
            self.precision.append(classifier.precision)
            self.recall.append(classifier.recall)
            self.f1score.append(classifier.f1score)

            if self.print_details >= 5:
                classifier.printBasicEvaluation()

            if self.print_details >= 6:
                classifier.printClassEvaluation()

            if self.print_details >= 7:
                n_printer.confusionMatrix(classifier.Y_development,
                                          classifier.Y_development_predicted,
                                          self.data.labels)
            # writeResults(options.args.predict_languages, classifier.Y_development, classifier.Y_development_predicted, 'development')

            if self.print_details >= 4:
                n_printer.duration()

    def printBasicEvaluation(self):
        self.printer.evaluation(
            avg(self.accuracy), avg(self.precision), avg(self.recall),
            avg(self.f1score),
            str(self.k) + "-Fold Cross Validation Evaluation")

        self.printer.duration()
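
The fold bookkeeping in validation() reduces to the standard KFold pattern; a self-contained sketch with toy arrays (assumed purely for illustration):

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)   # 10 toy samples, 2 features each
Y = np.array([0, 1] * 5)

kf = KFold(n_splits=5)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # The index arrays select the train/development rows for this fold.
    X_train, X_development = X[train_index], X[test_index]
    Y_train, Y_development = Y[train_index], Y[test_index]
    print('Run', i, '- train:', len(X_train), 'development:', len(X_development))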
Example #20
class SVM:
  X_train = []
  Y_train = []
  X_development = []
  Y_development = []
  X_test = []

  Y_predicted = []

  labels = []

  features = []

  def __init__(self, data, show_fitting):

    self.X_train = data.X_train
    self.Y_train = data.Y_train

    self.X_development = data.X_development
    self.Y_development = data.Y_development

    self.X_test = data.X_test

    self.labels = data.labels

    self.show_fitting = show_fitting

  def classify(self, features, classifier=None):
  
    feature_union = ('feats', FeatureUnion(
      features
    ))

    if classifier is None:
      classifier = SGDClassifier(loss='hinge', random_state=42, max_iter=50, tol=None)
      
    self.classifier = Pipeline([
      feature_union,
      ('classifier', classifier)
    ])

    self.printer = Printer('Model Fitting', self.show_fitting)

    #self.X_train, X_none, self.Y_train, Y_none = train_test_split(self.X_train, self.Y_train, test_size=0.2, random_state=42)
    #self.printer.labelDistribution(self.Y_train, '80%')

    self.classifier.fit(self.X_train, self.Y_train)  
    self.printer.duration()

  def evaluate(self):
    if self.X_development:
      self.Y_development_predicted = self.classifier.predict(self.X_development)
    if self.X_test:
      self.Y_test_predicted = self.classifier.predict(self.X_test)

    self.accuracy, self.precision, self.recall, self.f1score = metrics(self.Y_development, self.Y_development_predicted, self.labels)

  def printBasicEvaluation(self):    
    self.printer.evaluation(self.accuracy, self.precision, self.recall, self.f1score, "Basic Evaluation")

  def printClassEvaluation(self):
    self.printer.classEvaluation(self.Y_development, self.Y_development_predicted, self.labels)