Example #1
File: main.py Project: rabyj/IFT712
def main():
    """Do something with the project code! Have fun :) """

    preprocessor = Preprocessor()
    preprocessor.import_labeled_data("data/train.csv")

    X_total, t_total = preprocessor.encode_labels(use_new_encoder=True)
    X_train, X_test, t_train, t_test = preprocessor.train_test_split(
        X_total, t_total)

    # transform data and overwrite non-transformed data
    X_train_scaled = preprocessor.scale_data(X_train, use_new_scaler=True)
    X_test_scaled = preprocessor.scale_data(X_test, use_new_scaler=False)

    # apply PCA
    X_train = preprocessor.apply_pca(X_train_scaled,
                                     use_new_pca=True,
                                     n_components=27,
                                     whiten=False)
    X_test = preprocessor.apply_pca(X_test_scaled, use_new_pca=False)

    clf = NaiveBayes(X_train, t_train)
    clf.optimize_hyperparameters()
    clf.display_general_validation_results()
    clf.display_cv_results()
    print("Test accuracy : {:.03f}".format(clf.get_accuracy(X_test, t_test)))
    print("Test f1-score : {:.03f}".format(clf.get_f1_score(X_test, t_test)))

    label_predictions = make_new_predictions("data/test.csv", preprocessor,
                                             clf)
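
Note: example #1 fits the scaler and the PCA on the training split only (use_new_scaler=True / use_new_pca=True), then reuses the fitted state on the test split. A minimal self-contained sketch of that discipline, with scikit-learn's StandardScaler and PCA standing in for the project's Preprocessor:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Toy data standing in for the project's CSV.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
t = rng.integers(0, 2, size=200)

X_train, X_test, t_train, t_test = train_test_split(X, t, random_state=0)

# Fit on the training split only, then reuse the fitted transformers on
# the test split -- fitting them on test data would leak information.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA(n_components=5, whiten=False).fit(X_train_scaled)
X_train_reduced = pca.transform(X_train_scaled)
X_test_reduced = pca.transform(X_test_scaled)
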
Example #2
def main():
    # Directory where this script is located
    dirname = os.path.dirname(__file__)

    # List of terms to be ignored by the tokenizer
    ignore_terms = []

    # Collect the terms we want to ignore
    for ignore_file_name in IGNORE_TERMS_FILE_NAMES:
        with open(os.path.join(dirname, ignore_file_name)) as file:
            ignore_terms.extend(term.strip() for term in file)
            
    # Create our custom tokenizer, it receives the terms we want to ignore
    preprocessor = Preprocessor(word_chars='a-zA-Z0-9', inter_chars="'",
                                min_length=3, ignore=ignore_terms)
    
    for line in sys.stdin:
        bug_report = json.loads(line)
        old_title = bug_report['title']
        old_description = bug_report['description']

        bug_report['title'] = ' '.join(preprocessor.preprocess(old_title))
        bug_report['description'] = ' '.join(
            preprocessor.preprocess(old_description))

        print(json.dumps(bug_report))
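
Note: the loop above treats stdin as JSON Lines, one bug report per line, rewritten in place and re-emitted. A runnable sketch of the same read-transform-write cycle, using an in-memory stream and a lowercasing stand-in for Preprocessor.preprocess:

import io
import json

# Stand-in for sys.stdin; one JSON document per line.
stdin = io.StringIO('{"title": "APP CRASHES", "description": "See LOG"}\n')

for line in stdin:
    bug_report = json.loads(line)
    # Hypothetical preprocessing: tokenize on whitespace and lowercase.
    bug_report['title'] = ' '.join(bug_report['title'].lower().split())
    bug_report['description'] = ' '.join(bug_report['description'].lower().split())
    print(json.dumps(bug_report))
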
Example #3
def predict(name, command):
    command = command.lower()

    label_path = path.join(path.dirname(path.realpath(__file__)), "intents",
                           "config", "labels", "%s_labels.json" % name)
    with open(label_path, encoding="utf8") as f:
        labels = json.load(f)

    word_vocab = Vocabulary()
    word_vocab.load("%s_word_vocab.json" % name)

    #char embedding
    char_vocab = Vocabulary()
    char_vocab.load("%s_char_vocab.json" % name)

    idx2label = dict(enumerate(labels))

    preprocessor = Preprocessor(word_vocab, None, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    model.load_weights('intents/config/weights/%s.hdf5' % name)

    sentence = tokenize(command)
    features = preprocessor.transform([sentence])

    p = model.predict(features)
    predicted_labels = []
    for pred in p:
        predicted_labels.append(idx2label[pred])

    for word, label in zip(sentence, predicted_labels):
        print('%s: %s' % (word, label))
Example #4
    def __init__(self):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation. 
        '''
        self.num_train_samples=0
        self.num_feat=1
        self.num_labels=1
        self.is_trained=False
        '''A few classifiers tested here:'''
        #self.model = clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr')  
        #self.model = clf = GaussianNB()
        #self.model = clf = KNeighborsClassifier()
        #self.model = clf = QuadraticDiscriminantAnalysis()
        #self.model = clf = RandomForestClassifier(n_estimators= 80 , max_depth= 20, max_features= 'sqrt')

 
        #self.model = clf = Pipeline([('preprocessing', Preprocessor()),('classification', MLPClassifier(hidden_layer_sizes=(200,100,50,20),max_iter=1500,solver='adam', learning_rate='invscaling', activation='relu'))])
        #self.model = clf = Pipeline([('SelectKBest', Preprocessor2()),('PCA', Preprocessor()),('classification', MLPClassifier(hidden_layer_sizes=(200,100,50,20),max_iter=1500,solver='adam', learning_rate='invscaling', activation='relu'))])
       
          
        '''Here we test three classifiers and put them in competition.'''
        fancy_classifier1 = MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20),
                                          max_iter=1500, solver='adam',
                                          learning_rate='invscaling',
                                          activation='relu')
        fancy_classifier2 = Pipeline([
            ('preprocessing', Preprocessor()),
            ('classification', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20),
                                             max_iter=1500, solver='adam',
                                             learning_rate='invscaling',
                                             activation='relu'))])
        fancy_classifier3 = Pipeline([
            ('SelectKBest', Preprocessor2()),
            ('PCA', Preprocessor()),
            ('classification', MLPClassifier(hidden_layer_sizes=(200, 100, 50, 20),
                                             max_iter=1500, solver='adam',
                                             learning_rate='invscaling',
                                             activation='relu'))])

        self.model = clf = VotingClassifier(
            estimators=[('Fancy Classifier1', fancy_classifier1),
                        ('Fancy Classifier2', fancy_classifier2),
                        ('Fancy Classifier3', fancy_classifier3)],
            voting='soft')
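
Note: voting='soft' averages the estimators' class probabilities, so every member of the ensemble must implement predict_proba. A minimal runnable demonstration on synthetic data; the estimators below are illustrative stand-ins, not the Preprocessor pipelines from the example:

from sklearn.datasets import make_classification
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=300, random_state=0)

# Soft voting averages predict_proba outputs across the estimators and
# predicts the class with the highest mean probability.
ensemble = VotingClassifier(
    estimators=[('logreg', LogisticRegression(max_iter=1000)),
                ('nb', GaussianNB())],
    voting='soft')
ensemble.fit(X, y)
print(ensemble.predict_proba(X[:3]))
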
Example #5
    class Pipeline(BaseEstimator, TransformerMixin):
        """Feature generation followed by preprocessing, as one transformer."""
        def __init__(self,
                     numeric,
                     id=None,
                     target=None,
                     categorical=None,
                     verbose=0):
            self.created_features = None
            self.id = id
            self.target = target
            self.categorical = categorical
            self.numeric = numeric
            self.verbose = verbose

            self.feature_generator = None
            self.preprocessor = None

        def fit_transform(self, df, y=None, **fit_params):
            with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
                self.feature_generator = FeatureGenerator(
                    id=self.id,
                    numeric=self.numeric,
                    categorical=self.categorical,
                    target=self.target,
                    verbose=self.verbose,
                )
                df_features = self.feature_generator.fit_transform(df)

                self.preprocessor = Preprocessor(
                    id=self.id,
                    numeric=self.numeric,
                    categorical=self.categorical,
                    target=self.target,
                    verbose=self.verbose,
                )
                x = self.preprocessor.fit_transform(df_features)
                return x

        def transform(self, df):
            with Timer('pipelines.Pipeline.transform:', self.verbose):
                if self.feature_generator is None:
                    raise NotFittedError(
                        f'feature_generator = {self.feature_generator}')
                if self.preprocessor is None:
                    raise NotFittedError(f'preprocessor = {self.preprocessor}')

                df_features = self.feature_generator.transform(df)
                x = self.preprocessor.transform(df_features)
                return x

        def fit(self, x, y=None, **fit_params):
            return self

        def get_feature_names(self):
            return self.created_features
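
Note: the class above follows the scikit-learn transformer contract: fit_transform learns state from the training data, transform reuses it, and calling transform before fitting raises NotFittedError. FeatureGenerator, Preprocessor and Timer are project classes; a self-contained analogue assembled from scikit-learn's own building blocks:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.DataFrame({'age': [25, 32, 47], 'city': ['a', 'b', 'a']})

# Scale numeric columns and one-hot encode categorical ones, mirroring
# the numeric/categorical split the custom Pipeline receives.
pipeline = Pipeline([
    ('preprocess', ColumnTransformer([
        ('numeric', StandardScaler(), ['age']),
        ('categorical', OneHotEncoder(), ['city']),
    ])),
])
x_train = pipeline.fit_transform(df)     # learn scaling and categories
x_new = pipeline.transform(df.iloc[:1])  # reuse the fitted state
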
Example #6
    def test_preprocessing(self):
        """Test full preprocessing pipeline."""
        data_path = '../data'
        prepper = Preprocessor(data_path)

        prepper.execute()

        files = os.listdir(data_path + '/')

        assert 'm5.db' in files
Example #7
 def __init__(self, max_comments_per_subreddit=100000):
     self.getter = CommentGetter()
     self.preprocessor = Preprocessor()
     self.max_comments_per_subreddit = max_comments_per_subreddit
     # if load_from_files:
     #     self.dictionary = Dictionary(
     #         wordToIndexDict=pickler.loadData('wordToIndexDict'),
     #         indexToWordDict=pickler.loadData('indexToWordDict'))
     # else:
     #     self.dictionary = Dictionary()
     self.dictionary = Dictionary()
Example #8
def preprocess_data(vocab_size, batch_size, num_workers=0, max_input_len=400,
                    max_target_len=100):
    
    p = Preprocessor(chosen_dataset)

    print('preprocessing started')
    train_set, test_set, validation_set = p.create_data_loaders(vocab_size, batch_size, num_workers=num_workers,
                                                                max_input_len=max_input_len, max_target_len=max_target_len)
    print('preprocessing finished')
    
    return p, train_set, test_set, validation_set
Example #9
def preprocess_signals(data: pd.DataFrame, args: Namespace) -> pd.DataFrame:
    with Cache.configure(args.currency_pair, args.tick_rate):
        signal_strategy = SignalStrategyFactory.get(
            'ma', **signal_strat_argument_parser(args))
        stop_strategy = StoppingStrategyFactory.get(
            args.stopping_strat, **stopping_strat_argument_parser(args))
        preprocessor = Preprocessor(signal_strategy, stop_strategy)
        if args.no_cache:
            return preprocessor.find_signals(data)
        else:
            return preprocessor.get_signals(data)
Example #10
 def test_get_emoji_score(self):
     prep5 = Preprocessor(corpus1, remove_short_tweets=False, verbose_emoji=False)
     self.assertEqual(prep5.get_emoji_score(0)["positive"], 2)
     self.assertEqual(prep5.get_emoji_score(0)["negative"], 0)
     self.assertEqual(prep5.get_emoji_score(1)["positive"], 0)
     self.assertEqual(prep5.get_emoji_score(1)["negative"], 1)
     self.assertEqual(prep5.get_emoji_score(2)["positive"], 0)
     self.assertEqual(prep5.get_emoji_score(2)["negative"], 0)
     prep5 = Preprocessor(corpus1, remove_short_tweets=False, verbose_emoji=True)
     self.assertEqual(prep5.get_emoji_score(0)["positive"], 2)
     self.assertEqual(prep5.get_emoji_score(0)["negative"], 0)
     self.assertEqual(prep5.get_emoji_score(1)["positive"], 0)
     self.assertEqual(prep5.get_emoji_score(1)["negative"], 1)
     self.assertEqual(prep5.get_emoji_score(2)["positive"], 0)
     self.assertEqual(prep5.get_emoji_score(2)["negative"], 0)
Example #11
def test_keras_feature_extractor_extract_features():
    ext = KerasFeatureExtractor(TEST_NET_ID, ckpt_path=TEST_CKPT_PATH)

    ds = Dataset(name=TEST_NAME, prefix=TEST_PREFIX, batch_size=8)
    ds.initialize(fp=TEST_SOURCES)
    ds.load_images()
    imgs = [e.image for e in ds.elements]

    prepro = Preprocessor()

    imgs = prepro.preprocess_images(imgs)

    result = ext.extract_features(images=imgs)
    assert isinstance(result, np.ndarray)
    assert len(result) == ds.count
Example #12
def main():
    args = parse()

    corpus = Preprocessor(args.directory).run()

    lengths: Dict[str, int] = {
        "<50": 0,
        "50-99": 0,
        "100-199": 0,
        "200-399": 0,
        "400+": 0
    }

    for blog, _ in corpus:
        word_count = len(blog)
        if word_count < 50:
            lengths["<50"] += 1
        elif word_count <= 99:
            lengths["50-99"] += 1
        elif word_count <= 199:
            lengths["100-199"] += 1
        elif word_count <= 399:
            lengths["200-399"] += 1
        else:
            lengths["400+"] += 1

    # min, max
    lengths["min"] = min((len(blog) for blog, _ in corpus))
    lengths["max"] = max((len(blog) for blog, _ in corpus))

    with open(os.path.join(args.results, "lengths.json"), "w") as file:
        json.dump(lengths, file, indent=4, ensure_ascii=True)
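
Note: the if/elif ladder is a linear scan over ascending bucket boundaries, so the same binning can be written as a binary search. A runnable sketch with bisect, reusing the bucket names from the example:

import bisect

# Inclusive upper bounds of the closed buckets; anything above the last
# bound falls into the open-ended "400+" bucket.
BOUNDS = [49, 99, 199, 399]
LABELS = ["<50", "50-99", "100-199", "200-399", "400+"]

def bucket(word_count: int) -> str:
    """Return the label of the bucket a word count falls into."""
    return LABELS[bisect.bisect_left(BOUNDS, word_count)]

assert bucket(49) == "<50"
assert bucket(50) == "50-99"
assert bucket(400) == "400+"
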
Example #13
    def __init__(self,
                 classifier=RandomForestClassifier(n_estimators=180,
                                                   max_depth=None,
                                                   max_features='auto')):
        print("CONSTRUCTEUR MODELE")
        self.classifierUsed = classifier

        self.preprocess = Preprocessor()
        #self.clf = classifier
        PipelineUse = Pipeline([('preprocessing', self.preprocess),
                                ('classification', self.classifierUsed)])

        self.clf = VotingClassifier(
            estimators=[
                ('Gradient Tree Boosting', ensemble.GradientBoostingClassifier()),
                ('Pipeline', PipelineUse),
                ('RandomForestClassifier',
                 RandomForestClassifier(n_estimators=180,
                                        max_depth=None,
                                        max_features='auto'))],
            voting='soft')

        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
Example #14
def run(schema_path, name, sample_size, batch_size, epochs):
    dataset = Dataset(schema_path, name)
    labels, data = dataset.get_data()

    X = [x['words'] for x in data]
    y = [x['labels'] for x in data]

    word_vocab = Vocabulary()
    word_vocab.build_vocab([w for command in X for w in command])

    #char embedding
    char_vocab = Vocabulary()
    char_vocab.build_vocab([ch for w in word_vocab for ch in w])

    labels2idx = {label: idx for idx, label in enumerate(labels)}
    idx2label = dict(enumerate(labels))

    preprocessor = Preprocessor(word_vocab, labels2idx, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    trainer = Trainer(model, X, y, preprocessor.transform, split=[0.75, 0.95])

    trainer.train(batch_size, epochs)
    trainer.evaluate(idx2label)

    model.save_weights(name)
    dataset.save(X[:sample_size], labels)
    word_vocab.save("%s_word_vocab.json" % name)
    char_vocab.save("%s_char_vocab.json" % name)
Example #15
    def evaluate_decision(self, model_data):
        """
        Using the results from the model_decision, prompt the user and decide what to do next.
        """
        from preprocessing import Preprocessor
        from feature_selection import FeatureSelector
        from sklearn.pipeline import Pipeline
        
        preprocess = Preprocessor(self)
        feature_select = FeatureSelector(self)
        model_name, model, model_performance = model_data
        steps = [
            ('preprocess', preprocess),
            ('feature_select', feature_select),
            ('clf', model)

        ]
        pipeline = Pipeline(steps) # this is our classifier pipeline that transforms data and makes predictions.

        metric = self.parameters['final_min_performance_metric']
        model_performance = model_performance[metric]
        min_performance = self.parameters['final_min_performance']
        if model_performance > min_performance:
            print('Minimum performance required:', min_performance, metric)
            print('Model performance', model_performance, metric)
            print('The model meets minimum requirements!')
            deploy = input('Type "C" to cancel, or type anything else to save the model: ')
            if deploy.strip().lower() != 'c':
                file_name = input('Enter file name:')
                # save the model so it can be easily loaded next time and used to make predictions.
                self.file_handler.save_model(file_name, pipeline)
Example #16
    def predictFromModel(self):
        try:
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = Data_Getter_Prediction(self.file_object,
                                                 self.log_writer)
            data = data_getter.get_data()
            preprocessor = Preprocessor(self.file_object, self.log_writer)
            is_null_present = preprocessor.is_null_present(data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(
                data)
            data = preprocessor.remove_columns(data, cols_to_drop)

            file_loader = File_operation(self.file_object, self.log_writer)
            model = file_loader.load_model('my_model')

            X, y = preprocessor.separate_label_feature(data, 'Calories')
            result = list(model.predict(X.values))
            result = pd.Series(result, name='Predictions')
            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True,
                          mode='a+')
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction! Error: %s' % ex)
            raise ex

        return path, result.head().to_json(orient="records")
Example #17
    def trainingModel(self):

        self.log_writer.log(self.file_object, 'Start of Training')
        try:

            data_getter = Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()
            preprocessor = Preprocessor(self.file_object, self.log_writer)
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Calories')
            is_null_present = preprocessor.is_null_present(X)
            if (is_null_present):
                X = preprocessor.impute_missing_values(X)
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)
            X = preprocessor.remove_columns(X, cols_to_drop)

            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=1 / 3, random_state=355)
            model_finder = Model_Finder(self.file_object, self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)
            file_op = File_operation(self.file_object, self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name)

            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
Example #18
def prepare_nn_patterns(lemmatizer: Preprocessor):
    try:
        with open('nn_patterns.pickle', 'rb') as handle:
            nn_patterns = pickle.load(handle)
    except (OSError, pickle.UnpicklingError):
        nn_patterns = lemmatizer.lemmatize_all_patterns()
        with open('nn_patterns.pickle', 'wb') as handle:
            pickle.dump(nn_patterns, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return nn_patterns
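
Note: the try/except above is a compute-once pickle cache: load the pickled result if it exists, otherwise compute and store it. A generic runnable version of the same pattern; the cache path and the compute callable are illustrative:

import pickle

def cached(path, compute):
    """Load a pickled result from path, computing and caching it on a miss."""
    try:
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    except (OSError, pickle.UnpicklingError):
        result = compute()
        with open(path, 'wb') as handle:
            pickle.dump(result, handle, protocol=pickle.HIGHEST_PROTOCOL)
        return result

squares = cached('squares.pickle', lambda: [n * n for n in range(10)])
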
Example #19
 def __init__(self, preprocessing_chain_after_squaring,
              preprocessing_chain_after_flattening, preprocessing_options,
              training_set, **kwargs):
     self.preprocessor = Preprocessor(preprocessing_chain_after_squaring,
                                      preprocessing_chain_after_flattening,
                                      preprocessing_options, training_set)
     self.kwargs = kwargs
     self.__dict__.update(kwargs)
     if training_set is not None:
         self.fit(*training_set)
Example #20
 def stem(self,
          text,
          poetic_preprocessing=False,
          remove_tek=False,
          tek_string=None):
     preprocessor = Preprocessor()
     text = preprocessor.compulsory_preprocessing(text)
     if poetic_preprocessing:
         text = preprocessor.poetic_preprocessing(text,
                                                  remove_tek=remove_tek,
                                                  tek_string=tek_string)
     sentences = SentenceTokenizer(text)
     if len(sentences) == 1:
         return self.stem_word(sentences[0])
     else:
         stems = []
         for sentence in sentences:
             stems.append(self.stem(sentence))
         return stems
Example #21
 def __init__(self):
     '''
     This constructor is supposed to initialize data members.
     Use triple quotes for function documentation. 
     '''
     self.num_train_samples = 121499
     self.num_feat = 56  # attributes
     self.num_labels = 1  # classes
     self.is_trained = False
     self.clf = Pipeline([('preprocessor', Preprocessor()),
                          ('class', SGDClassifier())])
Example #22
 def test(self):
     mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
     test_results = self.out_image.eval(
         feed_dict={
             self.x: mnist.test.images,
             self.y_: mnist.test.labels,
             self.keep_prob: 1.0
         })
     combined_images = np.zeros(
         (0, 56))  # Empty array of 'correct' dimensions for concatenation
     for i in range(10):
         test_image = np.array(test_results[i]).reshape((28, 28))
         test_image = self.post_process(test_image)
         actual_image = np.array(mnist.test.images[i]).reshape(
             (28, 28)) * 255
         actual_image = np.rot90(actual_image)
         # Stack output image with actual horizontally, for comparison
         image_column = np.hstack((test_image, actual_image))
         combined_images = np.vstack((combined_images, image_column))
     Preprocessor.displayImage(combined_images)
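
Note: the test assembles a comparison sheet out of 28x28 images: np.hstack puts each reconstructed image next to its original, and np.vstack piles those pairs into one tall image, which is why the accumulator starts with shape (0, 56). A runnable sketch with random arrays in place of MNIST:

import numpy as np

rng = np.random.default_rng(0)
# Width 56 = two 28-pixel images side by side; height 0 so vstack
# can concatenate row blocks onto it.
combined = np.zeros((0, 56))
for _ in range(10):
    output_image = rng.random((28, 28))
    target_image = rng.random((28, 28))
    row = np.hstack((output_image, target_image))  # pair, side by side
    combined = np.vstack((combined, row))          # stack pairs vertically
print(combined.shape)  # (280, 56)
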
Example #23
def preprocess_data_split(split, output_folder):
    # 1. Create dataframes
    dataframes = _get_dataframes(split)
    # 2. Preprocessing pipeline
    processed = Preprocessor().preprocess(dataframes, WORD_FREQ_FILE_PATH)
    # 3. Convert to spacy
    spacy_format = Converter().to_spacy(processed)
    # 4. Save the properly formatted output
    output_file_path = os.path.join(output_folder, '{}_doc'.format(split))
    with open(output_file_path, 'w+') as f:
        json.dump(spacy_format, f)
Example #24
        def fit_transform(self, df, y=None, **fit_params):
            with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
                self.feature_generator = FeatureGenerator(
                    id=self.id,
                    numeric=self.numeric,
                    categorical=self.categorical,
                    target=self.target,
                    verbose=self.verbose,
                )
                df_features = self.feature_generator.fit_transform(df)

                self.preprocessor = Preprocessor(
                    id=self.id,
                    numeric=self.numeric,
                    categorical=self.categorical,
                    target=self.target,
                    verbose=self.verbose,
                )
                x = self.preprocessor.fit_transform(df_features)
                return x
Example #25
    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path
        self.preprocessor = Preprocessor()
        self.trn = pd.DataFrame(columns=Classifier._COLS)  # Read data_frame
        self.tst = pd.DataFrame(columns=Classifier._COLS)  # Read data_frame
        self.trn_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # Known labels
        self.tst_gs = pd.DataFrame(columns=Classifier._GS_COLS)  # Known labels
        self.tok_trn = []
        self.tok_tst = []

        self.feature_extractor = FeatureExtractor()
        self.jaccard = Jaccard()
        self.rfr = RFR()
        self.nn = MLPRegressor(hidden_layer_sizes=(100, 30, 30),
                               validation_fraction=0.3,
                               alpha=0.3,
                               warm_start=False,
                               max_iter=1000,
                               activation='logistic')
Example #26
def generate_intervals():
    # Script to generate a given number of frequency intervals given 
    # a frequent items output from Borgelt
    intervals = 30
    output_folders = []
    for interval, res in enumerate(Preprocessor.triple_intervals('../tmp/observed_frequent_items.out', intervals=intervals)):

        # Triple set of 1/intervals part of the data
        interval_id = 'interval_' + str(intervals) + '_' + str(interval)
        output_folder = cross_validate_disc_version(env.BORGELT_ALGORITHM, env.AOL_MERGED_FILE, sample_pct=-100000, iterations=2, restricted_triples=res, extra_id=interval_id, min_support=-30)
        output_folders.append(output_folder)
    print('output folders:', output_folders)
Example #27
    def __init__(self, learning_rate=1e-4, dataset_name="offices"):
        self.x = tf.placeholder(tf.float32, shape=[None, 304, 228, 3])
        self.y_ = tf.placeholder(tf.float32, shape=[None, 74, 55])
        self.learning_rate = learning_rate
        self.weights = dict()

        self._create_network()
        print("Creating session and intitalizing weights... ")
        self.sess = tf.InteractiveSession()
        self._initialize_weights()
        self.saver = tf.train.Saver(self.weights)
        print("Done initializing session.")

        # Overwrite this when we load model with global step value
        # Increased to 0 when training, so first step is first image in batch
        self.step = -1

        # Load dataset into memory prior to training / testing
        print("Loading dataset and batching unit... ")
        self.pp = Preprocessor(dataset_name, greyscale=False)
        print("Done loading dataset.")
Example #28
def test_search_lanes():
    # img_path = '../test_output_folder/bin_img.jpg'
    img_path = '../test_images/straight_lines1.jpg'
    img = cv2.imread(img_path)
    preprocessor = Preprocessor()
    pimg = preprocessor.preprocess_image(img)
    bimg, _ = color_n_edge_threshold(pimg)
    lanedetector = LaneDetector()

    # search_lanes(img,self.is_first_pass=False,l=l,r=r);

    # plt.subplot(121);
    # plt.title("Original Image")
    # plt.imshow(img[:,:,::-1]);

    # # preprocessed_img = preprocess_image(img);
    # plt.subplot(122);
    plt.title("Visulized Image")
    plt.imshow(bimg, cmap='gray')

    l, r, vis_img = lanedetector.search_lanes(bimg, is_first_pass=True)
Example #29
 def test_reduce_repeated_letters_corpus(self):
     prep6 = Preprocessor(["Teeest test"], remove_short_tweets=False, reduce_chars=True)
     corpus = prep6.preprocessed_corpus()[0]
     self.assertEqual(corpus[0], ["test", "test"])
     prep6 = Preprocessor(["Teeest test"], remove_short_tweets=False, reduce_chars=False)
     corpus = prep6.preprocessed_corpus()[0]
     self.assertEqual(corpus[0], ["teeest", "test"])
Example #30
    def _build(self, classifier_model, X, y=None):
        """
        Inner build function that builds a single model.
        """

        model = Pipeline([
            ('preprocessor', Preprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 2))),
            ('classifier', classifier_model),
        ])
        model.fit(X, y)

        return model
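
Note: passing tokenizer=identity with lowercase=False makes TfidfVectorizer accept documents that are already token lists (the Preprocessor step upstream produces them). A self-contained demonstration; identity is defined inline here since the example imports it from elsewhere:

from sklearn.feature_extraction.text import TfidfVectorizer

def identity(tokens):
    """Pass an already-tokenized document through unchanged."""
    return tokens

# Each document is a list of tokens; the vectorizer must not
# re-tokenize, lowercase, or otherwise preprocess it.
docs = [["Good", "movie"], ["bad", "movie"], ["Good", "plot"]]
vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None,
                             lowercase=False, ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())
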
Example #31
    def train(self):
        print(self.path)
        if self.path == 'data/bank-additional-full.csv':
            data = pd.read_csv('data/bank-additional-full.csv', sep=';')
            data['y'] = data['y'].map({'yes': 1, 'no': 0})
        else:
            data = pd.read_csv(self.path)

        process = Preprocessor(data)
        data, columns = process._divide_data(data, self.label)
        categorical, numerical = process._classify_data(columns, data)
        transformed, y = process._preprocess_data(categorical,
                                                  numerical,
                                                  preprocessor=scaler)

        #dealing with our imbalanced data by oversampling the data set
        model = Modelling()
        x_train, x_test, y_train, y_test = model._splitdata(transformed,
                                                            y,
                                                            size=self.validate)
        X, y = DealwithSample(x_train, y_train, method=sampler)

        model = model.Prediction(X, x_test, y, y_test, method=self.model)
Example #32
def main():
    """
    Invoke this module as a script.
    """
    args = parse_args()
    _set_up_logging(args)

    random.seed(a=args.seed)

    preprocessor = Preprocessor(lowercase=args.lowercase,
                                unknown_label_id=0,
                                ngram_range=args.ngram_range)

    pooling_classifier = PoolingClassifier(max_epochs=args.max_epochs,
                                           validation_interval=args.validation_interval,
                                           validation_metric=args.validation_metric,
                                           early_stop=args.early_stop,
                                           early_stop_patience=args.early_stop_patience,
                                           early_stop_tol=args.early_stop_tol,
                                           word_embedding_size=args.word_embedding_size,
                                           pooling_method=args.pooling_method,
                                           average_dropout=args.average_dropout,
                                           random_state=args.seed,
                                           model_path=args.model_path)

    _, _, _, X, y = preprocessor.preprocess_file(file_path=args.train_file,
                                                 with_labels=True,
                                                 min_seq_length=args.min_seq_length,
                                                 max_seq_length=args.max_seq_length)

    pooling_classifier.fit(X, y)

    logger.info("Preparing test file.")
    # prepare test file entries into X_test
    ids, X_texts, _, X, y = preprocessor.preprocess_file(file_path=args.test_file,
                                                         with_labels=False,
                                                         fit=False)

    logger.info("Predicting test file.")
    # make predictions for entries in test file
    y_probs = pooling_classifier.predict_proba(X)
    y_label_ids = pooling_classifier.predict(X)

    # converting label ids back to string labels
    y_labels = preprocessor.inverse_transform_labels(y_label_ids)

    preprocessor.write_file(file_path=args.model_path + ".test.predictions.csv",
                            ids=ids,
                            X_texts=X_texts,
                            y_probs=y_probs,
                            y_labels=y_labels,
                            verbose=False)
Example #33
    ax2.set_title(filter_name)
    ax2.axis('off')
    ax2.set_adjustable('box-forced')

if __name__ == "__main__":
	# Settings
	box_size = 80
	scale_factor = 0.8
	mask_scale = 0.2
	plot = False

	box_size *= scale_factor
	
	# Load Preprocessor
	print("Preprocessing")
	p = Preprocessor("../images/slum_image.jpg")
	p.scale_image(scale_factor)
	p.exposure_equalization(method="equal")
	p.convert_color("RGB","HSV")
	p.save_current_as("structure")

	p.reset()
	p.scale_image(mask_scale)
	p.exposure_equalization(method="equal")
	p.convert_color("RGB","HSV")
	p.save_current_as("mask")

	# Load images for mask and structure information
	img2 = p.get_version("mask")[:,:,0]
	img = p.get_version("structure")[:,:,2]
Example #34
class Tagger():

    def __init__(self,
                 config_path=None,
                 nb_encoding_layers = 1,
                 nb_dense_dims = 30,
                 batch_size = 100,
                 nb_left_tokens = 2,
                 nb_right_tokens = 2,
                 nb_embedding_dims = 150,
                 model_dir = 'new_model',
                 postcorrect = True,
                 include_token = True,
                 include_context = True,
                 include_lemma = True,
                 include_pos = True,
                 include_morph = True,
                 include_dev = True,
                 include_test = True,
                 nb_filters = 100,
                 filter_length = 3,
                 focus_repr = 'recurrent',
                 dropout_level = .1,
                 load = False,
                 nb_epochs = 15,
                 min_token_freq_emb = 5,
                 halve_lr_at = 10,
                 max_token_len = None,
                 min_lem_cnt = 1,
                 ):
        
        if load:
            if model_dir:
                self.config_path = os.sep.join((model_dir, 'config.txt'))
            else:
                raise ValueError('To load a tagger, you must specify model_dir!')
        else:
            self.config_path = config_path

        if not config_path and not load:
            self.nb_encoding_layers = int(nb_encoding_layers)
            self.nb_dense_dims = int(nb_dense_dims)
            self.batch_size = int(batch_size)
            self.nb_left_tokens = int(nb_left_tokens)
            self.nb_right_tokens = int(nb_right_tokens)
            self.nb_context_tokens = self.nb_left_tokens + self.nb_right_tokens
            self.nb_embedding_dims = int(nb_embedding_dims)
            self.model_dir = model_dir
            self.postcorrect = bool(postcorrect)
            self.nb_filters = int(nb_filters)
            self.filter_length = int(filter_length)
            self.focus_repr = focus_repr
            self.dropout_level = float(dropout_level)
            self.include_token = include_token
            self.include_context = include_context
            self.include_lemma = include_lemma
            self.include_pos = include_pos
            self.include_morph = include_morph
            self.include_dev = include_dev
            self.include_test = include_test
            self.min_token_freq_emb = min_token_freq_emb
            self.nb_epochs = int(nb_epochs)
            self.halve_lr_at = int(halve_lr_at)
            self.max_token_len = int(max_token_len) if max_token_len else None
            self.min_lem_cnt = int(min_lem_cnt)

        else:
            param_dict = utils.get_param_dict(self.config_path)
            print('Using params from config file: ', param_dict)
            self.nb_encoding_layers = int(param_dict['nb_encoding_layers'])
            self.nb_epochs = int(param_dict['nb_epochs'])
            self.nb_dense_dims = int(param_dict['nb_dense_dims'])
            self.batch_size = int(param_dict['batch_size'])
            self.nb_left_tokens = int(param_dict['nb_left_tokens'])
            self.nb_right_tokens = int(param_dict['nb_right_tokens'])
            self.nb_context_tokens = self.nb_left_tokens + self.nb_right_tokens
            self.nb_embedding_dims = int(param_dict['nb_embedding_dims'])
            self.model_dir = param_dict['model_dir']
            self.postcorrect = bool(param_dict['postcorrect'])
            self.nb_filters = int(param_dict['nb_filters'])
            self.filter_length = int(param_dict['filter_length'])
            self.focus_repr = param_dict['focus_repr']
            self.dropout_level = float(param_dict['dropout_level'])
            self.include_token = param_dict['include_token']
            self.include_context = param_dict['include_context']
            self.include_lemma = param_dict['include_lemma']
            self.include_pos = param_dict['include_pos']
            self.include_morph = param_dict['include_morph']
            self.include_dev = param_dict['include_dev']
            self.include_test = param_dict['include_test']
            self.min_token_freq_emb = int(param_dict['min_token_freq_emb'])
            self.halve_lr_at = int(param_dict['halve_lr_at'])
            self.max_token_len = int(param_dict['max_token_len'])
            self.min_lem_cnt = int(param_dict['min_lem_cnt'])
        
        # create a models directory if it isn't there already:
        if not os.path.isdir(self.model_dir):
            os.mkdir(self.model_dir)

        # initialize:
        self.setup = False
        self.curr_nb_epochs = 0

        self.train_tokens, self.dev_tokens, self.test_tokens = None, None, None
        self.train_lemmas, self.dev_lemmas, self.test_lemmas = None, None, None
        self.train_pos, self.dev_pos, self.test_pos = None, None, None
        self.train_morph, self.dev_morph, self.test_morph = None, None, None

        if load:
            self.load()

    def load(self):
        print('Re-loading preprocessor...')
        self.preprocessor = pickle.load(open(os.sep.join((self.model_dir, \
                                    'preprocessor.p')), 'rb'))
        print('Re-loading pretrainer...')
        self.pretrainer = pickle.load(open(os.sep.join((self.model_dir, \
                                    'pretrainer.p')), 'rb'))
        print('Re-building model...')
        self.model = model_from_json(open(os.sep.join((self.model_dir, 'model_architecture.json'))).read())
        self.model.load_weights(os.sep.join((self.model_dir, 'model_weights.hdf5')))

        loss_dict = {}
        idx_cnt = 0
        if self.include_lemma:
            loss_dict['lemma_out'] = 'categorical_crossentropy'
            self.lemma_out_idx = idx_cnt
            idx_cnt += 1
            print('Loading known lemmas...')
            self.known_lemmas = pickle.load(open(os.sep.join((self.model_dir, \
                                    'known_lemmas.p')), 'rb'))

        if self.include_pos:
            loss_dict['pos_out'] = 'categorical_crossentropy'
            self.pos_out_idx = idx_cnt
            idx_cnt += 1
        if self.include_morph:
            self.morph_out_idx = idx_cnt
            idx_cnt += 1
            if self.include_morph == 'label':
                loss_dict['morph_out'] = 'categorical_crossentropy'
            elif self.include_morph == 'multilabel':
                loss_dict['morph_out'] = 'binary_crossentropy'

        self.model.compile(optimizer='adadelta', loss=loss_dict)

    def setup_to_train(self, train_data=None, dev_data=None, test_data=None):
        # create a model directory:
        if os.path.isdir(self.model_dir):
            shutil.rmtree(self.model_dir)
        os.mkdir(self.model_dir)

        self.train_tokens = train_data['token']
        if self.include_test:
            self.test_tokens = test_data['token']
        if self.include_dev:
            self.dev_tokens = dev_data['token']

        idx_cnt = 0
        if self.include_lemma:
            self.lemma_out_idx = idx_cnt
            idx_cnt += 1
            self.train_lemmas = train_data['lemma']
            self.known_lemmas = set(self.train_lemmas)
            if self.include_dev:
                self.dev_lemmas = dev_data['lemma']            
            if self.include_test:
                self.test_lemmas = test_data['lemma']
        if self.include_pos:
            self.pos_out_idx = idx_cnt
            idx_cnt += 1
            self.train_pos = train_data['pos']
            if self.include_dev:
                self.dev_pos = dev_data['pos']
            if self.include_test:
                self.test_pos = test_data['pos']
        if self.include_morph:
            self.morph_out_idx = idx_cnt
            self.train_morph = train_data['morph']
            if self.include_dev:
                self.dev_morph = dev_data['morph']
            if self.include_test:
                self.test_morph = test_data['morph']

        self.preprocessor = Preprocessor().fit(tokens=self.train_tokens,
                                               lemmas=self.train_lemmas,
                                               pos=self.train_pos,
                                               morph=self.train_morph,
                                               include_lemma=self.include_lemma,
                                               include_morph=self.include_morph,
                                               max_token_len=self.max_token_len,
                                               focus_repr=self.focus_repr,
                                               min_lem_cnt=self.min_lem_cnt,
                                               )
        self.pretrainer = Pretrainer(nb_left_tokens=self.nb_left_tokens,
                                     nb_right_tokens=self.nb_right_tokens,
                                     size=self.nb_embedding_dims,
                                     minimum_count=self.min_token_freq_emb)
        self.pretrainer.fit(tokens=self.train_tokens)

        train_transformed = self.preprocessor.transform(tokens=self.train_tokens,
                                               lemmas=self.train_lemmas,
                                               pos=self.train_pos,
                                               morph=self.train_morph)
        if self.include_dev:
            dev_transformed = self.preprocessor.transform(tokens=self.dev_tokens,
                                        lemmas=self.dev_lemmas,
                                        pos=self.dev_pos,
                                        morph=self.dev_morph)
        if self.include_test:
            test_transformed = self.preprocessor.transform(tokens=self.test_tokens,
                                        lemmas=self.test_lemmas,
                                        pos=self.test_pos,
                                        morph=self.test_morph)

        self.train_X_focus = train_transformed['X_focus']
        if self.include_dev:
            self.dev_X_focus = dev_transformed['X_focus']
        if self.include_test:
            self.test_X_focus = test_transformed['X_focus']

        if self.include_lemma:
            self.train_X_lemma = train_transformed['X_lemma']
            if self.include_dev:
                self.dev_X_lemma = dev_transformed['X_lemma']
            if self.include_test:
                self.test_X_lemma = test_transformed['X_lemma']

        if self.include_pos:
            self.train_X_pos = train_transformed['X_pos']
            if self.include_dev:
                self.dev_X_pos = dev_transformed['X_pos']
            if self.include_test:
                self.test_X_pos = test_transformed['X_pos']

        if self.include_morph:
            self.train_X_morph = train_transformed['X_morph']
            if self.include_dev:
                self.dev_X_morph = dev_transformed['X_morph']
            if self.include_test:
                self.test_X_morph = test_transformed['X_morph']

        self.train_contexts = self.pretrainer.transform(tokens=self.train_tokens)
        if self.include_dev:
            self.dev_contexts = self.pretrainer.transform(tokens=self.dev_tokens)
        if self.include_test:
            self.test_contexts = self.pretrainer.transform(tokens=self.test_tokens)
        
        print('Building model...')
        nb_tags = None
        try:
            nb_tags = len(self.preprocessor.pos_encoder.classes_)
        except AttributeError:
            pass
        nb_morph_cats = None
        try:
            nb_morph_cats = self.preprocessor.nb_morph_cats
        except AttributeError:
            pass
        max_token_len, token_char_dict = None, None
        try:
            max_token_len = self.preprocessor.max_token_len
            token_char_dict = self.preprocessor.token_char_dict
        except AttributeError:
            pass
        max_lemma_len, lemma_char_dict = None, None
        try:
            max_lemma_len = self.preprocessor.max_lemma_len
            lemma_char_dict = self.preprocessor.lemma_char_dict
        except AttributeError:
            pass
        nb_lemmas = None
        try:
            nb_lemmas = len(self.preprocessor.lemma_encoder.classes_)
        except AttributeError:
            pass
        self.model = build_model(token_len=max_token_len,
                             token_char_vector_dict=token_char_dict,
                             lemma_len=max_lemma_len,
                             nb_tags=nb_tags,
                             nb_morph_cats=nb_morph_cats,
                             lemma_char_vector_dict=lemma_char_dict,
                             nb_encoding_layers=self.nb_encoding_layers,
                             nb_dense_dims=self.nb_dense_dims,
                             nb_embedding_dims=self.nb_embedding_dims,
                             nb_train_tokens=len(self.pretrainer.train_token_vocab),
                             nb_context_tokens=self.nb_context_tokens,
                             pretrained_embeddings=self.pretrainer.pretrained_embeddings,
                             include_token=self.include_token,
                             include_context=self.include_context,
                             include_lemma=self.include_lemma,
                             include_pos=self.include_pos,
                             include_morph=self.include_morph,
                             nb_filters = self.nb_filters,
                             filter_length = self.filter_length,
                             focus_repr = self.focus_repr,
                             dropout_level = self.dropout_level,
                             nb_lemmas = nb_lemmas,
                            )
        self.save()
        self.setup = True

    def train(self, nb_epochs=None):
        if nb_epochs:
            self.nb_epochs = nb_epochs
        for i in range(self.nb_epochs):
            scores = self.epoch()
        return scores

    def print_stats(self):
        print('Train stats:')
        utils.stats(tokens=self.train_tokens, lemmas=self.train_lemmas, known=self.preprocessor.known_tokens)
        print('Test stats:')
        utils.stats(tokens=self.test_tokens, lemmas=self.test_lemmas, known=self.preprocessor.known_tokens)

    def test(self, multilabel_threshold=0.5):
        if not self.include_test:
            raise ValueError('Please do not call .test() if no test data is available.')

        score_dict = {}

        # get test predictions:
        test_in = {}
        if self.include_token:
            test_in['focus_in'] = self.test_X_focus
        if self.include_context:
            test_in['context_in'] = self.test_contexts

        test_preds = self.model.predict(test_in,
                                batch_size=self.batch_size)

        if isinstance(test_preds, np.ndarray):
            test_preds = [test_preds]

        if self.include_lemma:
            print('::: Test scores (lemmas) :::')
            
            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=test_preds[self.lemma_out_idx])
            if self.postcorrect:
                for i in range(len(pred_lemmas)):
                    if pred_lemmas[i] not in self.known_lemmas:
                        pred_lemmas[i] = min(self.known_lemmas,
                                        key=lambda x: editdistance.eval(x, pred_lemmas[i]))
            score_dict['test_lemma'] = evaluation.single_label_accuracies(gold=self.test_lemmas,
                                                 silver=pred_lemmas,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)

        if self.include_pos:
            print('::: Test scores (pos) :::')
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=test_preds[self.pos_out_idx])
            score_dict['test_pos'] = evaluation.single_label_accuracies(gold=self.test_pos,
                                                 silver=pred_pos,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
        
        if self.include_morph:     
            print('::: Test scores (morph) :::')
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=test_preds[self.morph_out_idx],
                                                                   threshold=multilabel_threshold)
            if self.include_morph == 'label':
                score_dict['test_morph'] = evaluation.single_label_accuracies(gold=self.test_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)                
            elif self.include_morph == 'multilabel':
                score_dict['test_morph'] = evaluation.multilabel_accuracies(gold=self.test_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.test_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
        return score_dict

    def save(self):
        # save architecture:
        json_string = self.model.to_json()
        with open(os.sep.join((self.model_dir, 'model_architecture.json')), 'w') as f:
            f.write(json_string)
        # save weights:
        self.model.save_weights(os.sep.join((self.model_dir, 'model_weights.hdf5')), \
                overwrite=True)
        # save preprocessor:
        with open(os.sep.join((self.model_dir, 'preprocessor.p')), 'wb') as f:
            pickle.dump(self.preprocessor, f)
        # save pretrainer:
        with open(os.sep.join((self.model_dir, 'pretrainer.p')), 'wb') as f:
            pickle.dump(self.pretrainer, f)
        if self.include_lemma:
            # save known lemmas:
            with open(os.sep.join((self.model_dir, 'known_lemmas.p')), 'wb') as f:
                pickle.dump(self.known_lemmas, f)
        # save config file:
        if self.config_path:
            # make sure that we can reproduce parametrization when reloading:
            if not self.config_path == os.sep.join((self.model_dir, 'config.txt')):
                shutil.copy(self.config_path, os.sep.join((self.model_dir, 'config.txt')))
        else:
            with open(os.sep.join((self.model_dir, 'config.txt')), 'w') as F:
                F.write('# Parameter file\n\n[global]\n')
                F.write('nb_encoding_layers = '+str(self.nb_encoding_layers)+'\n')
                F.write('nb_dense_dims = '+str(self.nb_dense_dims)+'\n')
                F.write('batch_size = '+str(self.batch_size)+'\n')
                F.write('nb_left_tokens = '+str(self.nb_left_tokens)+'\n')
                F.write('nb_right_tokens = '+str(self.nb_right_tokens)+'\n')
                F.write('nb_embedding_dims = '+str(self.nb_embedding_dims)+'\n')
                F.write('model_dir = '+str(self.model_dir)+'\n')
                F.write('postcorrect = '+str(self.postcorrect)+'\n')
                F.write('nb_filters = '+str(self.nb_filters)+'\n')
                F.write('filter_length = '+str(self.filter_length)+'\n')
                F.write('focus_repr = '+str(self.focus_repr)+'\n')
                F.write('dropout_level = '+str(self.dropout_level)+'\n')
                F.write('include_token = '+str(self.include_token)+'\n')
                F.write('include_context = '+str(self.include_context)+'\n')
                F.write('include_lemma = '+str(self.include_lemma)+'\n')
                F.write('include_pos = '+str(self.include_pos)+'\n')
                F.write('include_morph = '+str(self.include_morph)+'\n')
                F.write('include_dev = '+str(self.include_dev)+'\n')
                F.write('include_test = '+str(self.include_test)+'\n')
                F.write('nb_epochs = '+str(self.nb_epochs)+'\n')
                F.write('halve_lr_at = '+str(self.halve_lr_at)+'\n')
                F.write('max_token_len = '+str(self.max_token_len)+'\n')
                F.write('min_token_freq_emb = '+str(self.min_token_freq_emb)+'\n')
                F.write('min_lem_cnt = '+str(self.min_lem_cnt)+'\n')
        
        # plot current embeddings:
        if self.include_context:
            layer_dict = dict([(layer.name, layer) for layer in self.model.layers])
            weights = layer_dict['context_embedding'].get_weights()[0]
            X = np.array([weights[self.pretrainer.train_token_vocab.index(w), :] \
                    for w in self.pretrainer.mfi \
                      if w in self.pretrainer.train_token_vocab], dtype='float32')
            # dimension reduction:
            tsne = TSNE(n_components=2)
            coor = tsne.fit_transform(X) # unsparsify
            plt.clf(); sns.set_style('dark')
            plt.rcParams['axes.linewidth'] = 0.4
            fig, ax1 = plt.subplots()
            labels = self.pretrainer.mfi
            # first plot slices:
            x1, x2 = coor[:,0], coor[:,1]
            ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
            # clustering on top (add some colouring):
            clustering = AgglomerativeClustering(linkage='ward',
                            affinity='euclidean', n_clusters=8)
            clustering.fit(coor)
            # add names:
            for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
                ax1.text(x, y, name, ha='center', va="center",
                         color=plt.cm.spectral(cluster_label / 10.),
                         fontdict={'family': 'Arial', 'size': 8})
            # control aesthetics:
            ax1.set_xlabel(''); ax1.set_ylabel('')
            ax1.set_xticklabels([]); ax1.set_xticks([])
            ax1.set_yticklabels([]); ax1.set_yticks([])
            plt.savefig(os.sep.join((self.model_dir, 'embed_after.pdf')),
                        bbox_inches=0)

    def epoch(self, autosave=True):
        if not self.setup:
            raise ValueError('Not set up yet... Call Tagger.setup_to_train() first.')

        # update nb of epochs ran so far:
        self.curr_nb_epochs += 1
        print("-> epoch ", self.curr_nb_epochs, "...")

        if self.curr_nb_epochs and self.halve_lr_at:
            # update learning rate at specific points:
            if self.curr_nb_epochs % self.halve_lr_at == 0:
                old_lr  = self.model.optimizer.lr.get_value()
                new_lr = np.float32(old_lr * 0.5)
                self.model.optimizer.lr.set_value(new_lr)
                print('\t- Lowering learning rate > was:', old_lr, ', now:', new_lr)

        # get inputs and outputs straight:
        train_in, train_out = {}, {}
        if self.include_token:
            train_in['focus_in'] = self.train_X_focus
        if self.include_context:
            train_in['context_in'] = self.train_contexts

        if self.include_lemma:
            train_out['lemma_out'] = self.train_X_lemma
        if self.include_pos:
            train_out['pos_out'] = self.train_X_pos
        if self.include_morph:
            train_out['morph_out'] = self.train_X_morph
        
        self.model.fit(train_in, train_out,
              nb_epoch = 1,
              shuffle = True,
              batch_size = self.batch_size)

        # get train preds:
        train_preds = self.model.predict(train_in,
                                batch_size=self.batch_size)
        if isinstance(train_preds, np.ndarray):
            train_preds = [train_preds]

        if self.include_dev:
            dev_in = {}
            if self.include_token:
                dev_in['focus_in'] = self.dev_X_focus
            if self.include_context:
                dev_in['context_in'] = self.dev_contexts

            dev_preds = self.model.predict(dev_in,
                                    batch_size=self.batch_size)
            if isinstance(dev_preds, np.ndarray):
                dev_preds = [dev_preds]

        score_dict = {}
        if self.include_lemma:
            print('::: Train scores (lemmas) :::')
            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=train_preds[self.lemma_out_idx])
            score_dict['train_lemma'] = evaluation.single_label_accuracies(gold=self.train_lemmas,
                                                 silver=pred_lemmas,
                                                 test_tokens=self.train_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
            if self.include_dev:
                print('::: Dev scores (lemmas) :::')
                pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=dev_preds[self.lemma_out_idx])
                score_dict['dev_lemma'] = evaluation.single_label_accuracies(gold=self.dev_lemmas,
                                                     silver=pred_lemmas,
                                                     test_tokens=self.dev_tokens,
                                                     known_tokens=self.preprocessor.known_tokens)
                
                if self.postcorrect:
                    print('::: Dev scores (lemmas) -> postcorrected :::')
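                    # replace out-of-vocabulary lemma predictions by the
                    # closest known lemma (edit distance):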
                    for i in range(len(pred_lemmas)):
                        if pred_lemmas[i] not in self.known_lemmas:
                            pred_lemmas[i] = min(self.known_lemmas,
                                            key=lambda x: editdistance.eval(x, pred_lemmas[i]))
                    score_dict['dev_lemma_postcorrect'] = evaluation.single_label_accuracies(gold=self.dev_lemmas,
                                                     silver=pred_lemmas,
                                                     test_tokens=self.dev_tokens,
                                                     known_tokens=self.preprocessor.known_tokens)

        if self.include_pos:
            print('::: Train scores (pos) :::')
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=train_preds[self.pos_out_idx])
            score_dict['train_pos'] = evaluation.single_label_accuracies(gold=self.train_pos,
                                                 silver=pred_pos,
                                                 test_tokens=self.train_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
            if self.include_dev:
                print('::: Dev scores (pos) :::')
                pred_pos = self.preprocessor.inverse_transform_pos(predictions=dev_preds[self.pos_out_idx])
                score_dict['dev_pos'] = evaluation.single_label_accuracies(gold=self.dev_pos,
                                                     silver=pred_pos,
                                                     test_tokens=self.dev_tokens,
                                                     known_tokens=self.preprocessor.known_tokens)
        
        if self.include_morph:
            print('::: Train scores (morph) :::')
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=train_preds[self.morph_out_idx])
            if self.include_morph == 'label':
                score_dict['train_morph'] = evaluation.single_label_accuracies(gold=self.train_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.train_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)
            elif self.include_morph == 'multilabel':
                score_dict['train_morph'] = evaluation.multilabel_accuracies(gold=self.train_morph,
                                                 silver=pred_morph,
                                                 test_tokens=self.train_tokens,
                                                 known_tokens=self.preprocessor.known_tokens)


            if self.include_dev:
                print('::: Dev scores (morph) :::')
                pred_morph = self.preprocessor.inverse_transform_morph(predictions=dev_preds[self.morph_out_idx])
                if self.include_morph == 'label':
                    score_dict['dev_morph'] = evaluation.single_label_accuracies(gold=self.dev_morph,
                                                     silver=pred_morph,
                                                     test_tokens=self.dev_tokens,
                                                     known_tokens=self.preprocessor.known_tokens)
                elif self.include_morph == 'multilabel':
                    score_dict['dev_morph'] = evaluation.multilabel_accuracies(gold=self.dev_morph,
                                                     silver=pred_morph,
                                                     test_tokens=self.dev_tokens,
                                                     known_tokens=self.preprocessor.known_tokens)

        if autosave:
            self.save()
        
        return score_dict

    def annotate(self, tokens):
        X_focus = self.preprocessor.transform(tokens=tokens)['X_focus']
        X_context = self.pretrainer.transform(tokens=tokens)
        
        # get predictions:
        new_in = {}
        if self.include_token:
            new_in['focus_in'] = X_focus
        if self.include_context:
            new_in['context_in'] = X_context
        preds = self.model.predict(new_in)

        if isinstance(preds, np.ndarray):
            preds = [preds]
        
        annotation_dict = {'tokens': tokens}
        if self.include_lemma:
            pred_lemmas = self.preprocessor.inverse_transform_lemmas(predictions=preds[self.lemma_out_idx])
            annotation_dict['lemmas'] = pred_lemmas
            if self.postcorrect:
                for i in range(len(pred_lemmas)):
                    if pred_lemmas[i] not in self.known_lemmas:
                        pred_lemmas[i] = min(self.known_lemmas,
                                            key=lambda x: editdistance.eval(x, pred_lemmas[i]))
                annotation_dict['postcorrect_lemmas'] = pred_lemmas

        if self.include_pos:
            pred_pos = self.preprocessor.inverse_transform_pos(predictions=preds[self.pos_out_idx])
            annotation_dict['pos'] = pred_pos
        
        if self.include_morph:
            pred_morph = self.preprocessor.inverse_transform_morph(predictions=preds[self.morph_out_idx])
            annotation_dict['morph'] = pred_morph

        return annotation_dict
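
    # A minimal usage sketch (hypothetical token list; assumes a trained
    # Tagger with lemma output enabled):
    #
    #   annotated = tagger.annotate(tokens=['in', 'principio', 'erat'])
    #   for tok, lem in zip(annotated['tokens'], annotated['lemmas']):
    #       print(tok, '->', lem)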
Exemplo n.º 35
0
    def setup_to_train(self, train_data=None, dev_data=None, test_data=None):
        # create a model directory:
        if os.path.isdir(self.model_dir):
            shutil.rmtree(self.model_dir)
        os.mkdir(self.model_dir)

        self.train_tokens = train_data['token']
        if self.include_test:
            self.test_tokens = test_data['token']
        if self.include_dev:
            self.dev_tokens = dev_data['token']

        idx_cnt = 0
        # default all task targets to None so the fit()/transform() calls
        # below stay valid even when a task is disabled:
        self.train_lemmas = self.dev_lemmas = self.test_lemmas = None
        self.train_pos = self.dev_pos = self.test_pos = None
        self.train_morph = self.dev_morph = self.test_morph = None
        if self.include_lemma:
            self.lemma_out_idx = idx_cnt
            idx_cnt += 1
            self.train_lemmas = train_data['lemma']
            self.known_lemmas = set(self.train_lemmas)
            if self.include_dev:
                self.dev_lemmas = dev_data['lemma']            
            if self.include_test:
                self.test_lemmas = test_data['lemma']
        if self.include_pos:
            self.pos_out_idx = idx_cnt
            idx_cnt += 1
            self.train_pos = train_data['pos']
            if self.include_dev:
                self.dev_pos = dev_data['pos']
            if self.include_test:
                self.test_pos = test_data['pos']
        if self.include_morph:
            self.morph_out_idx = idx_cnt
            idx_cnt += 1
            self.train_morph = train_data['morph']
            if self.include_dev:
                self.dev_morph = dev_data['morph']
            if self.include_test:
                self.test_morph = test_data['morph']

        # fit the preprocessor (task-specific encoders) and the embedding
        # pretrainer on the training data only:
        self.preprocessor = Preprocessor().fit(tokens=self.train_tokens,
                                               lemmas=self.train_lemmas,
                                               pos=self.train_pos,
                                               morph=self.train_morph,
                                               include_lemma=self.include_lemma,
                                               include_morph=self.include_morph,
                                               max_token_len=self.max_token_len,
                                               focus_repr=self.focus_repr,
                                               min_lem_cnt=self.min_lem_cnt,
                                               )
        self.pretrainer = Pretrainer(nb_left_tokens=self.nb_left_tokens,
                                     nb_right_tokens=self.nb_right_tokens,
                                     size=self.nb_embedding_dims,
                                     minimum_count=self.min_token_freq_emb)
        self.pretrainer.fit(tokens=self.train_tokens)

        train_transformed = self.preprocessor.transform(tokens=self.train_tokens,
                                               lemmas=self.train_lemmas,
                                               pos=self.train_pos,
                                               morph=self.train_morph)
        if self.include_dev:
            dev_transformed = self.preprocessor.transform(tokens=self.dev_tokens,
                                        lemmas=self.dev_lemmas,
                                        pos=self.dev_pos,
                                        morph=self.dev_morph)
        if self.include_test:
            test_transformed = self.preprocessor.transform(tokens=self.test_tokens,
                                        lemmas=self.test_lemmas,
                                        pos=self.test_pos,
                                        morph=self.test_morph)

        self.train_X_focus = train_transformed['X_focus']
        if self.include_dev:
            self.dev_X_focus = dev_transformed['X_focus']
        if self.include_test:
            self.test_X_focus = test_transformed['X_focus']

        if self.include_lemma:
            self.train_X_lemma = train_transformed['X_lemma']
            if self.include_dev:
                self.dev_X_lemma = dev_transformed['X_lemma']
            if self.include_test:
                self.test_X_lemma = test_transformed['X_lemma']

        if self.include_pos:
            self.train_X_pos = train_transformed['X_pos']
            if self.include_dev:
                self.dev_X_pos = dev_transformed['X_pos']
            if self.include_test:
                self.test_X_pos = test_transformed['X_pos']

        if self.include_morph:
            self.train_X_morph = train_transformed['X_morph']
            if self.include_dev:
                self.dev_X_morph = dev_transformed['X_morph']
            if self.include_test:
                self.test_X_morph = test_transformed['X_morph']

        self.train_contexts = self.pretrainer.transform(tokens=self.train_tokens)
        if self.include_dev:
            self.dev_contexts = self.pretrainer.transform(tokens=self.dev_tokens)
        if self.include_test:
            self.test_contexts = self.pretrainer.transform(tokens=self.test_tokens)
        
        print('Building model...')
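        # probe the fitted preprocessor for optional, task-specific attributes;
        # tasks that were not enabled simply leave these set to None: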
        nb_tags = None
        try:
            nb_tags = len(self.preprocessor.pos_encoder.classes_)
        except AttributeError:
            pass
        nb_morph_cats = None
        try:
            nb_morph_cats = self.preprocessor.nb_morph_cats
        except AttributeError:
            pass
        max_token_len, token_char_dict = None, None
        try:
            max_token_len = self.preprocessor.max_token_len
            token_char_dict = self.preprocessor.token_char_dict
        except AttributeError:
            pass
        max_lemma_len, lemma_char_dict = None, None
        try:
            max_lemma_len = self.preprocessor.max_lemma_len
            lemma_char_dict = self.preprocessor.lemma_char_dict
        except AttributeError:
            pass
        nb_lemmas = None
        try:
            nb_lemmas = len(self.preprocessor.lemma_encoder.classes_)
        except AttributeError:
            pass
        self.model = build_model(token_len=max_token_len,
                             token_char_vector_dict=token_char_dict,
                             lemma_len=max_lemma_len,
                             nb_tags=nb_tags,
                             nb_morph_cats=nb_morph_cats,
                             lemma_char_vector_dict=lemma_char_dict,
                             nb_encoding_layers=self.nb_encoding_layers,
                             nb_dense_dims=self.nb_dense_dims,
                             nb_embedding_dims=self.nb_embedding_dims,
                             nb_train_tokens=len(self.pretrainer.train_token_vocab),
                             nb_context_tokens=self.nb_context_tokens,
                             pretrained_embeddings=self.pretrainer.pretrained_embeddings,
                             include_token=self.include_token,
                             include_context=self.include_context,
                             include_lemma=self.include_lemma,
                             include_pos=self.include_pos,
                             include_morph=self.include_morph,
                             nb_filters = self.nb_filters,
                             filter_length = self.filter_length,
                             focus_repr = self.focus_repr,
                             dropout_level = self.dropout_level,
                             nb_lemmas = nb_lemmas,
                            )
        self.save()
        self.setup = True
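
    # A minimal end-to-end sketch (hypothetical `train`/`dev`/`test` dicts
    # with 'token', 'lemma', 'pos' and 'morph' keys; `n_epochs` is an
    # assumed setting, not part of this class):
    #
    #   tagger.setup_to_train(train_data=train, dev_data=dev, test_data=test)
    #   for _ in range(n_epochs):
    #       scores = tagger.epoch()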
Exemplo n.º 36
0
	plot = False

	# Init variables
	print("Init all variables")
	coords = np.loadtxt("patch_coordinates.txt", delimiter="\t", skiprows=1)
	coords = np.multiply(coords, scale_factor)
	patches = {'white': coords[:, 0:2],
	           'brown': coords[:, 2:4],
	           'gray': coords[:, 4:6],
	           'green': coords[:, 6:8]}

	box_size *= scale_factor
	
	# Load Preprocessor
	print("Preprocessing")
	p = Preprocessor("../images/slum_image.jpg")
	p.scale_image(scale_factor)
	p.save_current_as("normal")

	p.exposure_equalization(method="contrast")
	p.convert_color("RGB","RGB CIE")
	p.save_current_as("contrast_rgb_cie")

	p.reset()
	p.scale_image(scale_factor)
	p.exposure_equalization(method="equal")
	p.convert_color("RGB","HSV")
	p.save_current_as("structure")
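	# two views of the image: a contrast-equalized RGB CIE version for the
	# colour patches and a histogram-equalized HSV version for structure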

	# ========== Plot img & patches =========
	if plot:
Exemplo n.º 37
0
def plot_intervals(output_folder):
    """
    Given a cross-validation output folder, plot selected triple intervals
    to compare the error of the extrapolation, max-ent and heuristic
    estimates.

    The algorithm runs through each triple interval; for each sampled
    estimate output, the triples in the interval are looked up in each
    sample and the MAPE error is recorded. The average errors are then
    averaged once more and plotted per interval.
    """
    import os
    import matplotlib.pyplot as plt
    from parsers import CVOutputParser
    from preprocessing import Preprocessor
    from utils import avg

    if not output_folder.endswith('/'):
        output_folder += '/'
    intervals = 30
    triple_intervals = Preprocessor.triple_intervals(
        output_folder + 'observed_frequent_items.out', intervals=intervals)
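
    # MAPE for a single estimate, as accumulated below (assuming obs > 0):
    #   abs(obs - est) / float(obs) * 100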

    avg_max_ent_errors = []
    avg_ext_errors = []
    avg_heu_errors = []
    pair_triple_ratios = [i/10. for i in range(11)] # binned ratios [0.0 to 1.0]
    max_ent_ratio_error = [0 for i in range(11)]
    ext_ratio_error = [0 for i in range(11)]

    for index, triple_interval in enumerate(triple_intervals):
        print('Triple interval {} of {}'.format(index, intervals))
        iteration = 0
        MAPE_avg_errors = []
        MAPE_avg_errors_ext = []
        # MAPE_avg_errors_heu = []
        while True:
            max_ent_est_file = output_folder + str(iteration) + '_data.tsv'
            ext_est_file = output_folder + str(iteration) + '_data_extrapolation.tsv'
            # heu_est_file = output_folder + str(iteration) + '_data_heurestic.tsv'
            # read baseline also?
            # Read until we do not find an output file
            if not os.path.exists(max_ent_est_file):
                break

            max_ent_est = CVOutputParser.read_est_obs_file(max_ent_est_file)
            ext_est = CVOutputParser.read_est_obs_file(ext_est_file)
            # heu_est = CVOutputParser.read_est_obs_file(heu_est_file)

            MAPE_errors = []
            MAPE_errors_ext = []
            # MAPE_errors_heu = []

            for triple in triple_interval:
                # Check that the triple has been estimated
                if triple in max_ent_est:

                    # Index 1 holds the observed value parsed from the file;
                    # it is the same for every estimate, so just read it once.
                    obs = max_ent_est[triple][1]

                    # maxent estimate
                    est = max_ent_est[triple][0]

                    # extrapolation estimate
                    est2 = ext_est[triple][0]

                    # independence estimate?

                    # heuristic: use the max-ent estimate when the triple
                    # count in the sample is 0
                    # est4 = heu_est[triple][0]

                    # Index 2 holds the pair/triple ratio; it is the same
                    # for every estimate.
                    ratio = max_ent_est[triple][2]
                    # bin the ratio to one decimal
                    ratio_binned = round(ratio, 1)
                    # add errors to the ratio
                    max_ent_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est-obs) / float(obs)
                    ext_ratio_error[pair_triple_ratios.index(ratio_binned)] += abs(est2-obs) / float(obs)


                    # MAPE error, max-ent estimate:
                    MAPE_errors.append(abs(obs - est) / float(obs) * 100)

                    # MAPE error, extrapolation estimate:
                    MAPE_errors_ext.append(abs(obs - est2) / float(obs) * 100)

                    # MAPE error, independence estimate?

                    # MAPE error, heuristic estimate (disabled above):
                    # MAPE_errors_heu.append(abs(obs - est4) / float(obs) * 100)

                    # MAPE baseline error?
            MAPE_avg_errors.append(avg(MAPE_errors))
            MAPE_avg_errors_ext.append(avg(MAPE_errors_ext))
            # MAPE_avg_errors_heu.append(avg(MAPE_errors_heu))
            iteration += 1

        avg_max_ent_errors.append(avg(MAPE_avg_errors))
        avg_ext_errors.append(avg(MAPE_avg_errors_ext))
        # avg_heu_errors.append(avg(MAPE_avg_errors_heu))
        

    plt.plot(range(len(avg_max_ent_errors)), avg_max_ent_errors, color='blue')
    plt.plot(range(len(avg_ext_errors)), avg_ext_errors, color='red')