Example #2
File: rank.py  Project: UFAL-DSG/tgen
class FeaturesPerceptronRanker(BasePerceptronRanker):
    """Base class for global ranker for whole trees, based on features."""

    def __init__(self, cfg):
        super(FeaturesPerceptronRanker, self).__init__(cfg)
        if not cfg:
            cfg = {}
        self.feats = ['bias: bias']
        self.vectorizer = None
        self.normalizer = None
        self.binarize = cfg.get('binarize', False)
        # initialize feature functions
        if 'features' in cfg:
            self.feats.extend(cfg['features'])
        self.feats = Features(self.feats, cfg.get('intermediate_features', []))

    def _extract_feats(self, tree, da):
        feats = self.vectorizer.transform([self.feats.get_features(tree, {'da': da})])
        if self.normalizer:
            feats = self.normalizer.transform(feats)
        return feats[0]

    def _init_training(self, das_file, ttree_file, data_portion):

        super(FeaturesPerceptronRanker, self)._init_training(das_file, ttree_file, data_portion)

        # precompute training data features
        X = []
        for da, tree in zip(self.train_das, self.train_trees):
            X.append(self.feats.get_features(tree, {'da': da}))
        if self.prune_feats > 1:
            self._prune_features(X)
        # vectorize and binarize or normalize (+train vectorizer/normalizer)
        if self.binarize:
            self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
            self.train_feats = self.vectorizer.fit_transform(X)
        else:
            self.vectorizer = DictVectorizer(sparse=False)
            self.normalizer = StandardScaler(copy=False)
            self.train_feats = self.normalizer.fit_transform(self.vectorizer.fit_transform(X))

        log_info('Features matrix shape: %s' % str(self.train_feats.shape))

    def _prune_features(self, X):
        """Prune features – remove all entries from X that involve features not having a
        specified minimum occurrence count.
        """
        counts = defaultdict(int)
        for inst in X:
            for key in inst.iterkeys():
                counts[key] += 1
        for inst in X:
            for key in inst.keys():
                if counts[key] < self.prune_feats:
                    del inst[key]
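
The binarize_numeric argument above is not part of scikit-learn's stock DictVectorizer, so the class used here is presumably the project's own wrapper. The minimal sketch below reproduces only the vectorize-then-normalize branch of _init_training with plain scikit-learn classes; the feature dictionaries and their keys are invented stand-ins for the output of Features.get_features.

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# invented stand-ins for Features.get_features(tree, {'da': da}) results
X = [{'bias': 1.0, 'depth': 3.0, 'node_count': 7.0},
     {'bias': 1.0, 'depth': 5.0, 'node_count': 12.0}]

vectorizer = DictVectorizer(sparse=False)
normalizer = StandardScaler(copy=False)
train_feats = normalizer.fit_transform(vectorizer.fit_transform(X))
print(train_feats.shape)  # (2, 3)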
Example #3
def train_ctc_model(train_file, test_file):
    """ Function of training Code Recognizer """

    # default training and test files from the config (note: these override the function arguments)
    train_file = parameters_ctc['train_file']
    test_file = parameters_ctc['test_file']
    
    # extract features from two language models trained on Gigaword and StackOverflow
    features = Features(RESOURCES)
    train_tokens, train_features, train_labels = features.get_features(train_file, True)
    test_tokens, test_features, test_labels = features.get_features(test_file, False)
    
    # fastText embedding
    vocab_size, word_to_id, id_to_word, word_to_vec = get_word_dict_pre_embeds(train_file, test_file)
    train_ids, test_ids = get_train_test_word_id(train_file, test_file, word_to_id)

    # word-embedding matrix: random initialization, then overwrite with pretrained fastText vectors
    # (the Gaussian-binning expansion of n-gram probabilities presumably happens inside Features)
    word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (vocab_size, parameters_ctc['word_dim']))
    for word in word_to_vec:
        word_embeds[word_to_id[word]] = word_to_vec[word]
    
    # classifier combining the language-model features with the fastText word embeddings
    ctc_classifier = NeuralClassifier(len(train_features[0]), max(train_labels) + 1, vocab_size, word_embeds)
    ctc_classifier.to(device)

    # optimizer and learning-rate schedule
    optimizer = torch.optim.Adam(ctc_classifier.parameters(), lr=parameters_ctc["LR"])
    step_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)
    
    # prepare dataset
    train_x = Variable(torch.FloatTensor(train_features).to(device))
    train_x_words = Variable(torch.LongTensor(train_ids).to(device))
    train_y = Variable(torch.LongTensor(train_labels).to(device))

    test_x = Variable(torch.FloatTensor(test_features).to(device))
    test_x_words = Variable(torch.LongTensor(test_ids).to(device))
    test_y = Variable(torch.LongTensor(test_labels).to(device))

    # training
    for epoch in range(parameters_ctc['epochs']):
        loss = ctc_classifier.CrossEntropy(train_x, train_x_words, train_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_scores, train_preds = ctc_classifier(train_x, train_x_words)
        test_scores, test_preds = ctc_classifier(test_x, test_x_words)

        eval(test_preds, test_labels, "test")  # project-local evaluation helper (shadows the built-in eval)

    return ctc_classifier, vocab_size, word_to_id, id_to_word, word_to_vec, features
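
For reference, the n-gram probabilities extracted above are typically expanded before being fed to a neural classifier; one common Gaussian-binning formulation maps a scalar probability to k soft bin memberships. The sketch below only illustrates that general technique (bin count and width are arbitrary, and the project's Features class may implement it differently).

import numpy as np

def gaussian_binning(p, k=10, sigma=0.1):
    """Map a scalar in [0, 1] to k soft bin-membership features (illustrative only)."""
    centers = np.linspace(0.0, 1.0, k)                       # evenly spaced bin centers
    feats = np.exp(-(p - centers) ** 2 / (2 * sigma ** 2))   # Gaussian membership of each bin
    return feats / feats.sum()                               # normalize memberships

print(gaussian_binning(0.37).round(3))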
Example #4
def get_data():
    files = os.listdir('./MealNoMealData')
    meal_data_files = []
    no_meal_data_files = []
    for file in files:
        if 'Nomeal' in file:
            no_meal_data_files.append(os.path.join('./MealNoMealData', file))
        else:
            meal_data_files.append(os.path.join('./MealNoMealData', file))

    data = []

    labels = []
    for meal_data_file, no_meal_data_file in zip(meal_data_files,
                                                 no_meal_data_files):

        preprocess_obj = Preprocess(meal_data_file)
        meal_df = preprocess_obj.get_dataframe()
        meal_features = Features(meal_df)
        meal_features.compute_features()
        # temp_meal_features = meal_features.pca_decomposition().tolist()
        temp_meal_features = meal_features.get_features()
        labels += [1] * len(temp_meal_features)

        preprocess_obj_ = Preprocess(no_meal_data_file)
        no_meal_df = preprocess_obj_.get_dataframe()
        no_meal_features = Features(no_meal_df)
        no_meal_features.compute_features()
        no_meal_features_ = no_meal_features.get_features()
        # no_meal_final_features = meal_features.pca.transform(no_meal_features_).tolist()
        no_meal_final_features = no_meal_features_
        labels += [0] * len(no_meal_features_)

        for no_meal_feature in no_meal_final_features:
            temp_meal_features.append(no_meal_feature)

        for meal_no_meal_feature in temp_meal_features:
            data.append(meal_no_meal_feature)

    return data, labels
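
get_data() returns a flat list of numeric feature vectors with parallel 0/1 labels. A possible way to consume them with scikit-learn is sketched below; the split ratio, classifier, and hyperparameters are arbitrary choices for illustration, not part of the original project.

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

data, labels = get_data()
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
print('held-out accuracy:', accuracy_score(y_test, clf.predict(X_test)))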
Example #5
def build_dataset(self, selected_datasets):
    features = Features.get_features(self.data_dir, self.features_type)
    self.datasets = {t: [] for t in self.all_tasks}
    with open(self.filename) as f:
        f.readline()  # skip the first line
        for line in f:
            values = line.strip().split()
            utt_id = values[0]
            dataset = values[1].strip().lower()
            raw_item = {k: v.lower() for k, v in zip(self.data_tasks, values[3:])}
            if utt_id in features:
                if dataset in selected_datasets:
                    self._add_record(raw_item, features[utt_id], dataset)
            else:
                print("Utterance does not have features!!! utt_id: {}".format(utt_id))
Example #6
def lambda_handler(event, context):
    # TODO implement

    json_data = json.loads(event['body'])
    preprocess = Preprocess(json_data=json_data)
    preprocess.scale_points(calculate_scale=False)

    pose_objects = preprocess.new_pose_objects

    features = []

    features_obj = Features(pose_objects=pose_objects)
    features_obj.compute_features()
    features = features_obj.get_features()
    # pca_model = pickle.load(open('pca.pkl', 'rb'))
    # reduced_feature_matrix = pca_model.transform(features)

    s3 = boto3.resource('s3')

    svm_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("SVM_model.pkl").get()
        ['Body'].read())

    logreg_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LogReg_model.pkl").get()
        ['Body'].read())

    lda_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("LDA_model.pkl").get()
        ['Body'].read())

    random_forest_classifier = pickle.loads(
        s3.Bucket("gesture-recognition").Object("RForest_model.pkl").get()
        ['Body'].read())

    prediction_rf = random_forest_classifier.predict(features)
    prediction_svm = svm_classifier.predict(features)
    prediction_lda = lda_classifier.predict(features)
    prediction_logreg = logreg_classifier.predict(features)

    data = {
        "1": prediction_svm[0],
        "2": prediction_logreg[0],
        "3": prediction_lda[0],
        "4": prediction_rf[0]
    }
    return {'statusCode': 200, 'body': json.dumps(data)}
Example #7
def get_prediction(dict_files: Dict[str, str], model: str) -> Dict[str, int]:
    """
    Function to get the prediction of the files
    :parameter dict_files will contain the file paths 
    :parameter model will contain the specified model for prediction
    :attrib pred will contain a dictionary with the prediction
    This function will return the prediction in a dictionary

    """
    pred = {}
    for key, value in dict_files.items():
        print(value)
        data = Features.get_features(value)
        df = pd.DataFrame([data])
        print(f"Predicting for {key}...")
        prediction = predict(df, model)
        pred[key] = prediction
    return pred
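
A hypothetical call to get_prediction; the file identifiers, paths, and model name below are made up for illustration, and predict and Features.get_features are the project-local helpers referenced above.

files = {"sample_01": "/data/input/sample_01.csv", "sample_02": "/data/input/sample_02.csv"}
predictions = get_prediction(files, model="svm")
print(predictions)  # e.g. {"sample_01": 1, "sample_02": 0}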
Example #8
preprocess = Preprocess()
preprocess.scale_points()

pose_objects = preprocess.new_pose_objects

features = []

features_obj = Features(pose_objects=pose_objects)
features_obj.compute_features()
# reduced_feature_matrix = features_obj.compute_pca()

# print(reduced_feature_matrix)
# print(len(reduced_feature_matrix),len(reduced_feature_matrix[0]))

# X = reduced_feature_matrix
X = features_obj.get_features()
Y = [obj.label for obj in pose_objects]

print(len(X), len(Y))
clf_rforest = Classification('RForest', X, Y)
clf_rforest.get_classifier_object()
clf_rforest.get_metrics()
pickle.dump(clf_rforest.get_classifier(), open('RForest_model.pkl', 'wb'))
print()

clf_svm = Classification('svm', X, Y)
clf_svm.get_classifier_object()
clf_svm.get_metrics()
pickle.dump(clf_svm.get_classifier(), open('SVM_model.pkl', 'wb'))
print()
Example #9
import pickle
from preprocessing import Preprocess
from features import Features
import numpy as np
import pandas as pd


test_file_name = input("Please enter the test file name: ")
preprocess_obj = Preprocess(test_file_name)
test_file_dataframe = preprocess_obj.get_dataframe()
test_file_features_obj = Features(test_file_dataframe)
test_file_features_obj.compute_features()
test_file_features = test_file_features_obj.get_features()
# print(len(test_file_features))

# Random Forest
random_forest_clf = pickle.load(open('RForest_model.pkl', 'rb'))
y_pred = random_forest_clf.predict(test_file_features)
print('Saving the output of RandomForest classifier prediction')
rforest_dataframe = pd.DataFrame(y_pred, columns=['Meal/NoMeal'])
rforest_dataframe.to_csv('RForest_output.csv')

# AdaBoost
adaboost_clf = pickle.load(open('Adaboost_model.pkl', 'rb'))
y_pred = adaboost_clf.predict(test_file_features)
print('Saving the output of AdaBoost classifier prediction')
adaboost_dataframe = pd.DataFrame(y_pred, columns=['Meal/NoMeal'])
adaboost_dataframe.to_csv('Adaboost_output.csv')

# XGBoost
XGBoost_clf = pickle.load(open('XGBoost_model.pkl', 'rb'))
y_pred = XGBoost_clf.predict(test_file_features)
print('Saving the output of XGBoost classifier prediction')
xgboost_dataframe = pd.DataFrame(y_pred, columns=['Meal/NoMeal'])
xgboost_dataframe.to_csv('XGBoost_output.csv')
Example #10
class Agent:
    # name should contain only letters, digits, and underscores (not enforced by environment)
    __name = 'Based_Agent'

    def __init__(self, stateDim, actionDim, agentParams):
        self.__stateDim = stateDim
        self.__actionDim = actionDim
        self.__action = np.random.random(actionDim)
        self.__step = 0

        self.__alpha = 0.001
        self.__gamma = 0.9
        self.__decision_every = 6
        self.__explore_probability = 0.2
        self.__max_replay_samples = 20

        self.__features = Features()
        self.__previous_action = None
        self.__current_out = None
        self.__previous_out = None
        self.__previous_meta_state = None
        self.__previous_state = None

        self.__test = agentParams[0] if agentParams else None
        self.__exploit = False

        self.__segments = 2
        self.__actions = 3**self.__segments

        try:
            self.__net = load_model('net')
        except:
            print('Creating new model')
            self.__net = Sequential([
                Dense(50, activation='elu', input_dim=self.__features.dim),
                Dense(30, activation='elu'),
                Dense(self.__actions),
                Reshape((self.__actions, 1))
            ])

        self.__net.compile(optimizer=SGD(lr=self.__alpha), loss='mean_squared_error', sample_weight_mode='temporal')

        try:
            self.__replay = Replay.load('replay')
        except Exception:
            self.__replay = Replay(self.__actions)

        self.__replay_X = []
        self.__replay_Y = []

    def start(self, state):
        self.__previous_state = state

        self.__choose_action(state)

        self.__previous_out = self.__current_out

        return self.__action

    def step(self, reward, state):
        self.__previous_state = state

        self.__step += 1
        if self.__step % self.__decision_every != 0:
            return self.__action

        self.__choose_action(state)

        if not self.__exploit:
            max_q = self.__current_out[np.argmax(self.__current_out)]
            self.__update_q(reward - self.__features.min_dist(state) / 100, max_q)

        self.__previous_out = self.__current_out

        return self.__action

    def end(self, reward):
        if not self.__exploit:
            self.__update_q(reward, reward)
            self.__replay.submit(self.__test, (self.__replay_X, self.__replay_Y), self.__step)
            self.__net.save('net')
            self.__replay.save('replay')

    def cleanup(self):
        pass

    def getName(self):
        return self.__name

    def __choose_action(self, state):
        meta_state = np.asarray(self.__features.get_features(state), dtype='float').reshape((1, self.__features.dim))
        out = self.__net.predict_proba([meta_state], batch_size=1)[0].flatten()

        self.__current_out = out

        if self.__exploit or self.__explore_probability < np.random.random():
            # take best action
            action = np.argmax(out)
        else:
            # take random action
            action = np.random.randint(0, self.__actions)

        self.__previous_action = action
        self.__previous_meta_state = meta_state

        self.__meta_to_action(action)

    def __update_q(self, reward, max_q):
        teach_out = self.__previous_out
        teach_out[self.__previous_action] = reward + self.__gamma * max_q

        # sampling from infinite stream
        if len(self.__replay_X) < self.__max_replay_samples:
            self.__replay_X.append(self.__previous_meta_state)
            self.__replay_Y.append((teach_out[self.__previous_action], self.__previous_action))

        elif np.random.random() < self.__max_replay_samples/self.__step:
            to_replace = np.random.randint(0, self.__max_replay_samples)
            self.__replay_X[to_replace] = self.__previous_meta_state
            self.__replay_Y[to_replace] = (teach_out[self.__previous_action], self.__previous_action)

        self.__net.fit([self.__previous_meta_state], [teach_out.reshape(1, self.__actions, 1)], verbose=0)

        replay_x, replay_y, replay_w = self.__replay.get_training()
        if replay_x:
            data = list(zip(replay_x, replay_y, replay_w))
            np.random.shuffle(data)
            for x, y, w in data:
                self.__net.fit([x], [y], sample_weight=[w], verbose=0)

    def __meta_to_action(self, meta):

        self.__action[:] = 0

        for segment in range(self.__segments):
            segment_action = meta % 3

            muscle_start = 30 * segment // self.__segments
            muscle_stop = 30 * (segment+1) // self.__segments

            if segment_action == 0:
                self.__action[muscle_start:muscle_stop:3] = 1

            if segment_action == 1:
                self.__action[muscle_start+1:muscle_stop:3] = 1

            if segment_action == 2:
                self.__action[muscle_start+2:muscle_stop:3] = 1

            meta //= 3
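
__meta_to_action decodes the scalar action index in base 3, one digit per segment, and each digit selects one of three interleaved muscle groups within that segment's slice of the 30-value action vector. The standalone sketch below reproduces the same decoding outside the class, with a worked example: meta = 5 with two segments gives digit 2 for segment 0 and digit 1 for segment 1.

import numpy as np

def meta_to_action(meta, segments=2, muscles=30):
    # same base-3 decoding as Agent.__meta_to_action, with the three branches folded into one slice
    action = np.zeros(muscles)
    for segment in range(segments):
        segment_action = meta % 3                     # this segment's base-3 digit
        start = muscles * segment // segments
        stop = muscles * (segment + 1) // segments
        action[start + segment_action:stop:3] = 1     # activate every third muscle, offset by the digit
        meta //= 3
    return action

print(np.nonzero(meta_to_action(5))[0])  # [ 2  5  8 11 14 16 19 22 25 28]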