Example #1
def binarizelabels(self, labels, nclasses=None):
    if nclasses is None:
        mlb = preprocessing.MultiLabelBinarizer()
        return mlb.fit_transform(labels)
    # for fit_and_predict: return a binarized matrix over a fixed class range
    mlb = preprocessing.MultiLabelBinarizer(classes=range(nclasses))
    return mlb.fit_transform(labels)
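A minimal sketch (not from the original source) of why the nclasses branch matters: with classes=range(nclasses) the output always has one column per class id, even for classes that never occur in labels.

from sklearn import preprocessing

labels = [[0, 2], [1]]  # toy label sets
# Inferred classes: columns only for labels actually seen -> shape (2, 3)
print(preprocessing.MultiLabelBinarizer().fit_transform(labels))
# Fixed class range: one column per id in 0..4 -> shape (2, 5)
print(preprocessing.MultiLabelBinarizer(classes=list(range(5))).fit_transform(labels))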
def example():
    from sklearn import preprocessing
    lb = preprocessing.LabelBinarizer()
    lb.fit([1, 2, 6, 4, 2])

    print(lb.classes_)
    print(lb.transform([1, 6]))

    #######################################
    lb = preprocessing.MultiLabelBinarizer()
    lb.fit_transform([(1, 2), (3, )])

    print(lb.classes_)
    ########################################
    le = preprocessing.LabelEncoder()
    le.fit([1, 2, 2, 6])

    print(le.classes_)
    print(le.transform([1, 1, 2, 6]))
    print(le.inverse_transform([0, 0, 1, 2]))

    #########################################
    le = preprocessing.LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])

    print(list(le.classes_))
    print(le.transform(["tokyo", "tokyo", "paris"]))
    print(list(le.inverse_transform([2, 2, 1])))
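For reference, these calls mirror the scikit-learn documentation, so the printed values are deterministic; a summary of the expected output:

# Expected output, block by block:
#   LabelBinarizer:       classes_ -> [1 2 4 6]
#                         transform([1, 6]) -> [[1 0 0 0], [0 0 0 1]]
#   MultiLabelBinarizer:  classes_ -> [1 2 3]
#   LabelEncoder (ints):  classes_ -> [1 2 6]; transform -> [0 0 1 2]; inverse -> [1 1 2 6]
#   LabelEncoder (strs):  classes_ -> ['amsterdam', 'paris', 'tokyo']
#                         transform(['tokyo', 'tokyo', 'paris']) -> [2 2 1]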
Example #3
def analyse_tweet_ml(tweet):
    df = pd.read_csv('tweets.csv')
    df.isnull().any()

    message = tweet

    X_train = df.tweet

    y_train = df.label.astype(str)

    lb = preprocessing.MultiLabelBinarizer()
    y_train = lb.fit_transform(y_train)

    X_test = np.array([message])

    # ML Pipeline
    classifier = Pipeline([('vectorizer',
                            CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                           ('tfidf', TfidfTransformer()),
                           ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    print(predicted[0])

    count = 0
    label = ""
    for i in predicted[0]:
        if i == 1:
            label = count
        count = count + 1

    return label
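Note: assuming `lb` is the MultiLabelBinarizer fitted above, the manual column scan can be replaced by the binarizer's own decoder; a minimal sketch:

# Sketch: decode with the fitted binarizer instead of scanning columns.
decoded = lb.inverse_transform(predicted)           # one tuple of labels per sample
last_label = decoded[0][-1] if decoded[0] else ""   # matches the loop's "last positive" result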
Example #4
    def _encode_labels(self):
        """Encodes y labels using sklearn to create allow for string or numeric inputs"""
        mlb = preprocessing.MultiLabelBinarizer()
        mlb.fit(self.y)
        mapping_dict = dict(zip(list(range(0, len(mlb.classes_))), mlb.classes_))

        return mlb, mapping_dict
Example #5
def get_top_dataset():
    lb = preprocessing.MultiLabelBinarizer(top_categories)
    train_data, train_targets = get_top_split_set("train")
    test_data, test_targets = get_top_split_set("test")
    train_targets = lb.fit_transform(train_targets)
    test_targets = lb.transform(test_targets)
    return np.array(train_data), np.array(train_targets), np.array(test_data), np.array(test_targets)
def oneVsRest_LogReg_TfIdf(X_train, X_test, Y_train, Y_test, word_dict, tags_dict, data_files, test_doc_ids ):
  print('Processing : oneVsRest_LogReg_TfIdf')
  print('-'*50)

  Y_original = Y_test
  vectorizer = CountVectorizer(min_df=1, vocabulary=word_dict)
  X_v_train = vectorizer.fit_transform(X_train)
  X_v_test = vectorizer.transform(X_test)  # transform only: reuse the fixed training vocabulary
  transformer = TfidfTransformer(smooth_idf=False)
  X_train_tf = transformer.fit_transform(X_v_train)
  X_test_tf = transformer.transform(X_v_test)  # reuse idf weights fitted on the training data

  uniq_tags_names = list(tags_dict.keys())
  mlb = preprocessing.MultiLabelBinarizer(classes=uniq_tags_names)
  Y_train = mlb.fit_transform(Y_train)
  Y_test = mlb.transform(Y_test)  # classes are fixed, so transform is enough

  classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=0.01))
  classifier.fit(X_train_tf, Y_train)
  score = classifier.score(X_test_tf, Y_test)
  print('-' * 50)
  print('Score oneVsRest_LogReg_TfIdf : {}'.format(score))
  print('-' * 50)
  Y_pred = classifier.predict(X_test_tf)  # predict on tf-idf features, matching training
  Y_back = mlb.inverse_transform(Y_pred)
  write_to_file(Y_original, Y_back, 'oneVsRest_LogREg', score, data_files, test_doc_ids)
Example #7
    def fit(self, X, y, log_run=True):
        """Fit the model to the provided data"""
        y = preprocessing.MultiLabelBinarizer().fit_transform(
            y.reshape(len(y), 1))
        y = y.astype(np.float32)

        kwargs = {}
        if log_run:
            cb = tf.compat.v1.keras.callbacks.TensorBoard(
                log_dir=self.tensor_logdir,
                #histogram_freq=1,
                #write_graph=True,
                #write_grads=True,
                write_images=True,
            )
            kwargs['callbacks'] = [cb]

        history = self.model.fit(
            X,
            y,
            self.batch_size,
            self.n_epoch,
            verbose=2,
            validation_split=0.1,  # XXX
            **kwargs)

        # The history.history dict contains lists of numpy.float64
        # values which don't work well with json. We need to turn them
        # into floats.
        ret = {}
        for k, v in history.history.items():
            ret[k] = [float(x) for x in v]

        return ret
def train(X, y, outpath=None, verbose=True):
    def build(X, y=None):
        """
        Inner build function that builds a single model.
        """
        model = Pipeline([('preprocessor', NLTKPreprocessor()),
                          ('vectorizer', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', OneVsRestClassifier(LinearSVC(C=0.9)))])

        model.fit(X, y)
        return model

    # Label encode the targets
    labels = preprocessing.MultiLabelBinarizer()
    y = labels.fit_transform(y)

    model = build(X, y)
    model.labels_ = labels

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

            if verbose: print("Model written out to {}".format(outpath))

    return model
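A sketch of consuming the saved model (the path is a placeholder, and NLTKPreprocessor must be importable where the pickle is loaded): the fitted binarizer travels with the pipeline as model.labels_, so predictions can be decoded back to the original labels.

import pickle

# Placeholder path; use whatever outpath was passed to train().
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

pred = model.predict(["an example input document"])
print(model.labels_.inverse_transform(pred))   # one tuple of original labels per input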
Example #9
def find_hot_encoders(X, missing_values=None):
    """ Find hot encoders for every feature

    :param X: A numpy matrix of the data. First axis corresponding to instances, second axis corresponding to features
    :param missing_values: The value for missing values.
    :return: Hot encoders to be used for future hot encoding
    """

    X = np.asarray(X)

    new_X = np.zeros((X.shape[0], 0))
    hot_encoders = []

    for i in range(X.shape[1]):

        # Copy a new row and delete the missing values
        new_col = np.copy(X[:, i:i + 1])
        new_col = np.delete(new_col, np.where(new_col == missing_values)[0], axis=0)

        # Find out if data is categorical
        try:
            new_col = new_col.astype(float)
            hot_encoder = None
        except ValueError:

            # Create a hot encoder and use it for fitting and transformation
            hot_encoder = PP.MultiLabelBinarizer()
            new_col = hot_encoder.fit_transform(new_col)

        # Keep a record of the encoder for this column (None for numeric columns)
        hot_encoders.append(hot_encoder)

    return hot_encoders
Example #10
def model_inferring(val_path='./pelvis_only_224_test_hot_nonhardware.npz', model_path='./fx_models/nonhardware_fx_sep/final_dense_model'):
    image_array, id_array, label_array, orig_idx_array = read_npz_hotlabel(val_path)
    model = keras.models.load_model(model_path)
    predictions = model.predict(image_array)

    custom_eval_models(predictions, label_array)

    class_0_preds = predictions[:, 0].reshape(predictions.shape[0], 1)
    non_fx_preds = np.sum(predictions[:, 1:], axis=1)
    non_fx_preds = non_fx_preds.reshape(non_fx_preds.shape[0], 1)
    binary_preds = np.concatenate([class_0_preds, non_fx_preds], axis=1)

    ind_labels = np.argmax(label_array, axis=1)
    binary_labels = np.where(ind_labels != 0, 1, ind_labels)
    binary_labels = preprocessing.MultiLabelBinarizer(np.arange(2)).fit_transform(binary_labels.reshape(binary_labels.shape[0], 1))

    custom_eval_models(binary_preds, binary_labels)

    np.savez('something',
             image_array=image_array,
             id_array=id_array,
             label_array=label_array,
             orig_idx_array=orig_idx_array,
             predictions=predictions,
             binary_preds=binary_preds,
             binary_labels=binary_labels
             )
Example #11
def train(method=0):
    print('command train')
    print('use %s' % CLFS_NAMES[method])

    description_list = []
    tags_list = []

    for line in open('train.data'):
        tmp = line.rstrip('\r\n').split('#$#')
        description = tmp[1]
        tags = tmp[2].rstrip(',').split(
            ',')  # TODO Semantic Web (RDF, OWL, etc.)
        description_list.append(description)
        tags_list.append(tags)

    all_tags = open(
        'allTags.txt').read().splitlines()  # TODO how to fit this ?
    lb = preprocessing.MultiLabelBinarizer()
    binary_tags_list = lb.fit_transform(tags_list)

    X_train = np.array(description_list)
    y_train = binary_tags_list

    clf = Pipeline([('vectorizer', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', OneVsRestClassifier(CLFS[method]))])

    print('train begin')

    clf.fit(X_train, y_train)

    print('train end')

    joblib.dump(clf, 'models/model.pkl')
    joblib.dump(lb, 'models/lb.pkl')
def OVR_Classify(X_train, X_test, Y_train, word_dict, tags_dict, test_tags=None):
    print('Processing : OVR_Classify')
    print('-' * 50)
    from sklearn.feature_extraction.text import TfidfTransformer
    vectorizer = CountVectorizer(min_df=1, vocabulary=word_dict)
    X_v_train = vectorizer.fit_transform(X_train)
    X_v_test = vectorizer.transform(X_test)  # transform only: reuse the fixed vocabulary

    transformer = TfidfTransformer(smooth_idf=False)
    X_train_tf = transformer.fit_transform(X_v_train)
    X_test_tf = transformer.transform(X_v_test)  # reuse idf weights fitted on the training data


    #uniq_tags_names = list(tags_dict.keys())
    mlb = preprocessing.MultiLabelBinarizer(classes=list(tags_dict))
    train_model = mlb.fit_transform(Y_train)
    classifier = OneVsRestClassifier(Perceptron(#loss='hinge',
                                                alpha=1e-3,
                                                penalty='elasticnet',
                                                random_state=999,
                                                #class_weight="balanced",
                                                n_iter=50,
                                                #learning_rate='optimal'
    ))

    classifier.fit(X_train_tf, train_model)
    print('-' * 50)
    #print('Score oneVsRest_SGDC_TfIdf : {}'.format(score))
    print('-' * 50)
    Y_pred = classifier.predict(X_test_tf)
    print(Y_pred)
    Y_back = mlb.inverse_transform(Y_pred)
    print(Y_back)
Example #13
def explicitness_per_factor(mus_train, y_train, mus_test, y_test):
    """Compute explicitness score for a factor as ROC-AUC of a classifier.

  Args:
    mus_train: Representation for training, (num_codes, num_points)-np array.
    y_train: Ground truth factors for training, (num_factors, num_points)-np
      array.
    mus_test: Representation for testing, (num_codes, num_points)-np array.
    y_test: Ground truth factors for testing, (num_factors, num_points)-np
      array.

  Returns:
    roc_train: ROC-AUC score of the classifier on training data.
    roc_test: ROC-AUC score of the classifier on testing data.
  """
    x_train = np.transpose(mus_train)
    x_test = np.transpose(mus_test)
    # CHANGED: Explicitly use the default params from scikit-learn 0.20
    # (solver, multi_class) to avoid warning messages
    clf = linear_model.LogisticRegression(solver='liblinear',
                                          multi_class='ovr').fit(
                                              x_train, y_train)
    y_pred_train = clf.predict_proba(x_train)
    y_pred_test = clf.predict_proba(x_test)
    mlb = preprocessing.MultiLabelBinarizer()
    roc_train = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_train, 1)), y_pred_train)
    roc_test = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_test, 1)), y_pred_test)
    return roc_train, roc_test
Example #14
def explicitness_per_factor(mus_train, y_train, mus_test, y_test):
    """Compute explicitness score for a factor as ROC-AUC of a classifier.

  Args:
    mus_train: Representation for training, (num_codes, num_points)-np array.
    y_train: Ground truth factors for training, (num_factors, num_points)-np
      array.
    mus_test: Representation for testing, (num_codes, num_points)-np array.
    y_test: Ground truth factors for testing, (num_factors, num_points)-np
      array.

  Returns:
    roc_train: ROC-AUC score of the classifier on training data.
    roc_test: ROC-AUC score of the classifier on testing data.
  """
    x_train = np.transpose(mus_train)
    x_test = np.transpose(mus_test)
    clf = linear_model.LogisticRegression().fit(x_train, y_train)
    y_pred_train = clf.predict_proba(x_train)
    y_pred_test = clf.predict_proba(x_test)
    mlb = preprocessing.MultiLabelBinarizer()
    roc_train = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_train, 1)), y_pred_train)
    roc_test = metrics.roc_auc_score(
        mlb.fit_transform(np.expand_dims(y_test, 1)), y_pred_test)
    return roc_train, roc_test
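The np.expand_dims step in the two examples above is what turns a 1-D factor vector into the per-sample singleton label sets MultiLabelBinarizer expects; a minimal self-contained sketch with toy values (not from the original source):

import numpy as np
from sklearn import preprocessing

y = np.array([0, 2, 1, 2])                    # toy factor values
y_onehot = preprocessing.MultiLabelBinarizer().fit_transform(
    np.expand_dims(y, 1))                     # each row becomes a singleton label set
print(y_onehot)                               # [[1 0 0] [0 0 1] [0 1 0] [0 0 1]]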
Example #15
    def __init__(self, domain: Domain) -> None:
        self.intent_enc = preprocessing.LabelEncoder()
        intents = set(domain.acts_params + domain.dstc2_acts_sys)
        self.intent_enc.fit([[x] for x in intents])

        self.slot_enc = preprocessing.MultiLabelBinarizer()
        slots = set(domain.requestable_slots + domain.system_requestable_slots)
        self.slot_enc.fit([[x] for x in slots])
Example #16
def cosine_model(m, classes=None, **cos_kws):

    encode = pre.MultiLabelBinarizer(classes=classes)
    tags = encode.fit_transform(m)

    cos = cosine_similarity(tags.T.dot(tags))
    cos -= np.diag(cos.diagonal())
    return cos
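A hedged usage sketch for cosine_model above (toy tag sets; the imports mirror the aliases the snippet assumes):

import numpy as np
import sklearn.preprocessing as pre
from sklearn.metrics.pairwise import cosine_similarity

m = [("rock", "live"), ("rock",), ("jazz", "live")]  # toy per-item tag sets
sim = cosine_model(m)      # tag-tag similarity from co-occurrence, diagonal zeroed
print(sim.shape)           # (3, 3): one row/column per distinct tag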
Example #17
    def _encode_labels(self):
        """Encodes string or numeric y labels to integers using MultiLabelBinarizer"""
        mlb = preprocessing.MultiLabelBinarizer()
        mlb.fit(self.y)
        mapping_dict = dict(
            zip(list(range(0, len(mlb.classes_))), mlb.classes_))

        return mlb, mapping_dict
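A small self-contained sketch of how the returned pair can be used (toy labels, not from the original class):

from sklearn import preprocessing

mlb = preprocessing.MultiLabelBinarizer()
mlb.fit([["cat", "dog"], ["dog"], ["fish"]])              # toy multilabel targets
mapping_dict = dict(zip(range(len(mlb.classes_)), mlb.classes_))
row = mlb.transform([["cat", "fish"]])[0]                 # one 0/1 row
print([mapping_dict[i] for i in row.nonzero()[0]])        # ['cat', 'fish']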
Example #18
def main():
    y = [[2, 3, 4], [2], [0, 1, 3], [0, 1, 2, 3, 4], [0, 1, 2]]
    y_transformed = preprocessing.MultiLabelBinarizer().fit_transform(y)
    print('y_transformed =\n', y_transformed)

    multiclass_example()

    multioutput_classification_example()
    multioutput_regression_example()
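For reference, the binarized matrix printed above is deterministic (classes 0 through 4, one column each):

# y_transformed =
#  [[0 0 1 1 1]
#   [0 0 1 0 0]
#   [1 1 0 1 0]
#   [1 1 1 1 1]
#   [1 1 1 0 0]]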
Example #19
def generate_tags(df, hashtag_threshold=200):
    mlb = preprocessing.MultiLabelBinarizer(sparse_output=True)
    add_tag = np.vectorize(lambda x: f"hashtag_{x}")
    hashtags = pd.DataFrame(mlb.fit_transform(df.hashtags).toarray(),
                            columns=add_tag(mlb.classes_))
    hashtag_freq = hashtags.sum(axis=0).sort_values(ascending=False)
    hashtags = hashtags.filter(hashtag_freq.iloc[:hashtag_threshold].index)

    return hashtags
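A hedged usage sketch with toy data; generate_tags only assumes a DataFrame whose hashtags column holds iterables of tag strings:

import pandas as pd

df = pd.DataFrame({"hashtags": [["ai", "ml"], ["ml"], ["ml", "ai"], ["news"]]})
tags = generate_tags(df, hashtag_threshold=2)   # keep only the 2 most frequent tags
print(tags.columns.tolist())                    # ['hashtag_ml', 'hashtag_ai']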
Example #20
def trainfunctionclassifier(trees, sents, numproc):
	"""Train a classifier to predict functions tags in trees."""
	from sklearn import linear_model, multiclass, pipeline
	from sklearn import preprocessing, feature_extraction
	from sklearn.model_selection import GridSearchCV
	from sklearn.metrics import make_scorer, jaccard_similarity_score
	vectorizer = pipeline.Pipeline([
			('vectorizer', feature_extraction.DictVectorizer(sparse=True)),
			('scaler', preprocessing.StandardScaler(
				copy=False, with_mean=False))])
	# PTB has no function tags on preterminals; Negra/Tiger/Lassy do.
	posfunc = any(functions(node) for tree in trees
			for node in tree.subtrees()
			if node and isinstance(node[0], int))
	target = [functions(node) for tree in trees
			for node in tree.subtrees()
			if tree is not node and node
				and (posfunc or isinstance(node[0], Tree))]
	# PTB may have multiple tags (or 0) per node.
	# Negra/Tiger/Lassy have exactly 1 tag for every node.
	multi = any(len(a) > 1 for a in target)
	if multi:
		encoder = preprocessing.MultiLabelBinarizer()
	else:
		encoder = preprocessing.LabelEncoder()
		target = [a[0] if a else '--' for a in target]
	# binarize features (output is a sparse array)
	trainfeats = vectorizer.fit_transform(functionfeatures(node, sent)
			for tree, sent in zip(trees, sents)
				for node in tree.subtrees()
				if tree is not node
				and node and (posfunc or isinstance(node[0], Tree)))
	trainfuncs = encoder.fit_transform(target)
	classifier = linear_model.SGDClassifier(
			loss='hinge',
			penalty='elasticnet',
			n_iter=int(10 ** 6 / len(trees)))
	alphas = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]
	if multi:
		classifier = multiclass.OneVsRestClassifier(
				classifier, n_jobs=numproc or -1)
		param_grid = dict(
				estimator__alpha=alphas)
	else:
		param_grid = dict(alpha=alphas)
	classifier = GridSearchCV(estimator=classifier, param_grid=param_grid,
			scoring=make_scorer(jaccard_similarity_score))
	# train classifier
	classifier.fit(trainfeats, trainfuncs)
	msg = ('trained classifier; grid search results:\n%s\n'
			'multi=%r, posfunc=%r; best score on training set: %g %%\n'
			'parameters: %r\nfunction tags: %s' % (
			'\n'.join(str(a) for a in classifier.grid_scores_),
			multi, posfunc, 100.0 * classifier.best_score_,
			classifier.best_estimator_,
			' '.join(str(a) for a in encoder.classes_)))
	return (classifier, vectorizer, encoder, posfunc, multi), msg
Example #21
    def get_encoded_labels(self,
                           training_labels,
                           validation_labels,
                           test_labels,
                           multilabel=False):
        if multilabel:
            training_labels = [
                label.strip().split(self.processing_params.multilabelsplitter)
                for label in training_labels
            ]
            validation_labels = [
                label.strip().split(self.processing_params.multilabelsplitter)
                for label in validation_labels
            ]
            test_labels = [
                label.strip().split(self.processing_params.multilabelsplitter)
                for label in test_labels
            ]

            total_labels = []
            total_labels.extend([
                label for jointlabel in training_labels for label in jointlabel
            ])
            total_labels.extend([
                label for jointlabel in validation_labels
                for label in jointlabel
            ])
            total_labels.extend(
                [label for jointlabel in test_labels for label in jointlabel])
            unique_labels = [np.unique(total_labels)]
            self.label_transform = preprocessing.MultiLabelBinarizer()
        else:
            training_labels = [label.strip() for label in training_labels]
            validation_labels = [label.strip() for label in validation_labels]
            test_labels = [label.strip() for label in test_labels]

            total_labels = training_labels + validation_labels + test_labels
            unique_labels = np.unique(total_labels)
            self.label_transform = preprocessing.LabelEncoder()

        self.label_transform.fit(unique_labels)

        training_labels = self.label_transform.transform(training_labels)
        validation_labels = self.label_transform.transform(validation_labels)
        test_labels = self.label_transform.transform(test_labels)

        if multilabel:
            return training_labels.astype(np.float32), \
              validation_labels.astype(np.float32), \
              test_labels.astype(np.float32), \
              self.label_transform.classes_
        else:
            return training_labels.astype(np.int32), \
              validation_labels.astype(np.int32), \
              test_labels.astype(np.int32), \
              self.label_transform.classes_
Example #22
    def fit(self, X, y):

        # TODO Using 2 classes. Extra argument required if we want
        # this to work with more than 2 classes
        n_classes = 2

        n_examples, n_features = X.shape

        iterations = int(n_examples / self.batch_size)

        total_iterations = self.n_epoch * iterations

        # One column per class value, so it will be easier later to extend this to more classes.
        #y = y.astype(float)
        y = preprocessing.MultiLabelBinarizer().fit_transform(
            y.reshape(len(y), 1))

        # Placeholders for input values.
        self.x = tf.placeholder(tf.float64, [None, n_features], name='x')
        self.y_ = tf.placeholder(tf.float64, [None, n_classes],
                                 name='dataset-y')

        # Variables for computed stuff, we need to initialise them now.
        W = tf.Variable(tf.zeros([n_features, n_classes], dtype=np.float64),
                        name='weights')
        b = tf.Variable(tf.zeros([n_classes], dtype=np.float64), name='bias')

        # Predicted y.
        self.z = tf.matmul(self.x, W) + b
        self.y = tf.nn.softmax(self.z)

        cross_entropy = -tf.reduce_sum(
            self.y_ * tf.log(tf.clip_by_value(self.y, -1.0, 1.0)))
        loss = tf.reduce_mean(cross_entropy)

        # Calculate decay_rate.
        learning_rate = self.calculate_decay_rate(total_iterations)

        train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            loss)

        init = tf.initialize_all_variables()
        self.sess.run(init)

        for e in range(self.n_epoch):
            for i in range(iterations):

                offset = i * self.batch_size
                it_end = offset + self.batch_size
                if it_end > n_examples:
                    it_end = n_examples - 1

                batch_xs = X[offset:it_end]
                batch_ys = y[offset:it_end]
                feed = {self.x: batch_xs, self.y_: batch_ys}
                self.sess.run(train_step, feed_dict=feed)
Example #23
def init_item_item_simmat():
    t1 = time.time() * 1000
    itemids = user_item_mat.columns
    ITEM_ITEM_SIMMAT = pd.DataFrame(0.0,
                                    index=itemids,
                                    columns=itemids,
                                    dtype='f8')
    # Binarize the director field (one-hot via get_dummies)
    movie_data.index = movie_data['movie_id']
    movie_feature_map = pd.get_dummies(movie_data, columns=['director'])
    print('movie_feature_map:', movie_feature_map.info(),
          movie_feature_map.shape)
    # Multi-label binarize the actors field
    L = pd.DataFrame(movie_data['actors'].str.split(
        ' / ', expand=True)).fillna('').values
    import sklearn.preprocessing as sp
    mlb = sp.MultiLabelBinarizer()
    res = pd.DataFrame(mlb.fit_transform(L),
                       columns=mlb.classes_,
                       index=movie_data['movie_id'])
    movie_feature_map2 = pd.concat([movie_feature_map, res], axis=1)
    print('res:', res.info(), res.shape)
    print('movie_feature_map2:', movie_feature_map2.info(),
          movie_feature_map2.shape)

    # Multi-label binarize the genres field
    G = pd.DataFrame(movie_data['genres'].str.split(
        ' / ', expand=True)).fillna('').values
    mlb = sp.MultiLabelBinarizer()
    res = pd.DataFrame(mlb.fit_transform(G),
                       columns=mlb.classes_,
                       index=movie_data['movie_id'])
    movie_feature_map3 = pd.concat([movie_feature_map2, res], axis=1)

    # Compute the correlation coefficient matrix
    movie_feature_map3.drop(movie_feature_map3.columns[np.arange(
        len(MOVIE_DATA_COLUMNS))],
                            axis=1,
                            inplace=True)
    print(movie_feature_map3.shape)
    ITEM_ITEM_SIMMAT = movie_feature_map3.T.corr()
    return ITEM_ITEM_SIMMAT
Example #24
def featureEngineeringOfAmenities(df):
    '''clean the amenities field and convert into list'''
    df['amenities'] = df.apply(lambda x: parse_amenities(x.amenities), axis=1)
    '''One-hot encode the amenities data'''
    '''we cannot use get_dummies here as each row has a list of amenities, so we use MultiLabelBinarizer'''
    mlb = preprocessing.MultiLabelBinarizer()
    amenities = pandas.DataFrame(mlb.fit_transform(df['amenities']),
                                 columns=mlb.classes_,
                                 index=df.index)
    amenities = amenities.drop([
        'translation missing: en.hosting_amenity_49',
        'translation missing: en.hosting_amenity_50'
    ],
                               axis=1)
    '''check correlation between amenities'''
    cor_amn = pandas.DataFrame(amenities.corr())
    for col in cor_amn.columns:
        cor_amn.loc[col, col] = numpy.nan
    high_cor = cor_amn.where(cor_amn.abs().gt(.8))
    high_cor = high_cor.dropna(axis=1, how='all')
    high_cor = high_cor.dropna(axis=0, how='all')
    '''highly correlated with bathroom essentials, so remove them'''
    amenities = amenities.drop(
        ['Bath towel', 'Bedroom comforts', 'Body soap', 'Toilet paper'],
        axis=1)
    '''highly correlated with cooking basics, so remove them'''
    amenities = amenities.drop([
        'Dishes and silverware', 'Oven', 'Refrigerator', 'Stove', 'Microwave'
    ],
                               axis=1)
    '''highly correlated with self check-in, so remove them'''
    amenities = amenities.drop(['Lockbox'], axis=1)
    '''highly correlated with toilet, so remove'''
    amenities = amenities.drop(['Wide clearance to shower'], axis=1)
    '''delete original amenities column'''
    df = df.drop(['amenities'], axis=1)
    '''merge amenities with original data'''
    df = pandas.DataFrame(pandas.concat([df, amenities], axis=1))
    '''remove amenities which are most common or most uncommon'''
    amenities_dist = dict()
    unbalanced_amenities = list()
    for i in amenities.columns:
        freq = df[i].sum().item()
        amenities_dist.update({i: freq})
        if (freq < 1500 or freq > 70000):
            unbalanced_amenities.append(i)
    '''sort by most common'''
    amenities_dist = dict(
        sorted(amenities_dist.items(),
               key=operator.itemgetter(1),
               reverse=True))
    '''get rid of amenities which have less than 3% of 0's or 1's in each column'''
    df = df.drop(unbalanced_amenities, axis=1)
    return (df)
Example #25
def pandasToTensor(data, globalVocab):

    data = shuffle(data)

    # # Preprocessing data
    # # retain only text that contain less that 70 tokens to avoid too much padding
    data["token_size"] = data["text"].apply(lambda x: len(x.split(' ')))
    data = data.loc[data['token_size'] < 70].copy()

    # # sampling
    # data = data.sample(n=50000);

    # # construct vocab and indexing
    # inputs = construct.ConstructVocab(data["text"].values.tolist())

    # print(globalVocab.vocab[0:10])

    input_tensor = [[globalVocab.word2idx[s] for s in es.split(' ')]
                    for es in data["text"].values.tolist()]

    # examples of what is in the input tensors
    # print(input_tensor[0:2])

    # calculate the max_length of input tensor
    max_length_inp = util.max_length(input_tensor)
    # print(max_length_inp)

    # inplace padding
    input_tensor = [
        util.pad_sequences(x, max_length_inp) for x in input_tensor
    ]
    # print(input_tensor[0:2])

    # Binarization
    emotions = list(emotion_dict.values())
    num_emotions = len(emotion_dict)
    # print(emotions)
    # binarizer
    mlb = preprocessing.MultiLabelBinarizer(classes=emotions)
    data_labels = [emos for emos in data[['emotions']].values]
    # print(data_labels)
    bin_emotions = mlb.fit_transform(data_labels)
    target_tensor = np.array(bin_emotions.tolist())

    # print(target_tensor[0:2])
    # print(data[0:2])

    get_emotion = lambda t: np.argmax(t)

    get_emotion(target_tensor[0])
    emotion_dict[get_emotion(target_tensor[0])]

    return input_tensor, target_tensor
Example #26
def load_scale_data(file_path, multilabeltf=False):
    X, y = load_svmlight_file(file_path, multilabel=multilabeltf)
    X = X.toarray()
    # print X[:, 0]
    # print X[:, 10]
    # print X[:, 21]
    # X = preprocessing.scale(X)
    # min_max_scaler = preprocessing.MinMaxScaler()
    # X = min_max_scaler.fit_transform(X_dentise)
    if multilabeltf:
        y = preprocessing.MultiLabelBinarizer().fit_transform(y)
    return (X, y)
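A usage sketch; the path is a placeholder, and the file must be in multilabel svmlight format (comma-separated labels at the start of each line):

# Hypothetical call; 'train.svm' is a placeholder path.
X, y = load_scale_data('train.svm', multilabeltf=True)
print(X.shape, y.shape)   # y becomes a dense 0/1 indicator matrix after binarization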
Example #27
def prepare_one_hot_encoder(img_list, label_csv_path):
    with open(img_list, "r") as f:
        tmp_list = [i.strip() for i in f.readlines()]
    y = []
    meta_data = pd.read_csv(label_csv_path)
    for pid in tmp_list:
        labels = meta_data.loc[meta_data["Image Index"] == pid, "Finding Labels"]
        tmp = labels.tolist()[0].split("|")
        y.append(tmp)
    encoder = preprocessing.MultiLabelBinarizer()
    encoder.fit(y)
    return encoder
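A usage sketch; both paths are placeholders, and the finding names are assumed to occur in the CSV's Finding Labels column:

# Hypothetical paths and label names.
encoder = prepare_one_hot_encoder('val_list.txt', 'labels.csv')
print(encoder.classes_)                                    # all distinct findings
print(encoder.transform([["Cardiomegaly", "Effusion"]]))   # one 0/1 row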
Example #28
def prepareTraining(X_train,
                    Y_train,
                    wl,
                    savgol=False,
                    msc=False,
                    rdp=False,
                    justbymoda=True,
                    plot=False):
    """
    fonction qui permet de préparer les données d'entrainement pour la regression pls. En utilisant ou non
    un pré-traitement, Binarise les classes en fonction du choix fait.
    :param X_train: liste des reflectances associées à chaque échantillon
    :param Y_train: liste de classes associées à chaque échantillon
    :param wl: longueurs d'ondes
    :param savgol: booléen si on utilise le pré-traitement Savitzky Golay
    :param msc: booléen si on utilise le pré-traitement MSC
    :param rdp: booléen si on utilise le pré-traitement RDP
    :param justbymoda: booléen on sépare les classes par moda, ou par moda,feuille,das
    :param plot: booléen si on veut un affichage ou non
    :return: une nouvelle liste X de reflectances, une nouvelle liste Y de classes (binaire), la référence ref des
    données passées par MSC (None si msc=False), les longueurs d'ondes wlrdp passées par RDP (=wl si rdp=False)
    """

    #print("prepareTraining: "+str(len(X_train))+" "+str(len(X_train[0])))

    X_train = X_train.astype('float32')
    if savgol is True:
        X_train = pre_traitement.lissageSavitzky(X_train, wl)
    ref = None
    if msc is True:
        X_train, ref = pre_traitement.compute_msc(X_train, wl, plot=plot)
    wlrdp = wl
    if rdp is True:
        wlrdp, X_train = pre_traitement.computeRdpForData(wl,
                                                          X_train,
                                                          Y_train,
                                                          justbymoda,
                                                          plot=plot)

    X = getDatatoDataframe(X_train)

    # binarize the classes
    lb = preprocessing.MultiLabelBinarizer()
    if justbymoda is True:
        lb = preprocessing.LabelBinarizer()
    Y = lb.fit_transform(Y_train)
    oneShotDictionary = {}
    for i in range(0, Y.shape[0]):
        #        print("Binarized labels training: "+str(Y[i])+" - "+str(Y_train[i])+" -> "+str(str(Y[i]) in oneShotDictionary))
        if (str(Y[i]) in oneShotDictionary) is False:
            oneShotDictionary[str(Y[i])] = Y_train[i]
    return oneShotDictionary, X, Y, ref, wlrdp
Example #29
def _labels_encoder(self):
    """
    Prepare the label encoder mapping string labels to digits.
    """
    pd_meta = self._load_fold_list(fold=1, data_split=self.DataSplit.train)
    labels_list = pd_meta[self.MetaCol.LAB].astype(str)
    if self.is_multilabel:
        le = sk_proc.MultiLabelBinarizer()
        labels_list = labels_list.str.split(self.LABEL_SEPARATOR)
    else:
        le = sk_proc.LabelEncoder()
    le.fit(labels_list)
    return le
Example #30
def ro_gt(start_time, end_time, feature_length):
    start_time = dt.strptime(start_time, "%Y-%m-%d")
    end_time = dt.strptime(end_time, "%Y-%m-%d")
    fl = str(feature_length) + "s"
    timestamps = pd.date_range(start_time, end_time, freq=fl)
    # pd.DataFrame.from_csv was removed from pandas; read_csv with the same defaults.
    dred_df = pd.read_csv('../dataset/Occupancy_data_split.csv',
                          index_col=0, parse_dates=True)
    # Group into feature-length windows; pd.Grouper replaces the removed pd.TimeGrouper.
    dred_fl = dred_df.groupby(pd.Grouper(freq=fl))[u'room'].apply(set).apply(list)
    dred_fl = dred_fl.apply(lambda x: float('NaN') if len(x) == 0 else x).dropna()
    mlb = preprocessing.MultiLabelBinarizer()
    dred_bin = mlb.fit_transform(dred_fl)
    dred_bin_df = pd.DataFrame(data=dred_bin, columns=list(mlb.classes_),
                               index=dred_fl.index)
    return dred_bin_df.loc[timestamps].dropna()