Example #1
def MlKnn_with_Grid_Parameters(X_train, X_test, y_train, y_test):

    X_train = lil_matrix(X_train).toarray()
    y_train = lil_matrix(y_train).toarray()
    X_test = lil_matrix(X_test).toarray()
    y_test = lil_matrix(y_test).toarray()

    print("MlKnn")
    model = MLkNN(k=5, s=0.2).fit(X_train, y_train)

    # Predict once and reuse the result for every metric
    predictions = model.predict(X_test)
    hamming = hamming_loss(y_test, predictions)
    Subset_Accuracy = accuracy_score(y_test, predictions)
    Precision = precision_score(y_test, predictions, average="micro")
    Recall = recall_score(y_test, predictions, average='micro')
    f1 = f1_score(y_test, predictions, average='micro')

    print("Hamming: " + str(hamming))
    print("Subset Accuracy: " + str(Subset_Accuracy))
    print("Precision: " + str(Precision))
    print("Recall: " + str(Recall))
    print("F1 score: " + str(f1))

    print("\n")

    return hamming, Subset_Accuracy, Precision, Recall, f1
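The function above leaves out its imports. Judging from the names it uses, a header along these lines is assumed (lil_matrix from SciPy, MLkNN from scikit-multilearn, and the scikit-learn metrics):

from scipy.sparse import lil_matrix
from skmultilearn.adapt import MLkNN
from sklearn.metrics import (hamming_loss, accuracy_score, precision_score,
                             recall_score, f1_score)
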
def main():
    data = readData("IMDB-Movie-Data.csv")
    genres = data["Genre"]
    descriptions = data["Description"]
    labels = getLabels(genres)
    calculateNgrams(descriptions)

    features = list(map(extract_features, descriptions))
    print(len(features[1]))
    # X = features
    # Y = Labels
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.33,
                                                        random_state=42)
    #binRel(X_train, X_test, y_test, y_train)
    classifier = MLkNN(k=4)
    # Train
    classifier.fit(X_train, y_train)
    #predict
    #print X_test
    predictions = classifier.predict(np.array(X_test))
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))  #(y_true, y_pred)
    '''
def adapted(data):

    classifier = MLkNN(k=20)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracyScore = accuracy_score(y_test, predictions)
    return None
    '''
Example #4
def get_cado_predictions():
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'

    data = du.load_data(data_path)
    test = du.load_data(test_path)

    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]

    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]

    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')
    #Y = np.array(binary_labels, dtype='int')

    test_index = len(X)

    X = X + X_test
    Y = np.vstack([Y, y_test])

    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index

    sequences = tokenizer.texts_to_sequences(X)

    X = pad_sequences(sequences,
                      maxlen=700,
                      padding="post",
                      truncating="post",
                      value=0)

    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, 1))

    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1

    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)
    y_pred = predictions.toarray()
    y_score = scores.toarray()

    return y_pred, y_score
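A usage sketch for get_cado_predictions() (not part of the original): it recovers the ground-truth test labels with the same du.load_data helper and column layout the function uses internally, so the returned predictions can be scored.

from sklearn.metrics import f1_score, hamming_loss

y_pred, y_score = get_cado_predictions()

# Rebuild the test labels exactly as the function does (12 label columns starting at index 7)
test = du.load_data('../../datasets/cado/test.csv')
y_true = np.array([d[7:7 + 12] for d in test], dtype='int')

print('micro-F1:', f1_score(y_true, y_pred, average='micro'))
print('Hamming loss:', hamming_loss(y_true, y_pred))
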
Example #5
def mlknn(x_tr, y_tr, x_te, x_va=None):
    """
    ML-kNN: fit on the training split and return dense label predictions.
    :param x_tr: training features
    :param y_tr: training label matrix
    :param x_te: test features
    :param x_va: optional validation features
    :return: dense predictions for x_te (and for x_va when it is given)
    """
    pred = MLkNN(k=10, s=1.0)  # s is the Bayesian smoothing parameter (a float)
    y_tr = np.int32(y_tr)
    pred.fit(x_tr, y_tr)

    if x_va is None:
        y_te_ = pred.predict(x_te).toarray()
        return y_te_
    else:
        y_te_ = pred.predict(x_te).toarray()
        y_va_ = pred.predict(x_va).toarray()
        return y_te_, y_va_
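A minimal, self-contained call of the helper above on synthetic data (shapes are purely illustrative; the module-level numpy/scipy/skmultilearn imports the function relies on are assumed):

import numpy as np

rng = np.random.RandomState(0)
x_tr = rng.rand(100, 20)              # 100 training samples, 20 features
y_tr = rng.randint(0, 2, (100, 5))    # 5 binary labels per sample
x_te = rng.rand(30, 20)

y_te_pred = mlknn(x_tr, y_tr, x_te)   # dense (30, 5) array of 0/1 predictions
print(y_te_pred.shape)
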
def mlknn(train_data_inx, y_train, test_data_inx):
	# corpus_tfidf and mlknn_k are module-level globals defined elsewhere in the original script
	classifier = MLkNN(k=mlknn_k)
	x_train = [corpus_tfidf[i] for i in train_data_inx]
	x_test = [corpus_tfidf[j] for j in test_data_inx]
	classifier.fit(csr_matrix(x_train), csr_matrix(y_train))
	mlknn_pre = classifier.predict(csr_matrix(x_test))
	return mlknn_pre.toarray()
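The function above depends on two module-level globals that are not shown: corpus_tfidf (indexable by document position, yielding one tf-idf feature row per document) and mlknn_k. One hypothetical way to define them with scikit-learn:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first example document", "second example document",
        "one more text", "and another short text"]
corpus_tfidf = TfidfVectorizer().fit_transform(docs).toarray()  # one dense row per document
mlknn_k = 3
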
Example #7
    def mlknn(self, number):
        classifier = MLkNN(k=number)

        classifier.fit(self.X_train, self.y_train)

        # predict
        predictions = classifier.predict(self.X_test)
        result = hamming_loss(self.y_test, predictions)
        print("hamming_loss: ", result)

        result = f1_score(self.y_test, predictions, average='micro')
        print("micro-F1: ", result)

        result = precision_score(self.y_test, predictions, average='micro')
        print("micro-precision: ", result)
Example #8
    def train(self):

        classifier_new = MLkNN(k=10)

        x_train = lil_matrix(self.x_data).toarray()
        y_train = lil_matrix(self.y_data).toarray()
        x_test = lil_matrix(self.x_test).toarray()

        classifier_new.fit(x_train, y_train)

        # predict
        predictions = classifier_new.predict(x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example #9
    def MLkNN(self):
        self.sub_parser.add_argument('--library',
                                     action='store_true',
                                     default=False)

        args = self.sub_parser.parse_args(sys.argv[2:])
        print('Running ML-kNN, arguments=%s' % args)
        print('Loading %s data...' % args.N)

        if args.f == 'My_dict':
            vectorizer = my_dict_vectorizer(stop=not args.nostop,
                                            bigram=args.bigram)
        elif args.f == 'LIB_count':
            vectorizer = lib_count_vectorizer(stop=not args.nostop,
                                              bigram=args.bigram)
        elif args.f == 'LIB_hash':
            vectorizer = lib_hash_vectorizer(stop=not args.nostop,
                                             bigram=args.bigram)
        elif args.f == 'LIB_tfidf':
            vectorizer = lib_tfidf_vectorizer(stop=not args.nostop,
                                              bigram=args.bigram)

        data = load_data(args.N, args.D, args.Nt, vectorizer)
        print('Done loading data, actual feature size:', data[1].shape)

        X, Y, Xt, Yt, cats = data
        if args.library:
            from skmultilearn.adapt import MLkNN
            model = MLkNN()
        else:
            from sklearn.neighbors import NearestNeighbors
            from multi import MLkNN
            model = MLkNN(NearestNeighbors)
        model.fit(X, Y)
        Yp = model.predict(Xt)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            hl = computeMetrics(Yp, Yt, cats)

        print('the hamming loss:')
        print('>>  ', hl)
        from sklearn.metrics import (hamming_loss, classification_report)
        print('hamming loss(library):', hamming_loss(Yt, Yp))
        print(classification_report(Yt, Yp, target_names=cats))
        print('DONE..')
Example #10
    def adapt(X_train, y_train, X_test, y_test):

        # pandas < 1.0 sparse API; on newer pandas, scipy.sparse.coo_matrix(df.values) is one alternative
        y_train = y_train.to_sparse().to_coo()
        y_test = y_test.to_sparse().to_coo()

        from skmultilearn.adapt import MLkNN
        classifier = MLkNN(k=4)

        print("Train Adapted algorithm")

        classifier.fit(X_train, y_train)

        print("Predict")
        predictions = classifier.predict(X_test)

        from sklearn.metrics import accuracy_score

        print("Accuracy")
        print(y_test.shape, predictions.shape)
        print(accuracy_score(y_test.toarray(), predictions))
    # Per-image features: mean and variance of each 32x32 block in a 7x7 grid over the LUV image
    for i in range(7):
        for j in range(7):
            block = x_luv[32 * i:32 * (i + 1), 32 * j:32 * (j + 1)]
            mean = np.mean(block, axis=tuple(range(block.ndim - 1)))
            var = np.var(block, axis=tuple(range(block.ndim - 1)))
            l = np.concatenate((l, mean))
            l = np.concatenate((l, var))
    x_test.append(l)
x_train = np.asarray(x_train).astype(np.float32)
x_test = np.asarray(x_test).astype(np.float32)
y_test = np.asarray(y_test)
y_train = np.asarray(y_train)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

classifier = MLkNN(k=9)
classifier.fit(x_train, y_train)
with open('mlknn-k-9-luv.pkl', 'wb') as f:
    pickle.dump(classifier, f)
'''
with open('mlknn-k-9-luv.pkl', 'rb') as f:
    classifier = pickle.load(f)
'''
predictions = classifier.predict(x_test).todense()
print('all match:',
      np.sum(np.all(predictions == y_test, axis=1)) / len(y_test))
print('at least one match:',
      (np.sum(np.all(predictions - y_test <= 0, axis=1)) -
       np.sum(np.all(predictions == 0, axis=1))) / len(y_test))
print('binary :', np.sum(predictions == y_test) / (5 * len(y_test)))
Example #12
def run():
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------

    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    logging.info("Going to create vocabulary and fit a preprocessing pipeline"
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------

    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------

    # Train word2vec embeddings if train_word2vec option is selected
    if cmd_args.train_word2vec: utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk: train_attention_walk(args)

    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------

    device = 'cuda:' + str(os.getenv("CUDA_VISIBLE_DEVICES")) \
        if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,

        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,

        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------

    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------

    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation \
        .all_metrics(preds_w_labels.toarray(),
                     dataset.y_test,
                     yhat_raw=preds_w_labels_raw.toarray())

    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
Example #13
class Model(object):
    """Fully connected neural network with no hidden layer."""
    def __init__(self, metadata):
        """
    Args:
      metadata: an AutoDLMetadata object. Its definition can be found in
          AutoDL_ingestion_program/dataset.py
    """
        self.done_training = False
        self.metadata = metadata
        self.output_dim = self.metadata.get_output_size()
        # Imputer is the pre-0.22 scikit-learn API; newer versions provide sklearn.impute.SimpleImputer
        self.imputer = Imputer(missing_values='NaN',
                               strategy='mean',
                               axis=0,
                               verbose=0,
                               copy=True)
        self.model = MLkNN(k=20)
        self.step = 0
        self.lgb_round = 80

    def train(self, dataset, remaining_time_budget=None):
        """Train this algorithm on the tensorflow |dataset|.
    This method will be called REPEATEDLY during the whole training/predicting
    process. So your `train` method should be able to handle repeated calls and
    hopefully improve your model performance after each call.

    ****************************************************************************
    ****************************************************************************
    IMPORTANT: the loop of calling `train` and `test` will only run if
        self.done_training = False
      (the corresponding code can be found in ingestion.py, search
      'M.done_training')
      Otherwise, the loop will go on until the time budget is used up. Please
      pay attention to set self.done_training = True when you think the model is
      converged or when there is not enough time for next round of training.
    ****************************************************************************
    ****************************************************************************

    Args:
      dataset: a `tf.data.Dataset` object. Each of its examples is of the form
            (example, labels)
          where `example` is a dense 4-D Tensor of shape
            (sequence_size, row_count, col_count, num_channels)
          and `labels` is a 1-D Tensor of shape
            (output_dim,).
          Here `output_dim` represents number of classes of this
          multilabel classification task.

          IMPORTANT: some of the dimensions of `example` might be `None`,
          which means the shape on this dimension might be variable. In this
          case, some preprocessing technique should be applied in order to
          feed the training of a neural network. For example, if an image
          dataset has `example` of shape
            (1, None, None, 3)
          then the images in this dataset may have different sizes. One could
          apply resizing, cropping or padding in order to have a fixed-size
          input tensor.

      remaining_time_budget: time remaining to execute train(). The method
          should keep track of its execution time to avoid exceeding its time
          budget. If remaining_time_budget is None, no time budget is imposed.
    """
        if self.done_training:
            return
        self.step += 1
        # print(f'dataset: {dataset}')
        t1 = time.time()
        # Count examples on training set
        if not hasattr(self, 'num_examples_train'):
            logger.info("Counting number of examples on train set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            next_element = iterator.get_next()
            X = []
            Y = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        example, labels = sess.run(next_element)
                        example = np.squeeze(example)
                        X.extend(example)
                        Y.extend(labels)
                    except tf.errors.OutOfRangeError:
                        break
            self.X_train = np.array(X)
            self.y_train = np.array(Y)
            print('self.X_train.shape: {}'.format(self.X_train.shape))
            print('self.y_train.shape: {}.'.format(self.y_train.shape))
            self.num_examples_train = len(self.y_train)
            logger.info("Finished counting. There are {} examples for training set." \
                        .format(self.num_examples_train))
        print('elapsed time: {}'.format(time.time() - t1))
        if self.lgb_round >= 300 or self.step > 10:
            self.done_training = True
            return
        if hasattr(self, 'test_duration'):
            round = int(50 * self.test_duration + 5)
            self.lgb_round += round
        train_start = time.time()
        self.X_train = self.imputer.fit_transform(self.X_train)
        self.model.fit(self.X_train, self.y_train)
        train_end = time.time()

        # Update for time budget managing
        train_duration = train_end - train_start
        logger.info("{} step. {:.2f} sec used. ".format(
            self.step, train_duration))

        self.done_training = True

    def test(self, dataset, remaining_time_budget=None):
        """Test this algorithm on the tensorflow |dataset|.

    Args:
      Same as that of `train` method, except that the `labels` will be empty.
    Returns:
      predictions: A `numpy.ndarray` matrix of shape (sample_count, output_dim).
          here `sample_count` is the number of examples in this dataset as test
          set and `output_dim` is the number of labels to be predicted. The
          values should be binary or in the interval [0,1].
    """
        # Count examples on test set
        if not hasattr(self, 'num_examples_test'):
            logger.info("Counting number of examples on test set.")
            dataset = dataset.batch(128)
            iterator = dataset.make_one_shot_iterator()
            example, labels = iterator.get_next()
            X = []
            with tf.Session(config=tf.ConfigProto(
                    log_device_placement=False)) as sess:
                while True:
                    try:
                        ex = sess.run(example)
                        ex = np.squeeze(ex)
                        X.extend(ex)
                    except tf.errors.OutOfRangeError:
                        break
            self.X_test = np.array(X)
            self.num_examples_test = self.X_test.shape[0]
            logger.info("Finished counting. There are {} examples for test set." \
                        .format(self.num_examples_test))

        test_begin = time.time()
        logger.info("Begin testing...")
        self.X_test = self.imputer.fit_transform(self.X_test)
        predictions = self.model.predict(self.X_test).A
        # print(type(predictions))
        # print(predictions.A)
        # preds = self.model.predict_proba(self.X_test)
        # print(preds)
        # test_results = pd.Series(test_results).map(self.remps).values
        # predictions = self.bin2y(test_results)
        # print(predictions)
        test_end = time.time()
        # Update some variables for time management
        self.test_duration = test_end - test_begin
        logger.info("[+] Successfully made one prediction. {:.2f} sec used. " \
                    .format(self.test_duration) + \
                    "Duration used for test: {:2f}".format(self.test_duration))
        return predictions

    def y2bin(self, y):
        res = y[:, 0]
        for i in range(1, y.shape[1]):
            res *= 2
            res += y[:, i]
        return res

    def bin2y(self, bin):
        y = np.array([bin % 2]).T
        i = 1
        while i < self.output_dim:
            i += 1
            bin = bin // 2
            y = np.c_[np.array([bin % 2]).T, y]
            # y = np.insert(y, 0, values=bin%2, axis=1)
        return y
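The y2bin/bin2y helpers above pack a binary label matrix into integer codes and back; in this snippet they are only referenced from commented-out code. A standalone sketch of the same round trip (hypothetical values, output_dim fixed to 3):

import numpy as np

def y2bin(y):
    res = y[:, 0].copy()
    for i in range(1, y.shape[1]):
        res = res * 2 + y[:, i]
    return res

def bin2y(code, output_dim=3):
    y = np.array([code % 2]).T
    i = 1
    while i < output_dim:
        i += 1
        code = code // 2
        y = np.c_[np.array([code % 2]).T, y]
    return y

labels = np.array([[1, 0, 1], [0, 1, 1]])
codes = y2bin(labels)                  # array([5, 3])
assert (bin2y(codes) == labels).all()  # the round trip recovers the original matrix
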
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
classifier_new = MLkNN(k=10)
# Note that this classifier can throw up errors when handling sparse matrices.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
# train
classifier_new.fit(x_train, y_train)
# predict
predictions_new = classifier_new.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions_new))
print("\n")

# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())
# train
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)
# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")
Example #15
def train_cnn_rnn(input_file, training_config):
    # read data and params
    x_, y_, vocabulary, vocabulary_inv, df, label_dict = data_helper_multi.load_data(
        input_file)
    params = json.loads(open(training_config).read())

    # create a directory, everything related to the training will be saved in this directory
    timestamp = str(int(time.time()))
    output_dir = os.path.join('data_path_save', 'cnn_rnn_' + timestamp)
    trained_dir = os.path.join(output_dir, 'trained_results')
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    # assign a 300-dimensional vector to each word
    word_embeddings = data_helper_multi.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # split the original dataset into trainset and devset
    x_train, x_dev, y_train, y_dev = train_test_split(x_, y_, test_size=0.1)
    # log the sizes of the two splits
    logging.info('x_train: {}, x_dev: {}'.format(len(x_train), len(x_dev)))

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=list(map(
                                     int, params['filter_sizes'].split(","))),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])
            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)
            checkpoint_dir = os.path.join(output_dir, 'checkpoints')
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(
                        np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    params['dropout_keep_prob'],
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch)
                }
                _, step, loss, scores = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.scores],
                    feed_dict=feed_dict)
                return scores

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x:
                    x_batch,
                    cnn_rnn.input_y:
                    y_batch,
                    cnn_rnn.dropout_keep_prob:
                    1.0,
                    cnn_rnn.batch_size:
                    len(x_batch),
                    cnn_rnn.pad:
                    np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len:
                    real_len(x_batch)
                }
                step, loss, scores = sess.run(
                    [global_step, cnn_rnn.loss, cnn_rnn.scores],
                    feed_dict=feed_dict)
                return step, loss, scores

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # training starts here
            train_batches = data_helper_multi.batch_iter(
                list(zip(x_train, y_train)), params['batch_size'],
                params['num_epochs'])
            best_accuracy, best_at_step = 0, 0
            x_train_fit = np.zeros([
                params['batch_size'] * params['evaluate_every'],
                len(label_dict.items())
            ])
            y_train_fit = np.zeros([
                params['batch_size'] * params['evaluate_every'],
                len(label_dict.items())
            ])
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                scores = train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)
                x_train_fit[(current_step % params['evaluate_every']) *
                            params['batch_size']:
                            (current_step % params['evaluate_every']) *
                            params['batch_size'] +
                            params['batch_size']] = scores
                y_train_fit[(current_step % params['evaluate_every']) *
                            params['batch_size']:
                            (current_step % params['evaluate_every']) *
                            params['batch_size'] +
                            params['batch_size']] = y_train_batch

                if current_step % params['evaluate_every'] == 0:
                    clf = MLkNN(k=4)
                    clf.fit(x_train_fit, y_train_fit)
                    dev_batches = data_helper_multi.batch_iter(
                        list(zip(x_dev, y_dev)), params['batch_size'], 1)
                    total_batches_dev = len(x_dev) // params['batch_size']
                    x_dev_fit = np.zeros([
                        params['batch_size'] * total_batches_dev,
                        len(label_dict.items())
                    ])
                    y_dev_fit = np.zeros([
                        params['batch_size'] * total_batches_dev,
                        len(label_dict.items())
                    ])
                    for step_dev, dev_batch in enumerate(dev_batches):
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        step, loss, scores = dev_step(x_dev_batch, y_dev_batch)
                        x_dev_fit[step_dev * params['batch_size']:step_dev *
                                  params['batch_size'] +
                                  params['batch_size']] = scores
                        y_dev_fit[step_dev * params['batch_size']:step_dev *
                                  params['batch_size'] +
                                  params['batch_size']] = y_dev_batch
                    y_dev_preds = clf.predict(x_dev_fit)
                    y_dev_preds = y_dev_preds.toarray()
                    y_union = y_dev_preds + y_dev_fit
                    accuracy = float(np.sum(y_union == 2)) / float(
                        np.sum(y_union == 1) + np.sum(y_union == 2))
                    precision = float(np.sum(y_union == 2)) / float(
                        np.sum(y_dev_preds == 1))
                    recall = float(np.sum(y_union == 2)) / float(
                        np.sum(y_dev_fit == 1))
                    f1 = 2 * precision * recall / (precision + recall)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))
                    logging.info('Precision on dev set: {}'.format(precision))
                    logging.info('Recall on dev set: {}'.format(recall))
                    logging.info('F1-measure on dev set: {}'.format(f1))

                    if accuracy >= best_accuracy:
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess,
                                          checkpoint_prefix,
                                          global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(
                            path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))
            logging.critical(
                'Training is complete, testing the best model on x_test and y_test'
            )

    # save trained params and files
    with open(trained_dir + '/words_index.json', 'w') as outfile:
        json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
    with open(trained_dir + '/embeddings.pickle', 'wb') as outfile:
        pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
    with open(trained_dir + '/labels.json', 'w') as outfile:
        json.dump(label_dict, outfile, indent=4, ensure_ascii=False)
    params['sequence_length'] = x_train.shape[1]
    with open(trained_dir + '/trained_parameters.json', 'w') as outfile:
        json.dump(params,
                  outfile,
                  indent=4,
                  sort_keys=True,
                  ensure_ascii=False)
    with open(trained_dir + '/classifier.pickle', 'wb') as outfile:
        pickle.dump(clf, outfile, pickle.HIGHEST_PROTOCOL)
Example #16
###################################################################################      Multilabel Classifier     ######################################################################################

from skmultilearn.problem_transform import ClassifierChain
classifier = ClassifierChain(svm.SVC(decision_function_shape='ovo'))
classifier.fit(train_features,tmp)

p=classifier.predict(test_features)
print(p)



from skmultilearn.adapt import MLkNN
clsfr= MLkNN(k=1)
clsfr.fit(train_features,tmp)

p=clsfr.predict(test_features)
print(p)


###########################################################################      Search for videos with similar tags   ##################################################################################

import urllib
from bs4 import BeautifulSoup
d={}
d[0]="cheering"
d[1]="music"
d[2]="speech"
p=p.todense()
print(p)
for tup in p:
    tupp = np.matrix(tup).tolist()[0]
Example #17
from sklearn.neighbors import KNeighborsClassifier
classifier_knn = KNeighborsClassifier(n_neighbors=5,p=2)
classifier_knn.fit(features_train, labels_train)

pred_knn = classifier_knn.predict(features_test)
Score_knn = classifier_knn.score(features_test, labels_test)

#------------------------------------------------------------------------------

#using Multilabel KNN Algorithm
from skmultilearn.adapt import MLkNN
mlknn_model = MLkNN(k=20)
# train
mlknn_model.fit(features_train, labels_train)
# predict
predictions = mlknn_model.predict(features_test)

score_mlknn= accuracy_score(labels_test,predictions)

'''
PERFORMING FEATURE SELECTION
'''


from sklearn.decomposition import PCA
pca = PCA(n_components=388)
# Fit PCA on the training features only, then project both splits
pca.fit(features_train)
features_train_new = pca.transform(features_train)
features_test_new = pca.transform(features_test)
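A hypothetical continuation (not in the original): refit the ML-kNN model on the PCA-reduced features, reusing the names defined above.

mlknn_model_pca = MLkNN(k=20)
mlknn_model_pca.fit(features_train_new, labels_train)
predictions_pca = mlknn_model_pca.predict(features_test_new)
score_mlknn_pca = accuracy_score(labels_test, predictions_pca)
print(score_mlknn_pca)
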
Example #18
    end_time = time.time()
    print('Classifier trained and saved in: ', end_time-start_time, 's')

if cf is None:
    cf = joblib.load(cf_name)
if vec is None:
    vec = joblib.load(vec_name)
if genres is None:
    genres = joblib.load(genres_name)

yes_no = 'yes'

while(yes_no != 'no'):

    film = input('Enter a movie description: ')
    film_rep = vec.transform([film])
    predicted = cf.predict(film_rep)

    res = ''

    print(predicted[0, :].toarray()[0])

    for genre, prediction in zip(genres, predicted[0, :].toarray()[0]):
        if prediction == 1:
            res += genre + ', '

    print(res[:-2])

    yes_no = input('Would you like more movies [yes/no]: ')
Example #19
        for j in range(y_num):
            temp = 0
            for t in range(neigs.shape[0]):
                temp = temp + neigs[t][j + 1]
            if ph[j] * peh1[j, temp] > ph_[j] * peh0[j, temp]:
                predict.append(1)
            else:
                predict.append(0)
        predicts.append(predict)
    predicts = np.array(predicts)
    return predicts


data = pickle.load(open('datasets.pickle', 'rb'))
# get the training data X and the label matrix Y
X = data[0]
Y = data[1]

predict = mlknn(X, X, 8, 5, Y)
print(predict)
print(accuary(predict, Y))

ml = MLkNN(k=8)
ml.fit(X, Y)
p = ml.predict(X)
print(accuary(p, Y))

kn = KNeighborsClassifier(n_neighbors=8)
kn.fit(X, Y)
pp = kn.predict(X)
print(accuary(pp, Y))
Example #20
            batch_pred_y=session.run(y_last, feed_dict={x:batch_x_emb, sequence_lengths:[sequence_length]*batch_size})
            train_x_fit[step*batch_size : step*batch_size+batch_size]=batch_pred_y
            train_y_fit[step*batch_size : step*batch_size+batch_size]=batch_y
        clf=MLkNN(k=4)
        clf.fit(X=train_x_fit, y=train_y_fit)
        # dev stage
        batches_dev=batch_yield(dev_x, dev_y, batch_size, word2id, label_dict, sequence_length, shuffle=False)
        total_batches_dev=len(dev_x)//batch_size
        dev_x_fit=np.zeros([batch_size*total_batches_dev, n_classes])
        dev_y_fit=np.zeros([batch_size*total_batches_dev, n_classes])
        for step, (batch_dev_x, batch_dev_y) in enumerate(batches_dev):
            batch_dev_x_emb=session.run(word_embeddings, feed_dict={input_ids:batch_dev_x})
            batch_dev_pred_y=session.run(y_last, feed_dict={x:batch_dev_x_emb, sequence_lengths:[sequence_length]*batch_size})
            dev_x_fit[step*batch_size : step*batch_size+batch_size]=batch_dev_pred_y
            dev_y_fit[step*batch_size : step*batch_size+batch_size]=batch_dev_y
        dev_preds=clf.predict(dev_x_fit)
        dev_preds=dev_preds.toarray()
        base_y=dev_preds+dev_y_fit
        acc=float(np.sum(base_y==2))/float(np.sum(base_y==1)+np.sum(base_y==2))
        precision=float(np.sum(base_y==2))/float(np.sum(dev_preds==1))
        recall=float(np.sum(base_y==2))/float(np.sum(dev_y_fit==1))
        f1=2*precision*recall/(precision+recall)
        print('----------- Epoch {} -------------'.format(epoch+1))
        print('Accuracy\tPrecision\tRecall\tF1 measure')
        print(str(acc)+'\t'+str(precision)+'\t'+str(recall)+'\t'+str(f1))
        save_path=saver.save(session, model_path)
        

'''
## Make predictions
test_data=read_data('data_path/labeled_text_test2.csv')
Example #21
start = time.time()

from scipy.sparse import csr_matrix, lil_matrix
from skmultilearn.adapt import MLkNN
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()
x_test = lil_matrix(x_test).toarray()
classifier = MLkNN(k=4)

# train
from skmultilearn.adapt import BRkNNbClassifier

classifier = BRkNNbClassifier(k=6)
classifier.fit(x_train, y_train)
# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ", accuracy_score(y_test, predictions))
print("\n")
print("F1 = ", f1_score(y_test, predictions, average='micro'))
print("\n")

print("Jaccard = ", jaccard_similarity_score(y_test, predictions))
print("\n")

print("Precision = ", precision_score(y_test, predictions, average='micro'))
print("\n")

print("Recall = ", recall_score(y_test, predictions, average='micro'))
print("\n")
Example #22
#feature_x = pkl.load(open('features_for_classification.pkl'))
#classifier = BinaryRelevance(GaussianNB())


Keys_Train = random.sample(list(ent2type.keys()), 10000)
Keys_Test = list(ent2type.keys())
[Keys_Test.remove(val) for val in Keys_Train]
X_Train = [feature_x[key] for key in Keys_Train]
X_Test = [feature_x[key] for key in Keys_Test]
Y_Train = generate_labels(Keys_Train)
Y_Test = generate_labels(Keys_Test)
print('HEERE 1')
classifier.fit(np.array(X_Train), np.array(Y_Train))
print('HEERE 2')    
predictions = classifier.predict(np.array(X_Test))

print(accuracy_score(np.array(Y_Test),predictions))

preds = predictions.toarray()

def accuracy(input):
    data = input[0]
    true = input[1]
    size = len(data)
    FP = TP = FN = TN = 0
    for i in range(size):
        if true[i] == True:
            if data[i] == True:
                TP += 1
            else:
Example #23
    # print(images_resized)

    # convert images to numpy array
    x_multidim = np.array([np.array(image) for image in images_resized])
    #print(x_multidim.shape)

    # flatten the numpy array
    return x_multidim.reshape(n_samples, -1)
    # print(x.shape)
    # print(x)


Xtrain = imageprep(dir + 'tmp_images/*.jpg')

Xval = imageprep(dir + 'val_images/*.jpg')

i, ytrain = multi_label(dir + "train_subset.json")
i, yval = multi_label(dir + "validation.json")

ytrain, yval = ytrain[:1000], yval[:1000]

classifier = MLkNN(k=10)

classifier.fit(Xtrain, ytrain)

predictions = classifier.predict(Xval)
print(predictions)

# acc = accuracy_score(yval, predictions)
# print("Accuracy on test set: {}".format(acc))
Example #24
            accuracy_arr.append(sub_accuracy)
            recall_arr.append(recall)
            precision_arr.append(precision)
            f1_arr.append(f1)

        elif mlknn_with_grid != True and grid != True:

            X_train = lil_matrix(X_train).toarray()
            y_train = lil_matrix(y_train).toarray()
            X_test = lil_matrix(X_test).toarray()
            y_test = lil_matrix(y_test).toarray()

            k = [2, 3, 4, 5, 6, 7, 8, 9, 10]
            for i in k:
                model = MLkNN(k=i, s=0.2).fit(X_train, y_train)
                predictions = model.predict(X_test)
                hamming[i].append(hamming_loss(y_test, predictions))
                Subset_Accuracy[i].append(accuracy_score(y_test, predictions))
                Precision[i].append(
                    precision_score(y_test, predictions, average="micro"))
                Recall[i].append(
                    recall_score(y_test, predictions, average='micro'))
                f1[i].append(f1_score(y_test, predictions, average='micro'))

    if mlknn_with_grid != True and grid != True:
        all = [hamming, Subset_Accuracy, Recall, Precision, f1]
Example #25
def MultiLabel_class(temp_interval):
    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data, label, test_size=temp_interval, random_state=17)
    save_folder = open(save_path, "a+")
    save_folder.write("Test-set fraction: " + str(temp_interval) + "\n")
    # # Method 1: BinaryRelevance -- treat each label as its own binary problem,
    # # with a gaussian naive bayes base classifier
    # classifier = BinaryRelevance(GaussianNB())
    # # train
    # classifier.fit(X_train, y_train)
    # # predict
    # predictions = classifier.predict(X_test)
    # print("方法一:", accuracy_score(y_test, predictions))
    # print("方法一:", np.mean(predictions == y_test))

    # 方法2:OneVsRest 想要分类的作为正类,其他的类作为反类。
    # 分类器使用1对多,SVM用linear kernel
    clf1 = OneVsRestClassifier(SVC(kernel='linear', gamma='auto'), n_jobs=-1)
    # clf1 = OneVsRestClassifier(SVC(kernel='poly', gamma='auto'), n_jobs=-1)
    # train
    clf1.fit(X_train, y_train)
    # predicted labels
    predict_class = clf1.predict(X_test)
    # accuracy: predictions vs. ground truth
    save_folder.write("OneVsRest(accuracy_score):" +
                      str(clf1.score(X_test, y_test)) + "\n")
    save_folder.write("OneVsRest(mean):" +
                      str(np.mean(predict_class == y_test)) + "\n")

    # # Method 3: LabelPowerset -- pick k labels and treat each of the 2^k combinations as a single class.
    # classifier = LabelPowerset(GaussianNB())
    # # train
    # classifier.fit(X_train, y_train)
    # # predict
    # predictions = classifier.predict(X_test)
    # print("方法三(accuracy_score):", accuracy_score(y_test, predictions))
    # print("方法三(mean):", np.mean(predictions == y_test))

    # 方法4:Adapted Algorithm:多标签KNN算法MLKNN
    classifier = MLkNN(k=20)
    # train
    classifier.fit(X_train, y_train)
    # predict
    predictions = classifier.predict(X_test)
    save_folder.write("MLKNN(accuracy_score):" +
                      str(accuracy_score(y_test, predictions)) + "\n")
    save_folder.write("MLKNN(mean):" + str(np.mean(predictions == y_test)) +
                      "\n")

    # # Method 5: classifier chains
    # classifier = ClassifierChain(GaussianNB())
    # # train
    # classifier.fit(X_train, y_train)
    # # predict
    # predictions = classifier.predict(X_test)
    # print("方法五:", accuracy_score(y_test, predictions))
    # print("方法五:", np.mean(predictions == y_test))

    # np.save(save_path, predict_class)
    # accuracy: predictions vs. ground truth
    # print(np.mean(predict_class == y_test))
    save_folder.close()
Example #26
# Candidate k values to sweep; one of these lists must be uncommented so that `l` is defined
# l = [200]
# l = [likely_k]
# l = [70, 80, 90, 100, 500, 1000, 2000, 3000, 4000, 5600]
best_clf = None
lowest_hl = float('inf')
best_k = float('inf')
for k in l:
    print(25*'=')
    print('k = ' + str(k))
    clf = MLkNN(k)

    # train
    clf.fit(x_train, y_train)

    # predict
    predictions = clf.predict(x_dev)

    predictions = predictions.todense()
    print('all match:', np.sum(np.all(predictions == y_dev, axis=1)) / len(y_dev))
    print('at least one match:', (np.sum(np.all(predictions - y_dev <= 0, axis=1))-np.sum(np.all(predictions== 0, axis=1))) / len(y_dev))
    print('binary :', np.mean(predictions == y_dev))
    hl = hamming_loss(y_dev, predictions)
    print('Hamming Loss:', hamming_loss(y_dev, predictions))
    if hl < lowest_hl:
        lowest_hl = hl
        best_clf = clf
        best_k = k
    

# import sys
# np.set_printoptions(threshold=sys.maxsize)
# splitting the data to training and testing data set 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30,random_state=2) 


# transforming the data 
X_train_tfidf = vetorizar.transform(X_train) 
X_val_tfidf = vetorizar.transform(X_val) 
X_test1_tfidf = vetorizar.transform(X_test1) 


# using Multi-label kNN classifier 
mlknn_classifier = MLkNN() 
mlknn_classifier.fit(X_train_tfidf, y_train) 

#prediction
predicted = mlknn_classifier.predict(X_val_tfidf)


print(f1_score(y_val, predicted,average='micro'))

--------test------------------------------------------------------------------------------




predicts = mlknn_classifier.predict(X_test1_tfidf)

k=pd.DataFrame(predicts.todense())
ss[TARGET_COLS] = k
ss.to_csv(r"C:\Users\Sheeja Ayoob\Desktop\hacklive_NLP_sub7.csv", index = False)
--------------------------------------------------------------------------------------------