Example #1
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# read_in, clean_text and remove_stopwords are project helpers defined elsewhere.
def main():
    a = read_in()
    m = joblib.load('./ML/model-data/model1.pkl')

    xtrain = np.load('./ML/model-data/xtrain.npy', allow_pickle=True)
    xtrain = pd.Series(xtrain)

    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
    xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)

    genre_data = np.load('./ML/model-data/genre_data.npy', allow_pickle=True)
    genre_data = pd.Series(genre_data)
    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(genre_data)

    a = clean_text(a)
    a = remove_stopwords(a)
    a = pd.Series(np.array([a]))

    xval_tfidf = tfidf_vectorizer.transform(a)
    y_pred = m.predict(xval_tfidf)
    ans = multilabel_binarizer.inverse_transform(y_pred)
    y_val_predicted_probabilites_tfidf = m.predict_proba(xval_tfidf)
    if len(ans[0]) == 0:
        max_idx = np.argmax(y_val_predicted_probabilites_tfidf)
        y_pred[0][max_idx] = 1
        ans = multilabel_binarizer.inverse_transform(y_pred)

    final_ans = []
    for x in ans[0]:
        final_ans.append(x)
    print(final_ans)
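
For reference, a minimal self-contained sketch (toy genre labels, not the original model data) of the binarize/decode round trip this example relies on:

# Hedged sketch: MultiLabelBinarizer fit + inverse_transform round trip.
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

mlb = MultiLabelBinarizer()
mlb.fit([["drama", "comedy"], ["thriller"]])   # classes_ becomes ['comedy', 'drama', 'thriller']
row = np.array([[0, 1, 1]])                    # a fake 0/1 prediction row
print(mlb.inverse_transform(row))              # [('drama', 'thriller')]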
Example #2
def run():
    ##############################################################
    ## local variable setup
    mlb = MultiLabelBinarizer(classes=mf_labels)
    mlb.fit(mf_labels)
    is_training = True
    is_validating = True
    is_testing = True

    # lm = LinearSVC(dual=True, class_weight='balanced', random_state=0, verbose=2, max_iter=500, tol=1e-4)
    lm = LinearSVC(dual=False, class_weight='balanced', verbose=2)
    classifier = OneVsRestClassifier(lm, n_jobs=-1)

    ###########################################################
    ## Start training
    if is_training:
        X = np.load('X_train.npy', mmap_mode='r')
        Y = np.load('y_train.npy', mmap_mode='r')
        print("### Classifier fitting X size: {0} y size: {1}".format(
            len(X), len(Y)))
        classifier.fit(X, Y)
        # joblib.dump(classifier, current_classifier_name)
    ###########################################################
    ## Prediction
    if is_validating:
        print("## Prediction")
        # classifier = joblib.load(current_classifier_name)
        X_val = np.load('X_val.npy', mmap_mode='r')
        y_val = np.load('y_val.npy', mmap_mode='r')
        X_id = np.arange(1, 9897)
        predicted = classifier.predict(X_val)
        pred_labels = mlb.inverse_transform(predicted)
        ground_truth = mlb.inverse_transform(y_val)
        for id, predicted, gt in zip(X_id, pred_labels, ground_truth):
            print('ID: {0} =>\r\nPredicted: {1} \r\nGround truth: {2}'.format(
                str(id), ', '.join(str(k) for k in predicted),
                ', '.join(str(k) for k in gt)))
    ###########################################################
    ## Test
    if is_testing:
        print('## Test')
        # classifier = joblib.load(current_classifier_name)
        X_test = np.load('X_test.npy', mmap_mode='r')
        predicted = classifier.predict(X_test)
        pred_labels = mlb.inverse_transform(predicted)
        to_write = 'image_id,label_id\r\n'
        for idx, predicted in enumerate(pred_labels):
            i = idx + 1
            to_write += '{0},{1}\r\n'.format(
                str(i), ' '.join(str(k) for k in predicted))
        f = open(test_file_name, 'w')
        f.write(to_write)
        f.close()
    return classifier
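
The commented-out joblib.dump/joblib.load lines above are the persistence step; a hedged sketch of that round trip (the file name is illustrative, not from the original):

# Illustrative persistence round trip for the classifier above.
# 'ovr_linearsvc.pkl' is a made-up file name.
import joblib
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

clf = OneVsRestClassifier(LinearSVC(dual=False, class_weight='balanced'))
# ... clf.fit(X, Y) would go here ...
joblib.dump(clf, 'ovr_linearsvc.pkl')
clf = joblib.load('ovr_linearsvc.pkl')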
Example #3
def main():

    image_data_fourier, images = load_image_data()
    weather_data = load_weather_data()
    weather_data = transform_weather(weather_data)

    model = make_pipeline(
        StandardScaler(),     # Around 64%
        #MinMaxScaler(),        # Around 63%
        #Normalizer(),           # Around 65%
        PCA(120),
        OneVsRestClassifier(KNeighborsClassifier(n_neighbors=7))
        #MLPClassifier(50)
        #KNeighborsClassifier(n_neighbors=5)  #7  100 == 68    5 120 == 68.5    9 340 == 69  13 300 == 68.8
        #SVC(kernel='linear', C=1)  # Does not support multilabel, also runs endlessly
    )

    join = image_data_fourier[['datetime', 'filename']]
    weather_data = pd.merge(weather_data, join, on='datetime')

    train_weather = weather_data[~weather_data['weather'].isnull()]

    files = train_weather['filename']

    train_image = [ndimage.imread(directory_i + file, mode='L') for file in files]
    train_image = np.array(train_image)
    train_image = np.reshape(train_image, [train_image.shape[0], train_image.shape[1] * train_image.shape[2]])



    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(train_weather['weather'])

    res = mlb.classes_

    X_train, X_test, y_train, y_test = train_test_split(train_image, y)
    model.fit(X_train, y_train)



    print(classification_report(y_test, model.predict(X_test), target_names=res))

    tmp = pd.DataFrame()

    tmp['real'] = mlb.inverse_transform(y_test)
    tmp['fake'] = mlb.inverse_transform(model.predict(X_test))

    print(accuracy_score(y_test, model.predict(X_test)))
    tmp.to_csv('image_data.csv')
    #weather_data = weather_data.groupby('weather').mean()

    #images.to_csv('weather_data.csv')
    print("--- %s seconds ---" % (time.time() - start_time))
Example #4
def supervised(train,
               test,
               train_tfidf_matrix,
               test_tfidf_matrix,
               n_classes,
               init_C=10,
               metric=False,
               grid=True):
    print("Method: supervised(train,test,train_tfidf_matrix,"
          "test_tfidf_matrix,init_C=10,probability=True,"
          "metric=False,grid=True)")
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.svm import SVC
    from sklearn.multiclass import OneVsRestClassifier
    # from scipy.stats import randint as sp_randint

    mlb = MultiLabelBinarizer()
    train_labels = [vals["classes"] for id, vals in train.items()]
    train_labels_bin = mlb.fit_transform(train_labels)

    print("\nAlgorithm: \t \t \t SVM")
    SVM = OneVsRestClassifier(SVC(kernel='linear', C=init_C, probability=True))
    if grid:
        print("Performing grid search...")
        SVM_params = [
            {
                'estimator__C': [10000, 1000, 100, 10, 1]
            },
        ]
        # SVM_params = {'estimator__C': sp_randint(1, 10000)}
        SVM_grid = grid_search(SVM, SVM_params, train_tfidf_matrix,
                               train_labels_bin)
        SVM = OneVsRestClassifier(
            SVC(kernel='linear',
                C=SVM_grid['params']['estimator__C'],
                probability=True))

    SVM_fit = SVM.fit(train_tfidf_matrix, train_labels_bin)
    SVM_pred = SVM_fit.predict(test_tfidf_matrix)
    SVM_proba = SVM_fit.predict_proba(test_tfidf_matrix)

    if metric:
        result = OrderedDict()
        test_labels = [vals["classes"] for id, vals in test.items()]
        mm.accuracy_multi(test_labels, mlb.inverse_transform(SVM_pred),
                          n_classes)
        result["SVM_metric"] = mm.sklearn_metrics(
            mlb.transform(test_labels), SVM_pred)  # transform, not fit_transform: keep the training-time class order
        return result, mlb.inverse_transform(SVM_pred), SVM_proba
    return None, mlb.inverse_transform(SVM_pred), SVM_proba
Example #5
def classify(x_train,y_train,x_test,y_test,test_size,max_labels,threshold):
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import train_test_split
    from sklearn import metrics
    import numpy
        
    # binarize the labels
    mlb = MultiLabelBinarizer()
    y_train_binarized = mlb.fit_transform(y_train)
    
    # train/test split
    #corpus_tfidf_vectors, labels_binarized = shuffle(corpus_tfidf_vectors, labels_binarized)
    #x_train, x_test, y_train, y_test = train_test_split(corpus_tfidf_vectors, labels_binarized, test_size=test_size, random_state=1)
    
    # classify
    #cls = OneVsRestClassifier(LogisticRegression(class_weight='auto'))
    #cls = OneVsRestClassifier(LogisticRegression())
    #cls = OneVsRestClassifier(MultinomialNB(alpha=0.01))
    #cls = OneVsRestClassifier(SVC(kernel='linear',probability=True,max_iter=1000))
    cls = OneVsRestClassifier(LinearSVC())
    cls.fit(x_train, y_train_binarized)
    pred_proba = 1/(1+numpy.exp(-1*cls.decision_function(x_train)))
    # evaluate
    y_pred = mlb.inverse_transform(get_max_n_pred(pred_proba, max_labels,threshold))
    result = 'threshold: {0}, precision: {1}, recall: {2}, f1: {3}'.format(
        threshold,
        metrics.precision_score(y_train, y_pred, average='micro'),
        metrics.recall_score(y_train, y_pred, average='micro'),
        metrics.f1_score(y_train, y_pred, average='micro'))
    print(result)
Example #6
class DFMultiLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        self.model          = MultiLabelBinarizer(**kwargs)
        self.transform_cols = None
        
    def fit(self, y):
        self.transform_cols = [x for x in y.columns]
        self.model.fit(y[self.transform_cols].values)

        return self
    
    def transform(self, y):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_y = pd.DataFrame(
            self.model.transform(y[self.transform_cols].values),
            columns=[f'MLB_{x}' for x in self.model.classes_]
        )

        return new_y
    
    def fit_transform(self, y):
        return self.fit(y).transform(y)
    
    def inverse_transform(self, y):
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X   = pd.DataFrame(self.model.inverse_transform(y.values), columns=self.transform_cols)

        return new_X
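
Illustrative use of the wrapper (a toy DataFrame with made-up column names; assumes the class above plus pandas and MultiLabelBinarizer are in scope):

df = pd.DataFrame({'genre1': ['action', 'drama'],
                   'genre2': ['comedy', 'action']})
enc = DFMultiLabelBinarizer()
print(enc.fit_transform(df))
#    MLB_action  MLB_comedy  MLB_drama
# 0           1           1          0
# 1           1           0          1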
Example #7
class MyMultiLabelBinarizer(TransformerMixin):
    """
    Wrap MultiLabelBinarizer so it can be used in pipeline.
    See https://stackoverflow.com/questions/46162855/fit-transform-takes-2-positional-arguments-but-3-were-given-with-labelbinarize
     for problem explanation.
    """
    def __init__(self, *args, **kwargs):

        self.classes = [
            'agreement/disagreement', 'certainty', 'contrariety',
            'hypotheticality', 'necessity', 'prediction',
            'source of knowledge', 'tact/rudeness', 'uncertainty', 'volition'
        ]

        self.encoder = MultiLabelBinarizer(classes=self.classes,
                                           *args,
                                           **kwargs)

    def fit(self, y, *_):
        self.encoder.fit(y)
        return self

    def transform(self, y, *_):
        yt = self.encoder.transform(y)
        return yt

    def inverse_transform(self, yt):
        y = self.encoder.inverse_transform(yt)
        return y
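
Toy usage of the wrapper (labels drawn from its fixed class list; assumes the class above is in scope):

enc = MyMultiLabelBinarizer()
enc.fit([['certainty']])                        # classes are fixed in __init__, so y does not affect classes_
yt = enc.transform([['certainty', 'volition']])
print(yt.shape)                                 # (1, 10)
print(enc.inverse_transform(yt))                # [('certainty', 'volition')]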
Example #8
def main():
    #sets = select_by_trait(10,2,tags=["Comedy","Human","Sad","Dark"])
    sets = select_sets_by_tag(20, 4, tag_names)
    #sets = random_select_sets(30,6)
    train_tags = fetch_tags(sets["train"])
    train_texts = id_to_filename(sets["train"])  #txt_to_list(sets["train"])
    #vectorize
    count_vect = CountVectorizer(stop_words='english',
                                 encoding="utf-16",
                                 input="filename")
    X_train_counts = count_vect.fit_transform(train_texts)

    #tf-idf transformation
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    #process tags
    mlb = MultiLabelBinarizer()
    processed_train_tags = mlb.fit_transform(train_tags)
    #print(processed_train_tags)
    #classifier
    #clf = OneVsRestClassifier(MultinomialNB())
    clf = OneVsRestClassifier(LinearSVC())
    clf.fit(X_train_tfidf, processed_train_tags)
    print("classes:{}".format(clf.classes_))
    #process test set

    test_texts = id_to_filename(sets["test"])  #txt_to_list(sets["test"])
    X_test_counts = count_vect.transform(test_texts)
    #print("X_test_counts inverse transformed: {}".format(count_vect.inverse_transform(X_test_counts)))
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    predicted_tags = clf.predict(X_test_tfidf)
    predicted_tags_readable = mlb.inverse_transform(predicted_tags)
    test_tags_actual = fetch_tags(sets["test"])
    predicted_probs = clf.decision_function(X_test_tfidf)
    #predicted_probs = clf.get_params(X_test_tfidf)
    class_list = mlb.classes_
    report = metrics.classification_report(mlb.transform(test_tags_actual),
                                           predicted_tags,
                                           target_names=class_list)
    print(report)
    #retrieve top 30% for each class
    top_percentage = 30
    threshold_index = int(len(sets["test"]) * (top_percentage / 100.0))
    threshold_vals_dic = {}
    threshold_vals = []
    num_classes = len(class_list)
    for i in range(num_classes):
        z = [predicted_probs[j, i] for j in range(len(sets["test"]))]
        z.sort(reverse=True)
        threshold_vals_dic[class_list[i]] = z[threshold_index]
        threshold_vals.append(z[threshold_index])
    print(threshold_vals_dic)

    print_predictions(sets["test"],
                      predicted_tags_readable,
                      class_list,
                      class_probablities=predicted_probs,
                      threshold_vals=threshold_vals)
Example #9
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(ExtraTreeClassifier(criterion="gini",
                                                                     max_depth=None,
                                                                     min_samples_split=2,
                                                                     min_samples_leaf=1,
                                                                     min_weight_fraction_leaf=0.,
                                                                     max_features="auto",
                                                                     max_leaf_nodes=None,
                                                                     class_weight=None),
                                                 n_jobs=-1
                                                 )

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
Example #10
def distribution_to_tuples(predictions, threshold, at_least_one_hot=True):
    """
    Convert a distribution to k-hot tuples using the given threshold.

    :param predictions: predictions, 2-dim numpy matrix (samples, distribution)
    :param threshold: per-class thresholds for k-hot mapping (binarization)
    :param at_least_one_hot: ensure that the relatively highest prediction is chosen, if no prediction is greater
    than its threshold
    :return: list of tuples, each tuple contains class indices
    """
    threshold_normalized_predictions = predictions / threshold
    k_hot_predictions = np.where(threshold_normalized_predictions >= 1., 1.,
                                 0.)

    classes = predictions.shape[1]
    binarizer = MultiLabelBinarizer(classes=[c for c in range(classes)])
    binarizer.fit([[c] for c in range(classes)])
    binarized_prediction_tuples = binarizer.inverse_transform(
        k_hot_predictions)

    # make sure we have at least one prediction
    if at_least_one_hot:
        ensure_at_least_one_prediction(binarized_prediction_tuples,
                                       threshold_normalized_predictions)

    return binarized_prediction_tuples
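
A self-contained toy run of the thresholding idea above (three classes, per-class thresholds; ensure_at_least_one_prediction is omitted here):

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

preds = np.array([[0.2, 0.7, 0.4]])
thresholds = np.array([0.5, 0.5, 0.3])
k_hot = np.where(preds / thresholds >= 1.0, 1.0, 0.0)   # [[0., 1., 1.]]

binarizer = MultiLabelBinarizer(classes=[0, 1, 2])
binarizer.fit([[c] for c in range(3)])
print(binarizer.inverse_transform(k_hot))               # [(1, 2)]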
Example #11
def multilabel(weather,weatherTest):
    
    categorias =["clase","humedad","nieve","nubes","nubes-precipitacion","otros","precipitacion","temperatura","viento"]
    pipeline = Pipeline([
        ('vectorize', CountVectorizer()),
        ('tf_idf', TfidfTransformer(norm='l2')),
        # play with the parameters and check the model size
        ('select', SelectPercentile(chi2, percentile=50)),
        ('clf', OneVsRestClassifier(SGDClassifier(loss='modified_huber')))
    ])
    multi_labels = [
        ["humedad"], ["nieve"], ["nubes"], ["nubes","precipitación"], ["otros"],
        ["precipitacion"], ["temperatura"], ["viento"],
       
    ]

    mlb = MultiLabelBinarizer().fit(weather.clase)
    mlb_labels = mlb.transform(weather.clase)
   
    print(mlb_labels)
    clf = pipeline.fit(weather.frase, mlb_labels)
    print("classifier has %s bytes" % len(pickle.dumps(pipeline.named_steps['clf'])))
    
  
    predicted = pipeline.predict(weatherTest.frase)
    print(predicted)
    #print(np.mean(predicted == weatherTest.clase))
    #print(metrics.classification_report(weatherTest.clase, predicted))
    all_labels = mlb.inverse_transform(predicted)
    print(all_labels)
    for item, labels in zip(weatherTest.frase, all_labels):
        print('%s => %s' % (item, ', '.join(labels)))
Example #12
def main():
    """ Loads the model from the checkpoint dir
    as specified in the given config file.
    Calls the prediction function to save the
    prediction csv file to the checkpoint dir.
    """
    # capture the config path from the run arguments
    # then process the json configuration file
    try:
        args = get_args()
        config_array = [process_config(x) for x in args.config.split(" ")]
        check_array = args.checkpoint_nb.split(" ")
        cwd = os.getenv("EXP_PATH")
        if args.outfile_multiple:
            outfile = os.path.join(cwd, args.outfile_multiple + '.csv')
        else:
            outfile = os.path.join(cwd, 'prediction.csv')
    except Exception:
        print("missing or invalid arguments")
        raise

    # instantiated here only to get the number of test samples (testIterator.n)
    testIterator = DataTestLoader(config_array[0])
    probas = np.zeros((len(config_array), testIterator.n, 28))
    i = 0
    for config, check in zip(config_array, check_array):
        # create tensorflow session
        sess = tf.Session()
        # create your data generator
        # here config file used for init does not matter
        testIterator = DataTestLoader(config)
        # create an instance of the model you want
        try:
            ModelInit = all_models[config.model]
            model = ModelInit(config)
        except AttributeError:
            print("The model to use is not specified in the config file")
            exit(1)

        # load model if exists
        model.load(sess, check)
        # here you predict from your model
        predictor = Predictor(sess, model, config)
        probas[i, :, :] = predictor.predict_probas(testIterator)
        print('processed model {}'.format(model))
        i += 1
        tf.reset_default_graph()
    probas = np.mean(probas, axis=0)
    print(np.shape(probas))
    one_hot_pred = get_pred_from_probas_threshold(probas)
    binarizer = MultiLabelBinarizer(classes=np.arange(28))
    binarizer.fit([[1]])  # needed for instantiation; classes are given explicitly, so y is ignored
    pred = binarizer.inverse_transform(one_hot_pred)
    predicted_labels = [
        ' '.join([str(p) for p in sample_pred]) for sample_pred in pred
    ]
    print(np.shape(predicted_labels))
    testIterator.result['Predicted'] = predicted_labels
    testIterator.result = testIterator.result.sort_values(by='Id')
    testIterator.result.to_csv(outfile, index=False)
Example #13
    def on_train_end(self, logs={}):
        if self.test_generator:
            print('Training done. Running predictions...')
            best_model = load_model(self.save_path)
            classes = pd.read_csv(paths['dummy']['csv']).columns

            preds = best_model.predict_generator(self.test_generator,
                                                 use_multiprocessing=True,
                                                 workers=8,
                                                 verbose=1)
            preds = preds > .5

            print('Converting labels...')
            mlb = MultiLabelBinarizer(classes=classes)
            mlb.fit(None)  # necessary, won't actually do anything
            sparse_preds = mlb.inverse_transform(preds)

            submission_list = []
            for i, p in enumerate(sparse_preds, start=1):
                labels = ' '.join(p)
                submission_list.append([i, labels])

            submission_path = join(paths['results'],
                                   '{}-submission.csv'.format(self.save_fname))
            print('Saving predictions to {}'.format(submission_path))
            columns = ['image_id', 'label_id']
            pd.DataFrame(submission_list, columns=columns) \
                        .to_csv(submission_path, index=False)
Example #14
def tag_recommendation():

    # Appeler les Inputs de la page HTML dashboard
    question = request.form['question']  #request.args.get('question')
    tags_text = ''
    if question is not None:

        question = str(question)
        question_tag = preprocessing(question)
        question_tag_df = pd.DataFrame([question_tag], columns=['question'])
        test_input_df = pd.concat([question_tag_df, input_df_tags_500],
                                  ignore_index=True)
        question_input = test_input_df['question']
        vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                     stop_words=stop_words,
                                     max_features=355)
        X_tfidf = vectorizer.fit_transform(question_input).toarray()
        feature_names = vectorizer.get_feature_names()
        X_test_question = pd.DataFrame(X_tfidf)
        X_test_question = X_test_question.iloc[0:1, :]
        X_test_question.columns = feature_names

        tags_num = pipeline.predict(X_test_question)
        mlb = MultiLabelBinarizer(classes=sorted(input_tags_500))
        mlb.fit(input_tags_500)
        tags_text = pd.concat(
            [pd.Series(mlb.inverse_transform(tags_num), name='tags_num')],
            axis=1)
        tags_text = str(tags_text.values.tolist()).strip('[()]')

    return render_template('recommendation.html', tags=tags_text)
Example #15
class MultiHotEncoder(BaseEncoder):
    def __init__(self, is_target=False):
        super().__init__(is_target)
        self._binarizer = MultiLabelBinarizer()
        self._seen = set()

    @staticmethod
    def _clean_col_data(column_data):
        column_data = [(arr if arr is not None else []) for arr in column_data]
        column_data = [[str(x) for x in arr] for arr in column_data]
        return column_data

    def prepare(self, column_data, max_dimensions=100):
        column_data = self._clean_col_data(column_data)
        self._binarizer.fit(column_data + [('None',)])  # ('None',) is a 1-tuple; ('None') would be the bare string
        for arr in column_data:
            for x in arr:
                self._seen.add(x)
        self._prepared = True

    def encode(self, column_data):
        column_data = self._clean_col_data(column_data)
        data_array = self._binarizer.transform(column_data)
        return torch.Tensor(data_array)

    def decode(self, vectors):
        # If these are logits output by the neural network, we need to threshold them to binary vectors
        vectors = np.where(vectors > 0, 1, 0)
        words_tuples = self._binarizer.inverse_transform(vectors)
        return [list(w) for w in words_tuples]
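
The ('None') versus ('None',) pitfall fixed in prepare() above, shown in isolation (parentheses alone do not make a tuple):

print(list('None'))       # ['N', 'o', 'n', 'e']  (a bare string is iterated per character)
print(list(('None',)))    # ['None']              (the trailing comma makes a 1-tuple)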
Example #16
def supervised2(params, pkl_file=False):
    print("Method: supervised(train,test,train_tfidf_matrix,"
          "test_tfidf_matrix,init_C=10,probability=True,"
          "metric=False,grid=True)")
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.svm import SVC
    from sklearn.multiclass import OneVsRestClassifier
    # from scipy.stats import randint as sp_randint

    train = params["train"]
    test = params["test"]
    train_tfidf_matrix = params["train_tfidf_matrix"]
    test_tfidf_matrix = params["test_tfidf_matrix"]
    n_classes = params["n_classes"]
    init_C = params["init_C"]
    metric = params["metric"]

    mlb = MultiLabelBinarizer()
    train_labels = [vals["classes"] for id, vals in train.items()]
    train_labels_bin = mlb.fit_transform(train_labels)

    print("\nAlgorithm: \t \t \t SVM")
    SVM = None
    if pkl_file:
        if os.path.isfile(pkl_file):
            SVM = load_pickle(pkl_file)
    else:
        SVM = OneVsRestClassifier(
            SVC(kernel='linear', C=init_C, probability=True))
        pkl_file = "SVM"
        save_pickle(SVM, pkl_file, tag=False)

    SVM_fit = SVM.fit(train_tfidf_matrix, train_labels_bin)
    SVM_pred = SVM_fit.predict(test_tfidf_matrix)
    SVM_proba = SVM_fit.predict_proba(test_tfidf_matrix)

    if metric:
        result = OrderedDict()
        test_labels = [vals["classes"] for id, vals in test.items()]
        mm.accuracy_multi(test_labels, mlb.inverse_transform(SVM_pred),
                          n_classes)
        result["SVM_metric"] = mm.sklearn_metrics(
            mlb.transform(test_labels), SVM_pred)  # transform keeps the training-time class order
        return result, mlb.inverse_transform(
            SVM_pred), SVM_proba, SVM, pkl_file
    return None, mlb.inverse_transform(SVM_pred), SVM_proba, SVM, pkl_file
Example #17
def translate_to_labels(root, y_pred):
    path = os.path.join(root, 'data', 'Y.csv')
    df = pd.read_csv(path, header=None, names=['labels'])
    df['labels'] = df['labels'].apply(lambda x: x.split())
    mlb = MultiLabelBinarizer()
    mlb.fit(df['labels'])

    return mlb.inverse_transform(y_pred)
Example #18
class MultiLabelClassifier:
    """
    Helper class for training and evaluating multi-label classifiers on movie genres
    Classifier can predict multiple genres for a given movie
    """
    def __init__(self, vectorizer=None, classifier=None):
        self.vectorizer = vectorizer
        self.classifier = classifier
        self.encoder = MultiLabelBinarizer()
        self.trained = False

    def save_clf(self, file_name):
        if not self.trained:
            raise Exception("Classifier needs to be trained first")
        with open(file_name, 'wb') as output:
            pickle.dump(self.vectorizer, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.classifier, output, pickle.HIGHEST_PROTOCOL)
            pickle.dump(self.encoder, output, pickle.HIGHEST_PROTOCOL)

    def load_clf(self, file_name):
        with open(file_name, 'rb') as input:
            self.vectorizer = pickle.load(input)
            self.classifier = pickle.load(input)
            self.encoder = pickle.load(input)
            self.trained = True

    def prepare_data(self, label_col, df: pd.DataFrame):
        y = self.encoder.fit_transform(df[label_col])
        return train_test_split(df.drop(columns=[label_col]),
                                y,
                                test_size=0.2,
                                random_state=9)

    def train(self, X, Y):
        Xtrain = self.vectorizer.fit_transform(X)
        self.classifier.fit(Xtrain, Y)
        self.trained = True

    def predict_one(self, x) -> List[str]:
        """
        Given one movie as a single DataFrame, predict possible genres
        """
        if not self.trained:
            raise Exception("Classifier needs to be trained first")
        xtest = self.vectorizer.transform(x)
        ytest = self.classifier.predict(xtest)
        return list(self.encoder.inverse_transform(ytest)[0])

    def evaluate(self, X, Y):
        if not self.trained:
            raise Exception("Classifier needs to be trained first")
        Xval = self.vectorizer.transform(X)
        Ypredict = self.classifier.predict(Xval)
        overlap = np.count_nonzero(Ypredict + Y == 2, axis=1)
        total = overlap.shape[0]
        correct = np.count_nonzero(overlap > 0)
        incorrect = total - correct
        return correct, incorrect
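
A toy check of the overlap metric in evaluate(): a sample counts as correct if at least one predicted label matches a true one.

import numpy as np

Y_true = np.array([[1, 0, 1],
                   [0, 1, 0]])
Y_pred = np.array([[1, 1, 0],
                   [0, 0, 1]])
overlap = np.count_nonzero(Y_pred + Y_true == 2, axis=1)                  # [1, 0]
print(np.count_nonzero(overlap > 0), 'of', overlap.shape[0], 'correct')   # 1 of 2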
Example #19
def get_label_strings_from_tensor(pred_labels_tensor):
    mlb = MultiLabelBinarizer(classes=LABEL_LIST)
    mlb = mlb.fit(None)  # works because classes= was given, so fit() ignores y
    pred_labels_cpu = pred_labels_tensor.cpu().numpy()
    pred_labels_str = mlb.inverse_transform(pred_labels_cpu)
    pred_labels = [
        " ".join(pred_labels_str[i]) for i in range(pred_labels_cpu.shape[0])
    ]
    return pred_labels
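
Why fit(None) works here: when classes= is supplied, MultiLabelBinarizer.fit copies that list and never iterates y (true at least in the scikit-learn versions these examples target). A toy check:

from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

mlb = MultiLabelBinarizer(classes=['cat', 'dog', 'fish'])
mlb.fit(None)                                          # y is ignored when classes are given
print(mlb.inverse_transform(np.array([[1, 0, 1]])))    # [('cat', 'fish')]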
Example #20
class DataProcess(object):  # feature preprocessing helpers
    def __init__(self, process_type):
        self.process_type = process_type

        if self.process_type == "Binary":  # binarization
            self.processmodule = Binarizer(copy=True, threshold=0.0)
            # values above threshold map to 1, values at or below map to 0

        elif self.process_type == "MinMax":  # min-max scaling
            self.processmodule = MinMaxScaler(feature_range=(0, 1), copy=True)

        elif self.process_type == "Stand":  # standardization
            self.processmodule = StandardScaler(copy=True, with_mean=True, with_std=True)

        elif self.process_type == "Normal":  # normalization
            self.processmodule = Normalizer(copy=True, norm="l2")  # choose from "l1", "l2" or "max"

        elif self.process_type == "MultiLabelBinar":   # multi-label binarization
            self.processmodule = MultiLabelBinarizer(sparse_output=False)  # set True for sparse CSR output
        else:
            raise ValueError("please select a correct process_type")

    def fit_transform(self, data):
        return self.processmodule.fit_transform(data)

    def fit(self, data):
        self.processmodule.fit(data)

    def transform(self, data):
        return self.processmodule.transform(data)

    def set_params(self, params):
        self.processmodule.set_params(**params)

    def get_params(self):
        return self.processmodule.get_params(deep=True)

    def get_classes(self):
        assert self.process_type in {"MultiLabelBinar"}
        return self.processmodule.classes_  # the distinct classes seen during fit

    def inverse_transform(self, data):
        assert self.process_type in {"MultiLabelBinar", "MinMax", "Stand"}
        return self.processmodule.inverse_transform(data)

    def get_max(self):  # per-feature maxima (MinMaxScaler only)
        assert self.process_type in {"MinMax"}
        return self.processmodule.data_max_

    def get_min(self):  # per-feature minima (MinMaxScaler only)
        assert self.process_type in {"MinMax"}
        return self.processmodule.data_min_

    def partial_fit(self, data):
        # incrementally update the scaler's statistics online
        assert self.process_type in {"MinMax", "Stand"}
        return self.processmodule.partial_fit(data)
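
Toy usage of the wrapper above (assumes the class definition and the sklearn.preprocessing imports are in scope):

dp = DataProcess('MultiLabelBinar')
mat = dp.fit_transform([['a', 'b'], ['b']])
print(mat)                        # [[1 1]
                                  #  [0 1]]
print(dp.get_classes())           # ['a' 'b']
print(dp.inverse_transform(mat))  # [('a', 'b'), ('b',)]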
Example #22
class label_preprocess:
    def __init__(self, list_):
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(list_)

    def encode(self, list_):
        return (list(self.mlb.transform([list_])[0]))

    def decode(self, list_):
        buf = self.mlb.inverse_transform(
            np.array(list_).reshape(1, len(self.mlb.classes_)))[0]
        return (buf)
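
A round trip with the helper above (toy tag lists; assumes the class definition is in scope):

lp = label_preprocess([['rock', 'pop'], ['jazz']])
vec = lp.encode(['rock'])
print(vec)              # [0, 0, 1]  (classes_ is ['jazz', 'pop', 'rock'])
print(lp.decode(vec))   # ('rock',)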
Example #23
class KaggleAmazonDataset(Dataset):
    def __init__(self, csv_path, img_path, img_ext, transform=None):

        tmp_df = pd.read_csv(csv_path)

        self.mlb = MultiLabelBinarizer()
        self.img_path = img_path
        self.img_ext = img_ext
        self.transform = transform

        # Extracts the data and the images
        self.X_train = tmp_df['image_name']
        self.y_train = self.mlb.fit_transform(
            tmp_df['tags'].str.split()).astype(np.float32)

    def __getitem__(self, index):
        img = Image.open(self.img_path + self.X_train[index] + self.img_ext)
        img = img.convert('RGB')
        if self.transform is not None:
            img = self.transform(img)

        label = torch.from_numpy(self.y_train[index])
        return img, label

    def name(self):
        return self.X_train

    def __len__(self):
        return len(self.X_train.index)

    def splits(self, valx, valy):
        self.X_train = pd.Series(self.X_train.tolist()[valx:valy])
        self.y_train = self.y_train[valx:valy]

    def getLabelEncoder(self):
        return self.mlb

    def numClasses(self):
        return self.y_train.shape[1]

    def classesName(self):
        return self.mlb.inverse_transform(np.array([[1] * 17]))

    ## newly added
    def set_transformation(self):
        num_of_transf = randint(1, len(TRANSFORMATIONS))
        rand_transf = random.sample(TRANSFORMATIONS, k=num_of_transf)
        rand_transf.extend([
            transforms.ToTensor(),
            transforms.Normalize([0.311, 0.340, 0.299], [0.167, 0.144, 0.138])
        ])
        self.transform = transforms.Compose(rand_transf)
Example #24
def test_training():
    trip_data = pickle.load(open("save.p", "rb"))
    models = list()
    if (len(trip_data) > 5):
        mlb = MultiLabelBinarizer()
        y_raw = trip_data["tag_array"]
        mlb.fit(y_raw)
        y = mlb.transform(y_raw)
        X = trip_data[[
            'distance', 'start_long', 'start_lat', 'end_long', 'end_lat',
            'start_hour', 'end_hour', 'vehicleid', 'sample_weight',
            'vehicle_engine_capacity', 'vehicle_year'
        ]]

        num_tagged_trips = int(y.sum())  # total number of tag assignments
        # split data into train and test sets
        seed = 7
        test_size = 0.33

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed)

        print("Y_train")
        print(y_train)
        estimator = CatBoostClassifier(iterations=10,
                                       random_state=1,
                                       allow_const_label=True)
        model = OneVsRestClassifier(estimator=estimator)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_pred_transformed = mlb.inverse_transform(y_pred)
        y_test_transformed = mlb.inverse_transform(y_test)
        print("y_pred")
        print(y_pred_transformed)
        print("y_test")
        print(y_test_transformed)
        predictions = list(y_pred)
        print("predictions")
        print(predictions)
        accuracy = accuracy_score(y_test, predictions)
        print(f"accuracy {accuracy}")

        ACCURACY_THRESHOLD = 0.85
        if (accuracy > ACCURACY_THRESHOLD):
            models.append({
                "model": model,
                "tag_id": tag_id,
                "accuracy": accuracy
            })

    return models
Example #25
def get_scores(clf, train_tags, train_labels, test_tags, test_labels,
               binarize=False, store_true=False):
    """ Gets two lists of changeset ids, does training+testing """
    if binarize:
        binarizer = MultiLabelBinarizer()
        clf.fit(train_tags, binarizer.fit_transform(train_labels))
        preds = binarizer.inverse_transform(clf.predict(test_tags))
    else:
        logging.info("Fitting model:")
        clf.fit(train_tags, train_labels) # train model
        logging.info("Generating predictions:")
        preds = clf.predict(test_tags) # predict labels for test set
    return copy.deepcopy(test_labels), preds
Example #26
def get_scores(clf, X_train, y_train, X_test, y_test,
               binarize=False, human_check=False, store_true=False):
    """ Gets two lists of changeset ids, does training+testing
        returns true and predicted labels
    """
    if binarize:
        binarizer = MultiLabelBinarizer()
        clf.fit(X_train, binarizer.fit_transform(y_train))
        preds = binarizer.inverse_transform(clf.predict(X_test))
    else:
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        if store_true:
            labels = clf.transform_labels(y_test)
            with open('/home/centos/sets/true_labels.txt', 'w') as f:
                for label in labels:
                    f.write(str(label) + '\n')
            #logging.info("Wrote true labels to ~/sets/true_labels.txt")
    hits = misses = predictions = 0
    #if LABEL_DICT.exists():
    #    with LABEL_DICT.open('rb') as f:
    #        pred_label_dict = pickle.load(f)
    #else:
    pred_label_dict = {}
    for pred, label in zip(preds, y_test):
        if human_check:
            while (pred, label) not in pred_label_dict:
                print("Does '{}' match the label '{}'? [Y/n]".format(pred, label))
                answer = input().lower()
                if answer == 'y':
                    pred_label_dict[(pred, label)] = True
                elif answer == 'n':
                    pred_label_dict[(pred, label)] = False
                else:
                    print("Please try again")
            with LABEL_DICT.open('wb') as f:
                pickle.dump(pred_label_dict, f)
            if pred_label_dict[(pred, label)]:
                hits += 1
            else:
                misses += 1
        else:
            if pred == label:
                hits += 1
            else:
                misses += 1
        predictions += 1
    #logging.info("Preds:" + str(predictions))
    #logging.info("Hits:" + str(hits))
    #logging.info("Misses:" + str(misses))
    return copy.deepcopy(y_test), preds
Example #27
    def classify(self, features):
        model = pickle.load(open("classification\\numerical\\random_forest\\model\\model.pickle", 'rb'))
        result = model.predict(features)

        # todo put this functionality into the common classifier template
        mlb = MultiLabelBinarizer()
        mlb.fit([range(0, 16)])
        genre_predictions_categorized = mlb.inverse_transform(result)

        if len(genre_predictions_categorized) == 0 or not all(genre_predictions_categorized):
            return ["Unclassifiable"]

        genre_predictions_categorized = [x[0] for x in mlb.inverse_transform(result)]  # this needs checking: which value of the tuple is the actual value

        genre_predictions = []
        lm = LabelManipulator()
        for label in genre_predictions_categorized:
            genre_predictions.append(lm.uncategorise_genre(label))

        # convert the ids to names

        return genre_predictions
Example #28
def get_data(directory, metadata, index_name):
    with open(f"{directory_path}/config.json", "r") as f:
        config = json.load(f)
    with tf.Session() as sess:
        iterator = BigEarthNet(f"{directory}/record.tfrecord",
                               config["batch_size"], 1, 0,
                               config["label_type"]).batch_iterator
        iterator_ins = iterator.get_next()

        model = importlib.import_module("models." +
                                        config["model_name"]).DNN_model(
                                            config["label_type"])
        model.create_network()

        variables_to_restore = tf.global_variables()
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        model_saver = tf.train.Saver(max_to_keep=0,
                                     var_list=variables_to_restore)
        model_file = environment["model_weights"]
        model_saver.restore(sess, model_file)

        graph = tf.get_default_graph()
        prediction = graph.get_tensor_by_name("Cast:0")

        mlb = MultiLabelBinarizer(config["labels"])
        mlb.fit(config["labels"])

        num_patches = len(glob.glob(f"{directory}/patches/*"))
        for batch_number in range(math.ceil(num_patches /
                                            config["batch_size"])):
            try:
                batch_dict = sess.run(iterator_ins)
                sess_res = sess.run([prediction],
                                    feed_dict=model.feed_dict(batch_dict))
                results = mlb.inverse_transform(sess_res[0])
            except tf.errors.OutOfRangeError:
                pass

            for index, patch in enumerate(batch_dict["patch_name"].values):
                if results[index]:
                    data = {}
                    data.update(metadata)
                    data["labels"] = results[index]
                    data["location"] = patch_location(directory,
                                                      patch.decode("utf-8"))

                    yield {"_index": "fyp-patches", "_source": data}
Example #29
def test_multilabelencoder(implementation):
    name = 'testmulilabelencoder_me'
    valid_me = MultiLabelBinarizer()
    valid_me.fit([('a', 'b'), ('c', )])
    implementation.save(valid_me, name)
    test_me = implementation.load(name)
    got = test_me.transform([('a', )])
    expected = valid_me.transform([('a', )])
    assert_array_equal(got, expected)
    # test inverse transform
    print(expected)
    inverse_expected = valid_me.inverse_transform(expected)
    print(got)
    inverse_got = test_me.inverse_transform(got)
    assert_array_equal(inverse_got, inverse_expected)
Example #30
class TagPredictor:
    classifier = None
    model = None
    corpus = None

    def __init__(self, classifier, corpus):
        self.classifier = classifier
        self.corpus = corpus

        np.random.seed(500)

        print("Initialized TagPredictor")

    def train(self):
        print("Started training")

        # Transform tags to multilabel format
        self.mlb = MultiLabelBinarizer()
        Y_matrix = self.mlb.fit_transform(self.corpus['Tags'])
        #np.set_printoptions(threshold=np.inf)
        #print(matrix[0])
        print(self.mlb.classes_)
        train, test, Train_Y, Test_Y = train_test_split(self.corpus,
                                                        Y_matrix,
                                                        test_size=0.3,
                                                        shuffle=True)
        Train_X = train['Bag_of_Words']
        Test_X = test['Bag_of_Words']
        #print(Train_X)
        #print(Train_Y)

        self.Tfidf_vect = TfidfVectorizer(max_features=5000)
        self.Tfidf_vect.fit(self.corpus['Bag_of_Words'])
        Train_X_Tfidf = self.Tfidf_vect.transform(Train_X)
        Test_X_Tfidf = self.Tfidf_vect.transform(Test_X)

        self.model = self.classifier()
        self.model.train(Train_X_Tfidf, Train_Y)

        print("Finished training")

    def predict(self, df):
        # return predictions_df, confidence_level
        X = df
        X_Tfidf = self.Tfidf_vect.transform(X)
        matrix, confidenceList = self.model.predict(X_Tfidf)
        labels = self.mlb.inverse_transform(matrix)
        return labels, confidenceList
Example #31
def checkTweet(text):
    """Function to find the gender based on the tweet."""
    labels = []
    train = []
    des = []
    file = Path("gendertext.pickle")
    if not file.exists():
        # Retrieve text and labels for training
        with open("gender-classifier-DFE-791531.csv", encoding="latin-1") as f:
            for row in DictReader(f):
                label = row["gender"]
                labels.append([label])
                train.append(row["text"])
        clean_text = []
        for i in range(0, len(train)):
            clean_text.append(" ".join(SentenceTokeniser.review_to_wordlist(train[i], True)))
            print(i)
        mlb = MultiLabelBinarizer()
        Y = mlb.fit_transform(labels)
        with open('mlb.pickle', 'wb') as f:
            pickle.dump(mlb, f)

        classifier = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('clf', OneVsRestClassifier(LinearSVC()))])
        # Fit the data using the classifier
        classifier.fit(clean_text, Y)
        # Save the classifier as a pickle file
        with open('gendertext.pickle', 'wb') as f:
            pickle.dump(classifier, f)

    des.append(" ".join(SentenceTokeniser.review_to_wordlist(text, True)))

    # Load the classifier saved as a pickle file
    with open('gendertext.pickle', 'rb') as f:
        classifier = pickle.load(f)
    # Predict class
    predicted = classifier.predict(des)
    with open('mlb.pickle', 'rb') as f:
        mlb = pickle.load(f)
    all_labels = mlb.inverse_transform(predicted)
    try:
        return all_labels[0][0]
    except IndexError:
        return "None"
Example #32
def testClassifiers(X_train, y_train, X_test, y_test, multilabel):
    mlb = None
    if multilabel:
        mlb = MultiLabelBinarizer()
        mlb = mlb.fit(y_train)
        y_train = mlb.transform(y_train)
        y_test = mlb.transform(y_test)

    results = []
    for clf, clf_name in (
            #(RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            #(Perceptron(n_iter=50), "Perceptron"),
            #(PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            #(KNeighborsClassifier(n_neighbors=10), "kNN"),
            #(KNeighborsClassifier(n_neighbors=8, algorithm='brute', metric='cosine'), "kNN cosine"),
            #(RandomForestClassifier(n_estimators=100), "Random forest"),
            #(LinearSVC(penalty="l2", dual=False, tol=1e-3), "Linear SVC [l2]"),
            #(LinearSVC(penalty="l1", dual=False, tol=1e-3), "Linear SVC [l1]"),
            #(SGDClassifier(alpha=.0001, n_iter=50, penalty="l2"), "SGD Classifier [l2]"),
            #(SGDClassifier(alpha=.0001, n_iter=50, penalty="l1"), "SGD Classifier [l1]"),
        (SGDClassifier(alpha=.001, n_iter=50,
                       penalty="elasticnet"), "SGD Classifier [elasticnet]"),
            #(NearestCentroid(), "Nearest Centroid"), #not suitable for multilabel
            #(Pipeline([
            #    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
            #    ('classification', LinearSVC(penalty="l2"))]), "Linear SVC [l1 based features]")
    ):
        #print('=' * 80)
        #print(name)
        if multilabel:
            clf = OneVsRestClassifier(clf)
        results.append(
            benchmark(clf, clf_name, X_train, y_train, X_test, y_test,
                      multilabel))

    indices = np.arange(len(results))

    results = [[x[i] for x in results] for i in range(len(results[0]))]
    if multilabel:
        preds = results[2]
        for i, pred in enumerate(preds):
            preds[i] = mlb.inverse_transform(pred)

    return results
    """
Example #33
class ACMClassificator(BaseACMClassificator):
    def __init__(self):
        self.vectorizer = CountVectorizer(min_df=0.05, max_df=0.45, tokenizer=tokenize)
        self.mlb = MultiLabelBinarizer()
        self.classificator = OneVsRestClassifier(SVC(), n_jobs=-1)

    def _prepare_problems(self, problems):
        return self.vectorizer.transform([p.statement for p in problems])

    def fit(self, problems, tags):
        nltk.download('punkt', quiet=True)
        self.vectorizer.fit([p.statement for p in problems])
        mat = self._prepare_problems(problems)
        self.mlb = self.mlb.fit(tags)
        self.classificator.fit(mat.toarray(), self.mlb.transform(tags))

    def predict(self, problems):
        mat = self._prepare_problems(problems)
        predicted = self.classificator.predict(mat.toarray())
        return self.mlb.inverse_transform(predicted)
Example #34
def get_classify():
    X_train, Y_train = load_data()

    # define the classifier
    classifier = Pipeline([
        ('counter', CountVectorizer(tokenizer=jieba_tokenizer)),  # tokenize and count; vectorizes the text features
        ('tfidf', TfidfTransformer()),                            # TF-IDF weighting
        ('clf', OneVsRestClassifier(LinearSVC())),                # one-vs-rest multi-class (multi-label)
    ])
    mlb = MultiLabelBinarizer()
    Y_train = mlb.fit_transform(Y_train)                          # binarize the category labels

    classifier.fit(X_train, Y_train)

    # X_test = ["数据分析"]
    # collect all the test documents into one list
    test_list = []
    test_name = []
    filelist2 = os.listdir(base_path + "data_test/")
    for files in filelist2:
        # print (files)
        test_name.append(files)
        f = open(base_path + "data_test/" + files, 'r')
        test_list.append(f.read())

    prediction = classifier.predict(test_list)
    result = mlb.inverse_transform(prediction)

    f = open('result2.txt', 'w')
    for i in range(len(test_name)):
        f.write(str(test_name[i]) + '   ' + str(result[i]) + '\n')

    print(result, len(result))
    num_dict = Counter(result)
    print(len(num_dict))
    print((num_dict[('1',)] + num_dict[('2',)] + num_dict[('3',)]) / float(len(result)))  # integer / integer truncates, so one operand is cast to float
Example #35
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# output_file_names and combination are defined elsewhere in the project.

data_root = "/Users/erdicalli/dev/workspace/yelp/submission/submissions/"

mlb = MultiLabelBinarizer()
total_labels = list()
for idx, file in enumerate(output_file_names):
    f = pd.read_csv(data_root + "merged_" + output_file_names[idx] + ".csv")
    labels = np.array([list(y.replace(" ", "")) for y in f["labels"]])
    total_labels.append(mlb.fit_transform(labels))

result_labels = np.ndarray(shape=(10000, 9))

for label_id, algorithm in enumerate(combination):
    result_labels[:, label_id] = total_labels[algorithm][:, label_id]

labels = mlb.inverse_transform(result_labels)

test_data_frame = pd.read_csv(data_root + "merged_" + output_file_names[4] + ".csv")
df = pd.DataFrame(columns=['business_id', 'labels'])

for i in range(len(test_data_frame)):
    biz = test_data_frame.loc[i]['business_id']
    label = labels[i]
    label = str(label)[1:-1].replace(",", " ")
    df.loc[i] = [str(biz), label]

with open(data_root + "combined_results.csv", 'w') as f:
    df.to_csv(f, index=False)
Example #36
y_map_cate = ml_cate.fit_transform(y_cate)
y_map_cate = np.array(y_map_cate)


f_scores = []
for loop_stat in range(0,1):
    scores = []
    report_y_actual = []
    report_y_predict = []
    kf = cross_validation.KFold(tfidf_train.shape[0], n_folds=5, shuffle=True)
    loop = 0
    for train_index, test_index in kf:
        x_train, x_test = tfidf_train[train_index].toarray(), tfidf_train[test_index].toarray()
        y_train_cate_map, y_test_cate_map = y_map_cate[train_index], y_map_cate[test_index]
        y_train_code_map,y_test_code_map = y_map[train_index], y_map[test_index]
        y_train_code, y_test_code = np.array(ml.inverse_transform(y_train_code_map)),np.array(ml.inverse_transform(y_test_code_map))
        y_train_cate,y_test_cate = np.array(ml_cate.inverse_transform(y_train_cate_map)),np.array(ml_cate.inverse_transform(y_test_cate_map))
        # classify the category
        model_cate = OneVsRestClassifier(LogisticRegression())
        model_cate.fit(x_train, y_train_cate_map)
        y_predict_cate_map = model_cate.predict(x_test)
        y_predict_cate = np.array(ml_cate.inverse_transform(y_predict_cate_map))
        y_predict_cate_unique = reduce(lambda a, b: set(a) | set(b), y_predict_cate)
        for cate_cur in y_predict_cate_unique:
            if cate_cur not in defaultcode:
                y_text_new,y_predict_new = transfer_multilabel(y_predict_cate_map,y_test_cate_map,ml_cate,None,"0")
                report_y_predict.extend(y_predict_new)
                report_y_actual.extend(y_text_new)
            else:
                continue
                idx_test_cur = [ind for ind in range(0,len(y_predict_cate)) if cate_cur in y_predict_cate[ind]]
Example #37
def run(options):
    DATA_PATHS = json.load(options.key_file)
    VERBOSE = options.verbose
    persister = Persister(DATA_PATHS, options)
    if options.persist and persister.is_saved():
        X, Y, tr = persister.read()
        if VERBOSE: print("Y = " + str(Y.shape))
    else:
        # --- LOAD DATA ---
        X_raw, Y_raw, tr = load_dataset(DATA_PATHS, options.data_key, options.fulltext)
        if options.toy_size < 1:
            if VERBOSE: print("Just toying with %d%% of the data." % (options.toy_size * 100))
            zipped = list(zip(X_raw, Y_raw))
            random.shuffle(zipped)
            X_raw, Y_raw = zip(*zipped)
            toy_slice = int(options.toy_size * len(X_raw))
            X_raw, Y_raw = X_raw[:toy_slice], Y_raw[:toy_slice]

        if options.verbose: print("Binarizing labels...")
        mlb = MultiLabelBinarizer(sparse_output=True, classes=[i[1] for i in sorted(
            tr.index_nodename.items())] if options.hierarch_f1 else None)
        Y = mlb.fit_transform(Y_raw)
        if VERBOSE: print("Y = " + str(Y.shape))

        # --- EXTRACT FEATURES ---
        input_format = 'filename' if options.fulltext else 'content'
        concept_analyzer = SynsetAnalyzer().analyze if options.synsets \
            else ConceptAnalyzer(tr.thesaurus, input=input_format, persist=options.persist and options.concepts,
                persist_dir=options.persist_to, repersist=options.repersist,
                file_path=DATA_PATHS[options.data_key]['X']).analyze
        terms = CountVectorizer(input=input_format, stop_words='english', binary=options.binary,
                                token_pattern=word_regexp)
        concepts = CountVectorizer(input=input_format, analyzer=concept_analyzer, binary=options.binary,
                                   vocabulary=tr.nodename_index if not options.synsets else None)

        if options.hierarchical:
            hierarchy = tr.nx_graph
            if options.prune_tree:
                if VERBOSE: print("[Pruning] Asserting tree hierarchy...")
                old_edge_count = hierarchy.number_of_edges()
                hierarchy = nx.bfs_tree(hierarchy, tr.nx_root)
                pruned = old_edge_count - hierarchy.number_of_edges()
                if VERBOSE: print("[Pruning] Pruned %d of %d edges (%.2f) to assert a tree hierarchy" % (pruned, old_edge_count, pruned/old_edge_count))

            if options.hierarchical == "bell":
                activation = SpreadingActivation(hierarchy, decay=1, weighting="bell", root=tr.nx_root)
            elif options.hierarchical == "belllog":
                activation = SpreadingActivation(hierarchy, decay=1, weighting="belllog", root=tr.nx_root)
            elif options.hierarchical == "children":
                # weights are already initialized with 1/out_degree, so use basic SA with decay 1
                activation = SpreadingActivation(hierarchy, decay=1, weighting="children")
            elif options.hierarchical == "binary":
                activation = BinarySA(hierarchy)
            elif options.hierarchical == "onehop":
                activation = OneHopActivation(hierarchy, verbose=VERBOSE)
            else:
                #  basic
                activation = SpreadingActivation(tr.nx_graph, firing_threshold=1.0, decay=0.25, weighting=None)
            concepts = make_pipeline(concepts, activation)

        if options.graph_scoring_method:
            extractor = GraphVectorizer(
                method=options.graph_scoring_method,
                analyzer=concept_analyzer if options.concepts else NltkNormalizer().split_and_normalize)
        elif options.terms and (options.concepts or options.synsets):
            extractor = FeatureUnion([("terms", terms), ("concepts", concepts)])
        elif options.terms:
            extractor = terms
        else:
            extractor = concepts

        if VERBOSE: print("Extracting features...")
        if VERBOSE > 1: start_ef = default_timer()
        X = extractor.fit_transform(X_raw)
        if VERBOSE > 1: print(default_timer() - start_ef)
        if options.persist:
            persister.persist(X, Y, tr)

    if VERBOSE:
        print("X = " + repr(X))
        print("Vocabulary size: {}".format(X.shape[1]))
        print("Number of documents: {}".format(X.shape[0]))
        print("Mean distinct words per document: {}".format(X.count_nonzero() /
                                                    X.shape[0]))
        words = X.sum(axis=1)
        print("Mean word count per document: {} ({})".format(words.mean(), words.std()))

    if VERBOSE > 1:
        X_tmp = X.todense()
        # drop samples without any features...
        X_tmp = X_tmp[np.unique(np.nonzero(X_tmp)[0])]
        print("[entropy] Dropped {} samples with all zeroes?!".format(X.shape[0] - X_tmp.shape[0]))
        X_tmp = X_tmp.T # transpose to compute entropy per sample
        h = entropy(X_tmp)
        print("[entropy] shape:", h.shape)
        print("[entropy] mean entropy per sample {} ({})".format(h.mean(), h.std()))
        # print("Mean entropy (base {}): {}".format(X_dense.shape[0], entropy(X_dense, base=X_dense.shape[0]).mean()))
        # print("Mean entropy (base e): {}".format(entropy(X_dense).mean()))
    # _, _, values = sp.find(X)
    # print("Mean value: %.2f (+/- %.2f) " % (values.mean(), 2 * values.std()))


    # n_iter = np.ceil(10**6 / (X.shape[0] * 0.9))
    # print("Dynamic n_iter = %d" % n_iter)


    if options.interactive:
        print("Please wait...")
        clf = create_classifier(options, Y.shape[1])  # --- INTERACTIVE MODE ---
        clf.fit(X, Y)
        thesaurus = tr.thesaurus
        print("Ready.")
        try:
            for line in sys.stdin:
                x = extractor.transform([line])
                y = clf.predict(x)
                desc_ids = mlb.inverse_transform(y)[0]
                labels = [thesaurus[desc_id]['prefLabel'] for desc_id in desc_ids]
                print(*labels)
        except KeyboardInterrupt:
            exit(1)
        exit(0)

    if VERBOSE: print("Performing %d-fold cross-validation..." % (options.folds if options.cross_validation else 1))

    if options.plot:
        all_f1s = []

    # --- CROSS-VALIDATION ---
    scores = defaultdict(list)
    if options.cross_validation:
        kf = model_selection.KFold(n_splits=options.folds, shuffle=True)
    else:
        kf = model_selection.ShuffleSplit(n_splits=1, test_size=options.test_size)
    for train, test in kf.split(X):
        if VERBOSE: print("=" * 80)
        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]

        # mlp doesn't seem to like being stuck into a new process...
        if options.debug or options.clf_key in {'mlp', 'mlpthr'}:
            Y_pred, Y_train_pred = fit_predict(X_test, X_train, Y_train, options, tr)
        else:
            Y_pred, Y_train_pred = fit_predict_new_process(X_test, X_train, Y_train, options, tr)

        if options.training_error:
            scores['train_f1_samples'].append(f1_score(Y_train, Y_train_pred, average='samples'))

        scores['avg_n_labels_pred'].append(np.mean(Y_pred.getnnz(1)))
        scores['avg_n_labels_gold'].append(np.mean(Y_test.getnnz(1)))
        scores['f1_samples'].append(f1_score(Y_test, Y_pred, average='samples'))
        scores['p_samples'].append(precision_score(Y_test, Y_pred, average='samples'))
        scores['r_samples'].append(recall_score(Y_test, Y_pred, average='samples'))
        scores['f1_micro'].append(f1_score(Y_test, Y_pred, average='micro'))
        scores['p_micro'].append(precision_score(Y_test, Y_pred, average='micro'))
        scores['r_micro'].append(recall_score(Y_test, Y_pred, average='micro'))
        scores['f1_macro'].append(f1_score(Y_test, Y_pred, average='macro'))
        scores['p_macro'].append(precision_score(Y_test, Y_pred, average='macro'))
        scores['r_macro'].append(recall_score(Y_test, Y_pred, average='macro'))
        if options.plot:
            all_f1s.append(f1_per_sample(Y_test, Y_pred))

        if options.worst:
            f1s = f1_per_sample(Y_test, Y_pred)
            predicted_labels = [[tr.thesaurus[l]['prefLabel'] for l in y] for y in mlb.inverse_transform(Y_pred)]
            gold_labels = [[tr.thesaurus[l]['prefLabel'] for l in Y_raw[i]] for i in test]
            f1s_ids = sorted(zip(f1s, [X_raw[i] for i in test], gold_labels, predicted_labels))
            pprint(f1s_ids[:options.worst])

        if options.hierarch_f1:
            scores['hierarchical_f_score'].append(
                hierarchical_f_measure(tr, Y_test, Y_pred))

        if options.cross_validation and VERBOSE:
            print(' <> '.join(["%s : %0.3f" % (key, values[-1]) for key, values in sorted(scores.items())]))
            # if options.lsa:
            #     if VERBOSE: print("Variance explained by SVD:", svd.explained_variance_ratio_.sum())

    if VERBOSE: print("=" * 80)

    results = {key: (np.array(values).mean(), np.array(values).std()) for key, values in scores.items()}

    print(' <> '.join(["%s: %0.3f (+/- %0.3f)" % (key, mean, std) for key, (mean, std) in sorted(results.items())]))

    if options.output_file:
        write_to_csv(results, options)

    if options.plot:
        Y_f1 = np.hstack(all_f1s)
        Y_f1.sort()
        if VERBOSE:
            print("Y_f1.shape:", Y_f1.shape, file=sys.stderr)
            print("Saving f1 per document as txt numpy to", options.plot)
        np.savetxt(options.plot, Y_f1)

    return results
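The cross-validation body above reduces to a small reusable pattern. The sketch below reproduces it on synthetic data, using only standard scikit-learn APIs (no claims about the project's own helpers; `zero_division=0` assumes scikit-learn >= 0.22):

import numpy as np
from collections import defaultdict
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

X = np.random.rand(60, 5)
Y = (np.random.rand(60, 3) > 0.5).astype(int)  # multi-label indicator matrix

scores = defaultdict(list)
for train, test in KFold(n_splits=3, shuffle=True).split(X):
    clf = OneVsRestClassifier(LogisticRegression()).fit(X[train], Y[train])
    Y_pred = clf.predict(X[test])
    for avg in ('samples', 'micro', 'macro'):
        scores['f1_' + avg].append(f1_score(Y[test], Y_pred, average=avg, zero_division=0))

print({k: float(np.mean(v)) for k, v in scores.items()})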
Exemplo n.º 38
0
printF1scores()

t = time.time()

binarizer = MultiLabelBinarizer()
# the list of label lists is converted to a binary indicator matrix
y_train = binarizer.fit_transform(y_train)

random_state = np.random.RandomState(0)
svmclassifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
svmclassifier.fit(X_train, y_train)

y_predict = svmclassifier.predict(X_test)

#Binary matrix is converted to labels
y_predict_label = binarizer.inverse_transform(y_predict) 

print "Elaspsed Time: ", "{0:.1f}".format(time.time()-t), "sec"

tdf = pd.read_csv(path_to_data + "test_biz_fc8features.csv")
df = pd.DataFrame(columns=['business_id','labels'])

for i in range(len(tdf)):
    biz = tdf.loc[i]['business']
    label = y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    df.loc[i] = [str(biz), label]

with open(path_to_data+"submission_fc8.csv",'w') as file67:
    df.to_csv(file67, index=False)   
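The `str(label)[1:-1].replace(",", " ")` trick above leans on tuple repr details (and leaves double spaces). A join over the tuple, sketched below, produces a clean single-spaced version of the same string:

label_tuple = (1, 3, 5)  # one entry from binarizer.inverse_transform(...)
label_str = " ".join(str(k) for k in label_tuple)
print(label_str)         # "1 3 5"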
Exemplo n.º 39
0
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(stories)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(train_counts)
X_train_tfidf = tfidf_transformer.transform(train_counts)

#format tags
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
tag_list = preprocess_tags(tags)
processed_tags = mlb.fit_transform(tag_list)
print(processed_tags)
#train the classifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
clf = OneVsRestClassifier(MultinomialNB())
clf.fit(X_train_tfidf,processed_tags)

test_docs = ["funny funny joke", "died sad joke tragedy funny", "lasers and robots"]
X_test_counts = count_vect.transform(test_docs)
print("X_test_counts.shape:", X_test_counts.shape)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted = clf.predict(X_test_tfidf)
print(predicted)
print(mlb.inverse_transform(predicted))
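The count -> tf-idf -> one-vs-rest chain above can also be expressed as a single scikit-learn Pipeline. A sketch, reusing `stories`, `processed_tags`, `test_docs`, and `mlb` from the snippet:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline([
    ("counts", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", OneVsRestClassifier(MultinomialNB())),
])
pipe.fit(stories, processed_tags)
print(mlb.inverse_transform(pipe.predict(test_docs)))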
Exemplo n.º 40
0
##################


# In[ ]:

classifier.fit(train_business_feature, y_ptrain_mlb)

test_business_feature = pd.read_csv(data_root+'test_business_feature'+cluster +'.csv')
business_id = test_business_feature['business_id'].values.reshape(-1, 1)
test_business_feature.drop('business_id', axis=1, inplace=True)
y_predict_test = classifier.predict(test_business_feature)


# In[ ]:

y_predict_label = mlb.inverse_transform(y_predict_test)

df = pd.DataFrame(columns=['business_id','labels'])

for i in range(len(y_predict_label)):
    biz = business_id[i][0]
    label = y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    df.loc[i] = [str(biz), label]

with open(data_root+"sub_pca300.csv",'w') as f:
    df.to_csv(f, index=False) 
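As an aside, the open()/to_csv pairing above can be collapsed, since to_csv accepts a path directly (sketch, same `df` and `data_root`):

df.to_csv(data_root + "sub_pca300.csv", index=False)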



X_train_scaled_Concat = np.hstack((X_train_scaled,X_train_scaled_Res))
X_test_scaled = preprocessing.normalize(X_test, norm='l2')
X_test_scaled_Res = preprocessing.normalize(X_test_Res, norm='l2')
X_test_scaled_Concat = np.hstack((X_test_scaled,X_test_scaled_Res))

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)  # convert the list of labels to a binary indicator matrix

random_state = np.random.RandomState(0)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(X_train_scaled_Concat, y_train)

y_predict = classifier.predict(X_test_scaled_Concat)

#print list(mlb.classes_)
y_predict_label = mlb.inverse_transform(y_predict) #Convert binary matrix back to labels

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

test_data_frame  = pd.read_csv(data_root+"test_biz_fc7features.csv") #fc7features and fc1000features have same business names
df = pd.DataFrame(columns=['business_id','labels'])

for i in range(len(test_data_frame)):
    biz = test_data_frame.loc[i]['business']
    label = y_predict_label[i]
    label = str(label)[1:-1].replace(",", " ")
    df.loc[i] = [str(biz), label]

with open(data_root+"submission_fc7_fc1000_norm.csv",'w') as f:
    df.to_csv(f, index=False)
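The normalize-then-concatenate feature pattern used above reduces to a few lines; the sketch below runs it on synthetic arrays (the real inputs are the fc7/fc1000 CNN features):

import numpy as np
from sklearn import preprocessing

X_a = np.random.rand(4, 8)
X_b = np.random.rand(4, 8)
X_concat = np.hstack((preprocessing.normalize(X_a, norm='l2'),
                      preprocessing.normalize(X_b, norm='l2')))
print(X_concat.shape)  # (4, 16): each block is row-normalized separately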
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2, random_state=random_state)

print("About to start training classifier with set parameters on subset of train data")
classifier = OneVsRestClassifier(GradientBoostingClassifier(learning_rate=0.01, n_estimators=5000, subsample=0.5,
                                                            min_samples_split=175, min_samples_leaf=10, max_depth=5,
                                                            max_features='sqrt',
                                                            verbose=1,
                                                            random_state=SEED))
classifier.fit(X_ptrain, y_ptrain)

print("About to make predictions on sample of training data")
y_ppredict = classifier.predict(X_ptest)

print("Time passed: {0:.1f} sec".format(time.time() - t))
print("Samples of predicted labels (in binary matrix):\n{}".format(y_ppredict[0:3]))
print("\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3]))
statistics = pd.DataFrame(columns=["attribute " + str(i) for i in range(9)] + ['num_biz'],
                          index=["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), len(y_ppredict))
statistics.loc["biz ratio"] = statistics.loc["biz count"] * 100 / len(y_ppredict)
pd.options.display.float_format = '{:.0f}%'.format
print(statistics)
print("F1 score: {}".format(f1_score(y_ptest, y_ppredict, average='micro')))
print("Individual Class F1 score: {}".format(f1_score(y_ptest, y_ppredict, average=None)))


# Re-Train classifier using all training data, and make predictions on test set
t = time.time()

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)  # Convert list of labels to binary matrix

print("About to train classifier on all training data (to have it ready to predict on submission test data)")
import time
t = time.time()

mlb = MultiLabelBinarizer()
y_ptrain= mlb.fit_transform(y_train)  #Convert list of labels to binary matrix

random_state = np.random.RandomState(0)
X_ptrain, X_ptest, y_ptrain, y_ptest = train_test_split(X_train, y_ptrain, test_size=.2,random_state=random_state)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True)) #F1 score:  0.803711220644
#classifier = OneVsOneClassifier(svm.SVC(kernel='linear', probability=True))
#classifier = OutputCodeClassifier(svm.SVC(kernel='linear', probability=True))
classifier.fit(X_ptrain, y_ptrain)

y_ppredict = classifier.predict(X_ptest)

print "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

print "Samples of predicted labels (in binary matrix):\n", y_ppredict[0:3]
print "\nSamples of predicted labels:\n", mlb.inverse_transform(y_ppredict[0:3])


statistics = pd.DataFrame(columns=[ "attribuite "+str(i) for i in range(9)]+['num_biz'], index = ["biz count", "biz ratio"])
statistics.loc["biz count"] = np.append(np.sum(y_ppredict, axis=0), len(y_ppredict))
pd.options.display.float_format = '{:.0f}%'.format
statistics.loc["biz ratio"] = statistics.loc["biz count"]*100/len(y_ppredict)
statistics

from sklearn.metrics import f1_score

print "F1 score: ", f1_score(y_ptest, y_ppredict, average='micro')
print "Individual Class F1 score: ", f1_score(y_ptest, y_ppredict, average=None)
Exemplo n.º 44
0
print "Calculating Predictions..."

files = ["xaa", "xab", "xac", "xad", "xae", "xaf"]
header = True
for chunk in files:
    t = time.time()
    print "chunk: " + chunk
    test_df = pd.read_csv(data_root + chunk)
    # test_features = test_df['feature vector'].values
    test_features = np.array([convert_feature_to_vector(x) for x in test_df['feature vector']])
    test_features = normalize(np.append(normalize(test_features[:, :8192]), normalize(test_features[:, 8192:]), axis=1))
    reduced_test_features = model.transform(test_features)

    binarized_predicted_labels = classifier.predict(reduced_test_features)

    predicted_labels = mlb.inverse_transform(binarized_predicted_labels)

    print "Calculated Predictions... Time passed: ", "{0:.1f}".format(time.time() - t), "sec"
    print "Writing predictions to output file"
    test_data_frame = pd.read_csv(data_root + chunk)
    df = pd.DataFrame(columns=['business_id', 'labels'])

    for i in range(len(test_data_frame)):
        biz = test_data_frame.loc[i]['business']
        label = predicted_labels[i]
        label = str(label)[1:-1].replace(",", " ")
        df.loc[i] = [str(biz), label]

    if header:
        with open(submission_root + "reduced_" + output_file_name + ".csv", 'w') as f:
            df.to_csv(f, index=False, header=True)
        header = False  # only the first chunk writes the header
    else:
        with open(submission_root + "reduced_" + output_file_name + ".csv", 'a') as f:
            df.to_csv(f, index=False, header=False)
#     ('tfidf', TfidfTransformer()),
#     ('to_dense', DenseTransformer()),
#     ('clf', OneVsRestClassifier(tree.DecisionTreeClassifier()))])


print '7th print'

gc.collect()
classifier.fit(X_train, Y)

print '8th print'

predicted = classifier.predict(X_test)
predicted_probability = classifier.predict_proba(X_test)

all_labels = mlb.inverse_transform(predicted)
results = predicted_probability[0]  # class probabilities for the first test sample


# gets a dictionary of {'class_name': probability} for the first sample;
# the probability columns line up with mlb.classes_, not with the predicted label tuples
prob_per_class_dictionary = dict(zip(mlb.classes_, results))

# gets a list of ['most_probable_class', 'second_most_probable_class', ..., 'least_class']
results_ordered_by_probability = map(lambda x: x[0], sorted(zip(mlb.classes_, results), key=lambda x: x[1], reverse=True))

print results_ordered_by_probability

# for item, labels, probability in zip(X_test, all_labels,predicted_probability):
#     #print '%s => %s, %s' % (item, ', '.join(labels),str(probability))
#     output_file_object.write('%s => %s, %s' % (item, ', '.join(labels),str(probability))+'\n')
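Finally, a small helper (a sketch; `predicted_probability` and `mlb` as above) that extracts the top-k labels per sample from the probability matrix, whose columns are aligned with mlb.classes_:

import numpy as np

def top_k_labels(proba_row, classes, k=3):
    # indices of the k largest probabilities, highest first
    order = np.argsort(proba_row)[::-1][:k]
    return [(classes[i], float(proba_row[i])) for i in order]

# usage: top_k_labels(predicted_probability[0], mlb.classes_)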