Example #1
def model_one_with_all(df):
    classifier_params = {}
    for category in collect_data.LABELS:
        classifier = df.copy()  # work on a fresh copy of the DataFrame
        # relabel every document outside the current category as 'Khac' ("other")
        classifier.loc[classifier['label'] != category, 'label'] = 'Khac'
        size_category = classifier[classifier['label'] ==
                                   category].shape[0]
        size_others = classifier[classifier['label'] == 'Khac'].shape[0]
        print('Number of documents in category %s: %s' %
              (category, size_category))
        print('Number of other documents: %s' % size_others)
        train_y = classifier['label']
        train_x = classifier['text']

        # no train/test split here: the whole set is used for training

        # label encode the target variable, encode labels to 0 or 1
        encoder = preprocessing.LabelEncoder()
        train_y = encoder.fit_transform(train_y)

        # word level tf-idf
        tfidf_vect = TfidfVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     stop_words=collect_data.get_stop_word(),
                                     max_features=5000)
        tfidf_vect.fit(train_x)
        xtrain_tfidf = tfidf_vect.transform(train_x)

        # shape of the transformed training data
        print('Number of training documents: %s' % str(xtrain_tfidf.shape[0]))
        print('Number of features of each document: %s' %
              str(xtrain_tfidf.shape[1]))
        print('xtrain_tfidf shape: %s' % str(xtrain_tfidf.shape))
        print('train_y shape: %s' % str(train_y.shape))

        ### START CODE HERE ###
        train_y = np.expand_dims(train_y, axis=0)

        # for convenience in this exercise, we also use toarray() to convert
        # sparse to dense matrix
        xtrain_tfidf = xtrain_tfidf.T.toarray()
        ### END CODE HERE ###

        # New shape
        print('xtrain_tfidf shape: %s' % str(xtrain_tfidf.shape))
        print('train_y shape: %s' % str(train_y.shape))

        # train a binary logistic-regression model for this category
        d = functions.model_one_vs_all(xtrain_tfidf,
                                       train_y,
                                       num_iterations=3000,
                                       learning_rate=.5,
                                       print_cost=True)

        classifier_params[category] = {"w": d['w'], 'b': d['b']}

    return classifier_params
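
The dictionary returned above maps each label to the weights w and bias b learned by the project-local functions.model_one_vs_all trainer (not shown here). Below is a minimal, hypothetical sketch of how those parameters could be used to score a new document, assuming the same fitted tfidf_vect and that w has shape (n_features, 1); predict_category is an illustrative name, not part of the project.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def predict_category(text, tfidf_vect, classifier_params):
    # hypothetical one-vs-all scorer: pick the label whose binary
    # classifier assigns the highest probability to this document
    x = tfidf_vect.transform([text]).T.toarray()  # shape (n_features, 1)
    scores = {}
    for category, params in classifier_params.items():
        z = np.dot(params['w'].T, x) + params['b']
        scores[category] = sigmoid(z).item()
    return max(scores, key=scores.get), scores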
Example #2
def multi_classifiers():
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    losses = []
    auc = []

    for category in collect_data.LABELS:
        classifier = df.copy()
        classifier.loc[classifier['label'] != category, 'label'] = 'Khac'
        train_x, test_x, train_y, test_y = model_selection.train_test_split(
            classifier['text'], classifier['label'])

        tfidf_vect = TfidfVectorizer(analyzer='word',
                                     token_pattern=r'\w{1,}',
                                     stop_words=collect_data.get_stop_word(),
                                     max_features=5000)
        # fit the vectorizer on the training split only so the test
        # vocabulary does not leak into the features
        tfidf_vect.fit(train_x)
        xtrain_tfidf = tfidf_vect.transform(train_x)
        xtest_tfidf = tfidf_vect.transform(test_x)

        logistic_classifier = LogisticRegression(multi_class='ovr',
                                                 solver='sag',
                                                 C=10)

        cv_loss = np.mean(
            cross_val_score(logistic_classifier,
                            xtrain_tfidf,
                            train_y,
                            cv=5,
                            scoring='neg_log_loss'))
        losses.append(cv_loss)
        print('CV Log_loss score for class {} is {}'.format(category, cv_loss))

        cv_score = np.mean(
            cross_val_score(logistic_classifier,
                            xtrain_tfidf,
                            train_y,
                            cv=5,
                            scoring='accuracy'))
        print('CV Accuracy score for class {} is {}'.format(
            category, cv_score))

        logistic_classifier.fit(xtrain_tfidf, train_y)
        y_pred = logistic_classifier.predict(xtest_tfidf)
        # column 1 of predict_proba corresponds to logistic_classifier.classes_[1]
        y_pred_prob = logistic_classifier.predict_proba(xtest_tfidf)[:, 1]
        auc_score = metrics.roc_auc_score(test_y, y_pred_prob)
        auc.append(auc_score)
        print("CV ROC_AUC score {}\n".format(auc_score))

        print(confusion_matrix(test_y, y_pred))
        print(classification_report(test_y, y_pred))
    print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
    print('Total average test ROC_AUC score is {}'.format(np.mean(auc)))
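
The loop above builds one-vs-rest by hand, one binary classifier per label. For comparison, here is a minimal sketch of the same idea using scikit-learn's built-in OneVsRestClassifier on the original multi-class labels; the TF-IDF settings mirror the ones above, and ovr_baseline is an illustrative name, not part of the project.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

def ovr_baseline(df, stop_words=None):
    # a single pipeline replaces the manual per-category loop
    pipe = make_pipeline(
        TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                        stop_words=stop_words, max_features=5000),
        OneVsRestClassifier(LogisticRegression(solver='sag', C=10)))
    scores = cross_val_score(pipe, df['text'], df['label'], cv=5, scoring='accuracy')
    return scores.mean()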
Example #3
def train_tuning():
    df = collect_data.readData(TRAIN_DATA, TRAINING_FILE, 1500)
    train_x, test_x, train_y, test_y = model_selection.train_test_split(
        df['text'], df['label'])
    if os.path.isfile('./data/model_train'):
        # reload a previously trained grid search instead of retraining
        with open("./data/model_train", 'rb') as vec:  # rb = read in binary mode
            grid3 = pickle.load(vec)
    else:
        start_time = time.time()
        pipe = make_pipeline(
            TfidfVectorizer(analyzer='word',
                            token_pattern=r'\w{1,}',
                            stop_words=collect_data.get_stop_word()),
            OneVsRestClassifier(LogisticRegression()))
        param_grid = {
            'tfidfvectorizer__max_features': [5000, 10000],
            'onevsrestclassifier__estimator__solver': ['liblinear', 'sag'],
        }
        grid = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy')

        grid3 = grid.fit(train_x, train_y)

        end_time = time.time()
        print("total time", end_time - start_time)

        # persist the fitted grid search so later runs can skip retraining
        with open("./data/model_train", 'wb') as save_classifier:  # wb = write in binary mode
            pickle.dump(grid3, save_classifier)

    print(grid3.best_estimator_.named_steps['onevsrestclassifier'])
    print(grid3.best_estimator_.named_steps['tfidfvectorizer'])

    print(grid3.best_params_)
    print(grid3.best_score_)
    predicted_y_test = grid3.predict(test_x)

    X_test_list = test_x.tolist()
    predicted_y_test_list = predicted_y_test.tolist()

    save = pd.DataFrame(np.column_stack([X_test_list, predicted_y_test_list]))
    save.to_csv("./data/result_trained.csv",
                sep=',',
                encoding='utf-16',
                header=True,
                index=False)
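
Once ./data/model_train has been written, the pickled GridSearchCV object can be reloaded and applied to new text directly, because its best pipeline begins with the TfidfVectorizer. A minimal sketch under that assumption; evaluate_saved_model is an illustrative name, not part of the project.

import pickle
from sklearn.metrics import classification_report

def evaluate_saved_model(texts, labels, model_path="./data/model_train"):
    # reload the pickled grid search and score it on held-out data;
    # predict() accepts raw text since the pipeline handles vectorization
    with open(model_path, 'rb') as f:
        grid = pickle.load(f)
    predictions = grid.predict(texts)
    print(grid.best_params_, grid.best_score_)
    print(classification_report(labels, predictions))
    return predictions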
Example #4
def clustering_word():
    data = collect_data.readData(TRAIN_DATA, TRAINING_FILE)
    tfidf_vect = TfidfVectorizer(
        analyzer='word',
        token_pattern=
        r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
        lowercase=True,
        ngram_range=(1, 4),
        stop_words=collect_data.get_stop_word(),
        max_features=10000)
    count_train = tfidf_vect.fit(data["text"])
    # bag_of_words = tfidf_vect.transform(data)
    # feature_names = np.array(count_train.get_feature_names())
    print(count_train.get_feature_names(),
          len(count_train.get_feature_names()))
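
clustering_word() only prints the vocabulary the vectorizer learned. Note that get_feature_names() was removed in scikit-learn 1.2; below is a small sketch of the same vocabulary preview written against current releases, with the stop-word list passed in as a parameter rather than taken from collect_data, and vocabulary_preview as an illustrative name.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def vocabulary_preview(texts, stop_words=None, max_features=10000):
    # fit a TF-IDF vectorizer and show the learned vocabulary;
    # get_feature_names_out() replaces the older get_feature_names()
    vect = TfidfVectorizer(analyzer='word',
                           lowercase=True,
                           ngram_range=(1, 4),
                           stop_words=stop_words,
                           max_features=max_features)
    vect.fit(texts)
    terms = np.array(vect.get_feature_names_out())
    print(terms, len(terms))
    return terms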
Example #5
def get_top(data, top=10):
    tfidf_vect = TfidfVectorizer(
        analyzer='word',
        token_pattern=
        r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
        lowercase=True,
        ngram_range=(1, 4),
        stop_words=collect_data.get_stop_word(),
        max_features=10000)
    count_train = tfidf_vect.fit(data)
    bag_of_words = tfidf_vect.transform(data)
    feature_names = np.array(count_train.get_feature_names())
    # highest TF-IDF weight of each term across all documents
    max_val = bag_of_words.max(axis=0).toarray().ravel()

    # sort weights from smallest to largest and extract their indices
    sort_by_tfidf = max_val.argsort()
    return feature_names[sort_by_tfidf[-top:]]
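
A quick usage sketch for get_top() with a hypothetical three-document toy corpus (collect_data.get_stop_word() is assumed to be importable); the call returns the top terms with the highest per-term maximum TF-IDF weight, ordered from lower to higher.

# toy corpus for illustration only
corpus = [
    "bóng_đá việt_nam thắng trận giao_hữu",
    "thị_trường chứng_khoán giảm điểm mạnh",
    "đội_tuyển bóng_đá chuẩn_bị cho world_cup",
]
print(get_top(corpus, top=5))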
Example #6
def get_top(data, top=10):
    tfidf_vect = TfidfVectorizer(analyzer='word',
                                 token_pattern=r'[a-zA-Z0-9_]+',
                                 lowercase=True,
                                 max_df=0.05,
                                 stop_words=collect_data.get_stop_word(),
                                 max_features=5000)
    count_train = tfidf_vect.fit(data)
    bag_of_words = tfidf_vect.transform(data)
    feature_names = np.array(count_train.get_feature_names())
    print(feature_names)
    # highest TF-IDF weight of each term across all documents
    max_val = bag_of_words.max(axis=0).toarray().ravel()

    #sort weights from smallest to biggest and extract their indices
    sort_by_tfidf = max_val.argsort()
    return feature_names[sort_by_tfidf[-top:]]
Example #7
    rút danh_sách này xuống còn theo đúng quy_định của fifa và thời_hạn để ông làm_việc này
    là trước ngày ngoài cầu_thủ được dự_định sẽ đưa đến đức hlv beenhakker cũng đã quyết_định
    triệu_tập thêm cầu_thủ dự_bị và các cầu_thủ này sẽ được lựa_chọn nếu một trong số cầu_thủ
    chính_thức bất_ngờ bị chấn_thương danh_sách cầu_thủ của trinidad amp tobago thủ_môn
    kelvin_jack dundee shaka_hislop west_ham clayton_ince coventry_city hậu_vệ dennis_lawrence
    wrexham cyd_gray san_juan_jabloteh marvin_andrews rangers brent_sancho gillingham ian_cox
    gillingham atiba_charles w_connection avery_john new_england_revolution tiền_vệ
    silvio_spann unattached chris_birchall port_vale aurtis_whitley san_juan_jabloteh
    anthony_rougier united_petrotrin anthony_wolfe san_juan_jabloteh densill_theobald
    falkirk carlos_edwards luton dwight_yorke sydney_fc russell_latapy falkirk tiền_đạo
    stern_john coventry kenwyne_jones southampton collin_samuel dundee jason_scotland
    st_johnstone cornell_glen la_galaxy dự_bị brent_rahim jabloteh anton_pierre defence_force
    anthony_warner fulham nigel_henry kiruna_ff ricky_shakes swindon hector_sam port_vale
    scott_sealy kansas_wizards"""
    print(get_top([corpus]))
    # print(len(corpus))

    print(get_top_n_words([corpus]))

    # vectorizer = create_vectorizer([corpus])
    vectorizer = TfidfVectorizer(
        analyzer='word',
        token_pattern=
        r'[a-zA-ZàáãạảăắằẳẵặâấầẩẫậèéẹẻẽêềếểễệđìíĩỉịòóõọỏôốồổỗộơớờởỡợùúũụủưứừửữựỳỵỷỹýÀÁÃẠẢĂẮẰẲẴẶÂẤẦẨẪẬÈÉẸẺẼÊỀẾỂỄỆĐÌÍĨỈỊÒÓÕỌỎÔỐỒỔỖỘƠỚỜỞỠỢÙÚŨỤỦƯỨỪỬỮỰỲỴỶỸÝ0-9_]+',
        # max_df=0.05,
        # stop_words='english',
        encoding='utf-16',
        stop_words=collect_data.get_stop_word(),
        max_features=5000)
    tfidf_result = vectorizer.fit_transform([corpus])
    display_scores(vectorizer, tfidf_result)
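
display_scores() is called here but not defined in the snippet. The following is a hypothetical sketch of what such a helper typically does, pairing each vocabulary term with its summed TF-IDF weight and printing the pairs from highest to lowest; it is an assumption about the helper's behavior, not the project's actual implementation.

import numpy as np

def display_scores(vectorizer, tfidf_result):
    # sum each term's TF-IDF weight over all documents, then print
    # term/score pairs sorted from highest to lowest
    terms = vectorizer.get_feature_names()  # get_feature_names_out() in newer scikit-learn
    weights = np.asarray(tfidf_result.sum(axis=0)).ravel()
    for term, score in sorted(zip(terms, weights), key=lambda x: x[1], reverse=True):
        print("{:30s} {:.4f}".format(term, score))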