Exemplo n.º 1
0
    def transform(tsm, sfm, fw_type, feature_weights=None):
        """Apply the selected feature-weighting scheme to a term/sample matrix.

        The function takes no ``self``; it is presumably exposed as a static
        method of ``FeatureWeight`` (the decorator is outside this view).

        Args:
            tsm: Source matrix. Must provide ``get_total_samples()``,
                ``get_total_terms()`` and ``get_categories()``.
            sfm: Target sample-feature matrix, or ``None`` to have a fresh
                ``SampleFeatureMatrix`` created and initialised with the
                categories of ``tsm``.
            fw_type: One of ``FeatureWeight.TFIDF``, ``FeatureWeight.TFRF`` or
                ``FeatureWeight.TFIPNDF``. Any other value leaves the matrix
                unweighted.
            feature_weights: Passed through unchanged to the selected
                ``transform_*`` routine.

        Returns:
            The matrix returned by the selected weighting routine, or the
            (possibly newly created) ``sfm`` when ``fw_type`` is not recognised.
        """
        input_summary = "FeatureWeight.transform() tsm: %d samples %d terms." % (
            tsm.get_total_samples(),
            tsm.get_total_terms(),
        )
        logging.debug(Logger.debug(input_summary))

        matrix = sfm
        if matrix is None:
            # No target supplied: start from an empty matrix that knows the
            # same categories as the source.
            matrix = SampleFeatureMatrix()
            matrix.init_cagegories(tsm.get_categories())

        # Resolve the weighting routine first, then invoke it once.
        if fw_type == FeatureWeight.TFIDF:
            weigh = FeatureWeight.transform_tfidf
        elif fw_type == FeatureWeight.TFRF:
            weigh = FeatureWeight.transform_tfrf
        elif fw_type == FeatureWeight.TFIPNDF:
            weigh = FeatureWeight.transform_tfipndf
        else:
            weigh = None

        if weigh is not None:
            matrix = weigh(tsm, matrix, feature_weights)

        output_shape = (
            matrix.get_num_samples(),
            matrix.get_num_features(),
            matrix.get_num_categories(),
        )
        output_summary = "FeatureWeight.transform(). sfm: %d samples %d terms %d categories." % output_shape
        logging.debug(Logger.debug(output_summary))

        return matrix
Exemplo n.º 2
0
def multicategories_predict(samples_test, model_name, result_dir):
    """Train a classifier from a stored model and run it on test samples.

    The training data is read from ``<model_name>.sfm`` (sample-feature
    matrix) and ``<model_name>.cfm`` (category-feature matrix), located in
    ``result_dir`` when one is given and in the current directory otherwise.
    A test sample-feature matrix is built from ``samples_test`` using the
    feature weights stored in the category-feature matrix, the classifier is
    trained on the loaded matrix, and ``Classifier.predict`` is called with
    the test data and the display names of the first-level categories.

    Args:
        samples_test: Test sample set. Must provide ``tsm`` (with
            ``sample_matrix()`` and ``get_sample_row()``) and
            ``get_categories()``.
        model_name: Base name of the stored model files. Must be non-empty.
        result_dir: Directory holding the model files, or ``None`` to use
            paths relative to the current directory.

    Returns:
        None. The function returns early (after logging) when ``model_name``
        is empty or ``result_dir`` cannot be created.
    """
    if not model_name:
        # logging.warn is a deprecated alias (removed in Python 3.13).
        logging.warning(Logger.warn("model_name must not be NULL."))
        return

    if result_dir is None:
        cfm_file = "%s.cfm" % (model_name)
        sfm_file = "%s.sfm" % (model_name)
    else:
        # NOTE(review): a freshly created directory cannot contain the model
        # files loaded below; the creation step is kept for compatibility.
        if not os.path.isdir(result_dir):
            try:
                os.mkdir(result_dir)
            except OSError:
                logging.error(Logger.error("mkdir %s failed." % (result_dir)))
                return
        cfm_file = os.path.join(result_dir, "%s.cfm" % (model_name))
        sfm_file = os.path.join(result_dir, "%s.sfm" % (model_name))

    # Progress message: formatted at debug level like its siblings below.
    logging.debug(Logger.debug("Loading train sample feature matrix ..."))
    sfm_train = SampleFeatureMatrix()
    sfm_train.load(sfm_file)
    logging.debug(Logger.debug("Loading train category feature matrix ..."))
    cfm_train = CategoryFeatureMatrix()
    cfm_train.load(cfm_file)

    logging.debug(Logger.debug("Making sample feature matrix for test data ..."))
    # Category whose feature weights are applied to every test sample.
    # NOTE(review): the meaning of 2000000 is not visible here -- confirm.
    feature_source_category_id = 2000000
    sfm_test = SampleFeatureMatrix(sfm_train.get_category_id_map(), sfm_train.get_feature_id_map())

    features = cfm_train.get_features(feature_source_category_id)

    for sample_id in samples_test.tsm.sample_matrix():
        (sample_category, _sample_terms, term_map) = samples_test.tsm.get_sample_row(sample_id)

        category_1_id = Categories.get_category_1_id(sample_category)

        sfm_test.set_sample_category(sample_id, category_1_id)
        # Keep only the model features that actually occur in this sample.
        for feature_id in features:
            if feature_id in term_map:
                sfm_test.add_sample_feature(sample_id, feature_id, features[feature_id])

    logging.debug(Logger.debug("train sample feature matrix - features:%d categories:%d" % (sfm_train.get_num_features(), sfm_train.get_num_categories())))
    X_train, y_train = sfm_train.to_sklearn_data()

    logging.debug(Logger.debug("test sample feature matrix - features:%d categories:%d" % (sfm_test.get_num_features(), sfm_test.get_num_categories())))
    X_test, y_test = sfm_test.to_sklearn_data()

    clf = Classifier()

    logging.debug(Logger.debug("Classifier training ..."))
    clf.train(X_train, y_train)

    logging.debug(Logger.debug("Classifier predicting ..."))

    categories = samples_test.get_categories()

    # Map each first-level category's matrix index to its (id, name) so the
    # display names can be emitted in the order produced by sorted_dict.
    categories_1_idx_map = {}
    for category_id in categories.get_categories_1_idlist():
        category_idx = sfm_test.get_category_idx(category_id)
        category_name = categories.get_category_name(category_id)
        categories_1_idx_map[category_idx] = (category_id, category_name)

    categories_1_names = [
        "%s(%d)" % (category_name, category_id)
        for (_category_idx, (category_id, category_name)) in sorted_dict(categories_1_idx_map)
    ]

    clf.predict(X_test, y_test, categories_1_names)