Example #1
def stat_news(db, slice_size=1000, _limit=None, _suffix='.count'):
    all_ids, num_ids = get_valid_ids(db, config.TABLE_NEWS, 'id')
    number_news = db.count(config.TABLE_NEWS)

    tag_map = init_stat_map(db, config.TABLE_TAG)
    category_map = init_stat_map(db, config.TABLE_CATEGORY)

    logger.info(
        '#{:d} news in {}-{}'.format(number_news, str(db), config.TABLE_NEWS))
    _limit = number_news if not _limit else min(_limit, number_news)
    all_ids.append(all_ids[-1] + 1)
    logger.info('#{:d} will be parsed'.format(_limit))

    for idx_start in xrange(0, _limit, slice_size):
        idx_end = min(idx_start + slice_size, _limit)
        id_start = all_ids[idx_start]
        id_end = all_ids[idx_end]
        query = 'SELECT tags, categories FROM {} WHERE id>={} and id<{}'.format(
            config.TABLE_NEWS, id_start, id_end)
        cur_slice = db.execute(query)
        for tags, categories in cur_slice:
            stat_item(tag_map, tags)
            stat_item(category_map, categories, _drop_tail=True)
        logger.info('#{:6d}/{:d} stated'.format(idx_end, _limit))

    tag_file = config.pjoin(config.DATA_DIR, 'stat', 'tag' + _suffix)
    category_file = config.pjoin(config.DATA_DIR, 'stat', 'category' + _suffix)
    util.save_map(tag_map, tag_file, 'tag_stat')
    util.save_map(category_map, category_file, 'category_stat')
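The id-windowed batching above avoids large OFFSET scans: the sorted id list gets a sentinel (all_ids[-1] + 1) appended so the final window has an exclusive upper bound, and each SQL query filters on a half-open [id_start, id_end) range. A minimal standalone sketch of the same windowing (the ids and slice size below are made up):

def iter_id_windows(sorted_ids, slice_size):
    # Append a sentinel so the last window still has an exclusive upper bound.
    ids = list(sorted_ids) + [sorted_ids[-1] + 1]
    for start in range(0, len(sorted_ids), slice_size):
        end = min(start + slice_size, len(sorted_ids))
        yield ids[start], ids[end]

for id_start, id_end in iter_id_windows([1, 2, 3, 5, 8, 9, 10, 11, 13, 20], 4):
    print('{}..{}'.format(id_start, id_end))  # 1..8, 8..13, 13..21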
Example #2
def get_segmentation_file(table_name, column_name, cut_all=True):
    sub_directory = '{}_segment'.format(table_name)
    sub_directory = pjoin(config.DATA_DIR, sub_directory)
    util.check_directory(sub_directory)
    segment_type = config.SEGMENT_FULL if cut_all else config.SEGMENT_PRECISE
    filename = '_'.join([table_name, column_name, segment_type])
    return pjoin(sub_directory, filename)
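A hedged usage sketch of the helper above; the 'mp_news'/'content' arguments are only plausible values suggested by file names elsewhere in these examples, not confirmed calls from the project:

# Full-mode segmentation file, i.e.
# ${DATA_DIR}/mp_news_segment/mp_news_content_<SEGMENT_FULL>
full_file = get_segmentation_file('mp_news', 'content', cut_all=True)

# Precise-mode counterpart for the same table/column pair
precise_file = get_segmentation_file('mp_news', 'content', cut_all=False)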
Example #3
 def test_get_path(self):
     path1_gen = config.get_path('result')
     path1_true = config.pjoin(config.DATA_DIR, 'result')
     self.assertEqual(path1_gen, path1_true)
     path2_gen = config.get_path('result', 'subdir')
     path2_true = config.pjoin(config.DATA_DIR, 'result', 'subdir')
     self.assertEqual(path2_gen, path2_true)
Example #4
 def test_stat_news(self):
     _suffix = '.test'
     stat_news.stat_news(self.db_local_large, _limit=1000, _suffix=_suffix)
     tag_file = config.pjoin(config.DATA_DIR, 'stat', 'tag' + _suffix)
     category_file = config.pjoin(config.DATA_DIR, 'stat',
                                  'category' + _suffix)
     self.assertTrue(exists(tag_file))
     self.assertTrue(exists(category_file))
Example #5
    def test_detect_region(self):
        __affine__, __thres__ = config.HESSIAN_AFFINE
        _from = config.pjoin(config.DATA_TEST, 'ppm')
        _to = config.pjoin(config.DATA_TEST)

        subprocess.call([
            config.AFFINE_DETECTOR, '-{}'.format(__affine__), '-i', _from,
            '-o', _to, '-thres', __thres__
        ])
Example #6
    def test_detect_region(self):
        __affine__, __thres__ = config.HESSIAN_AFFINE
        _from = config.pjoin(config.DATA_TEST, 'ppm')
        _to = config.pjoin(config.DATA_TEST)

        subprocess.call([config.AFFINE_DETECTOR,
                               '-{}'.format(__affine__),
                               '-i', _from,
                               '-o', _to,
                               '-thres', __thres__])
Example #7
def draw_result_curve(all_evaluation, _classes=None, _useable=False):
    def save_plt_fig(_path, ext='png', close=True):
        if not _path.endswith(ext):
            _path = '{}.{}'.format(_path, ext)
        util.check_directory(os.path.dirname(_path))
        plt.savefig(_path)
        if close:
            plt.close()
        logger.info('image saved to: %s', _path)

    if _classes is None:
        _classes = encoder.classes_
    if type(_classes) is not list:
        _classes = list(_classes)
    _classes.append('All')

    useable_flag = 'useable' if _useable else 'all'
    plot_groups = [['hit', 'fp', 'miss'],
                   ['precision', 'recall', 'error_rate'],
                   ['gt_class_count', 'pred_class_count']]
    sub_dir = config.pjoin(config.RESULT_DIR,
                           'back_test_figures_{}_{}'.format(useable_flag,
                                                            args.date))
    util.check_directory(sub_dir)

    n_class = len(_classes)
    n_plot_groups = len(plot_groups)
    for _ in range(n_class):
        _class = _classes[_]
        plt.title('class={}'.format(_class.encode('utf8')))
        plt.xlabel('threshold')
        for i_plot in range(n_plot_groups):
            c_count = 0
            plots = []
            plot_group = plot_groups[i_plot]
            plt.rcParams["figure.figsize"] = [24.0, 10.0]
            plt.subplot(1, n_plot_groups, i_plot + 1)
            # plt.subplots_adjust(left=1.0, right=1.0, bottom=3.0, top=3.0)
            colors = []
            for type_ in plot_group:
                m_cur = [all_evaluation[t].get(type_)[_] for t in thresholds]
                plots.append(plt.plot(thresholds, m_cur, PLOT_COLORS[c_count]))
                colors.append('{}: {}'.format(PLOT_COLORS[c_count], type_))
                c_count += 1
            plt.xlabel('\n'.join(colors))
            plt.ylabel('/'.join(plot_group))
            plt.grid(True)
            # plt.legend(plots, plot_group,
            #            loc='lower left', numpoints=1)
        _file = config.pjoin(sub_dir, '{:02d}_{}.png'.format(_+1, _class))
        save_plt_fig(_file)
Example #8
def clean_segmentation(_file=None, _type=None, _segment_type=None):
    if not _file:
        if not _type or not _segment_type:
            logger.error('At least one way to specify the segmentation file.')
        _filename = 'mp_news_{}_{}'.format(_type, _segment_type)
        _file = config.pjoin(config.DATA_DIR, 'mp_news_segment', _filename)
    logger.info('cleaning {}'.format(_file))
    _keys, _rows = data_util.load_news_area_info(db_local_large)
    num_rows = len(_rows)
    slice_size = num_rows // 100
    logger.info('#{} rows got from {}'.format(num_rows, str(db_local_large)))

    cleaned_file = _file + '_cleaned'
    f_cleaned = open(cleaned_file, 'wb')
    count = 0
    for _mp_id, _words in load_segmentation(_file):
        count += 1
        f_cleaned.write(str(_mp_id))
        f_cleaned.write('\t')
        f_cleaned.write((' '.join(_words)).encode('utf-8'))
        f_cleaned.write(__line_sep__)
        if count % slice_size == 0:
            logger.info(
                '{:=6d}[{:3d}%] done.'.format(count, count / slice_size))
    f_cleaned.flush()
    f_cleaned.close()
Example #9
def get_model_path(model_set, model_name):
    """
    Get model path by model set and model name
    :param model_set: model collections under ${FASTER_RCNN_ROOT}/data
    :param model_name: model name without any suffix, e.g. vgg
    :return: model path and corresponding proto file path
    """
    model_root = pjoin(config.FASTER_RCNN_DIR, 'data')
    if model_set in ('fast', 'faster'):
        model_set += '_rcnn'
    model_set_dir = pjoin(model_root, '{}_models'.format(model_set))
    models = os.listdir(model_set_dir)
    for model in models:
        if model.lower().startswith(model_name.lower()):
            model_path = pjoin(model_set_dir, model)
            proto_path = pjoin(cfg.MODELS_DIR, model_name.upper(),
                               'faster_rcnn_alt_opt', 'faster_rcnn_test.pt')
            return model_path, proto_path
    return None, None
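A hedged usage sketch; the 'faster'/'vgg' arguments are only plausible values implied by the docstring, and the warning message is illustrative:

# Resolve a VGG model under ${FASTER_RCNN_ROOT}/data/faster_rcnn_models together
# with its faster_rcnn_test.pt prototxt; both values are None when nothing matches.
model_path, proto_path = get_model_path('faster', 'vgg')
if model_path is None:
    logger.warning('no model starting with "vgg" in the faster_rcnn model set')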
Example #10
def load_stop_words(_stop_list='chinese', decode=None):
    _filename = '{}_stop_words.txt'.format(_stop_list)
    _stop_words_file = config.pjoin(config.DATA_DIR, 'dic', _filename)
    if decode:
        lines = [line.strip().decode('utf-8') for line in open(
            _stop_words_file, 'rb')]
    else:
        lines = [line.rstrip() for line in open(_stop_words_file, 'rb')]

    return set(lines)
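A hedged usage sketch, assuming the default 'chinese' stop list under ${DATA_DIR}/dic; the token list below is made up and would normally come from the segmentation step:

stop_words = load_stop_words('chinese', decode=True)

# Drop stop words from a (hypothetical) segmented token list.
tokens = [u'\u7684', u'news', u'content']
filtered = [w for w in tokens if w not in stop_words]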
Example #11
File: util.py Project: Dectinc/deep_vlad
def get_model_path(model_set, model_name):
    """
    Get model path by model set and model name
    :param model_set: model collections under ${FASTER_RCNN_ROOT}/data
    :param model_name: model name without any suffix, e.g. vgg
    :return: model path and corresponding proto file path
    """
    model_root = pjoin(config.FASTER_RCNN_DIR, 'data')
    if model_set in ('fast', 'faster'):
        model_set += '_rcnn'
    model_set_dir = pjoin(model_root, '{}_models'.format(model_set))
    models = os.listdir(model_set_dir)
    for model in models:
        if model.lower().startswith(model_name.lower()):
            model_path = pjoin(model_set_dir, model)
            proto_path = pjoin(cfg.MODELS_DIR, model_name.upper(),
                               'faster_rcnn_alt_opt',
                               'faster_rcnn_test.pt')
            return model_path, proto_path
    return None, None
Example #12
File: util.py Project: Dectinc/deep_vlad
 def walk_wrapper(path_from, path_to, _ext=None):
     path_from = os.path.abspath(path_from)
     path_to = os.path.abspath(path_to)
     if path_from[-1] != os.sep:
         path_from += os.sep
     logger.info('Walk in {}'.format(path_from))
     logger.info('Results to {}'.format(path_to))
     _prefix = os.path.commonprefix([os.path.split(path_from),
                                     os.path.split(path_to)])
     len_prefix = len(os.path.sep.join(_prefix))
     logger.info('Common prefix: {}'.format(_prefix))
     for cur_dir, dir_list, file_list in os.walk(path_from):
         dir_name = cur_dir[len(path_from):]
         to_dir = pjoin(path_to, dir_name)
         check_directory(to_dir)
         logger.info('Found directory: {}'.format(cur_dir))
         for _file in file_list:
             _from_file = pjoin(cur_dir, _file)
             if not _ext:
                 _to_file = pjoin(to_dir, _file)
             else:
                 _filename, _origin_ext = os.path.splitext(_file)
                 _to_file = pjoin(to_dir, '{}.{}'.format(_filename, _ext))
             if os.path.exists(_to_file):
                 logger.info('skip for exists: {}'.format(_to_file))
                 continue
             try:
                 func(_from_file, _to_file)
                 logger.info('[{}] from {} to {}'.format(
                     func.func_name,
                     '${{FROM}}{}'.format(_from_file[len_prefix:]),
                     '${{TO}}{}'.format(_to_file[len_prefix:])
                 ))
             except Exception, e:
                 logger.info('Failed [{}] {}, error msg: {}'.format(
                     func.func_name, _from_file, str(e)
                 ))
Example #13
 def walk_wrapper(path_from, path_to, _ext=None):
     path_from = os.path.abspath(path_from)
     path_to = os.path.abspath(path_to)
     if path_from[-1] != os.sep:
         path_from += os.sep
     logger.info('Walk in {}'.format(path_from))
     logger.info('Results to {}'.format(path_to))
     _prefix = os.path.commonprefix(
         [os.path.split(path_from),
          os.path.split(path_to)])
     len_prefix = len(os.path.sep.join(_prefix))
     logger.info('Common prefix: {}'.format(_prefix))
     for cur_dir, dir_list, file_list in os.walk(path_from):
         dir_name = cur_dir[len(path_from):]
         to_dir = pjoin(path_to, dir_name)
         check_directory(to_dir)
         logger.info('Found directory: {}'.format(cur_dir))
         for _file in file_list:
             _from_file = pjoin(cur_dir, _file)
             if not _ext:
                 _to_file = pjoin(to_dir, _file)
             else:
                 _filename, _origin_ext = os.path.splitext(_file)
                 _to_file = pjoin(to_dir, '{}.{}'.format(_filename, _ext))
             if os.path.exists(_to_file):
                 logger.info('skip for exists: {}'.format(_to_file))
                 continue
             try:
                 func(_from_file, _to_file)
                 logger.info('[{}] from {} to {}'.format(
                     func.func_name,
                     '${{FROM}}{}'.format(_from_file[len_prefix:]),
                     '${{TO}}{}'.format(_to_file[len_prefix:])))
             except Exception, e:
                 logger.info('Failed [{}] {}, error msg: {}'.format(
                     func.func_name, _from_file, str(e)))
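walk_wrapper above is the inner function of the @util.walk decorator used by detect_region and convert_image in later examples. A minimal sketch of the implied decorator shape, which is an assumption about util.walk rather than the project's actual code:

def walk(func):
    # func is the decorated per-file operation, e.g. convert_image(_from, _to);
    # the wrapper re-applies it to every file under path_from, mirroring the
    # directory tree under path_to.
    def walk_wrapper(path_from, path_to, _ext=None):
        pass  # body as shown in the example above
    return walk_wrapper

# so that a call like convert_image(src_dir, dst_dir, 'ppm') becomes a
# recursive batch job over the whole directory tree.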
Example #14
        train_samples = np.take(all_features, train_ids, axis=0)
        test_samples = np.take(all_features, test_ids, axis=0)
        evaluate(train_samples, train_labels, test_samples,
                 test_labels, _classifier, f)


def evaluate(train_samples, train_labels, test_samples, test_labels,
             classifier, f):
    count_values(train_labels)
    count_values(test_labels)
    classifier.fit(train_samples, train_labels)
    test_pred = classifier.predict(test_samples)
    experiment_util.calculate_result(test_labels, test_pred, f)


if __name__ == '__main__':
    util.check_directory(config.RESULT_DIR)

    args = parse_args()
    __filename = 'news_info_useable'
    news_info_file = config.pjoin(config.DATA_DIR, 'news', __filename)
    news_id_index, news_mp_index, id_map = config.News.load_info(news_info_file)
    mp_ids = sorted(news_mp_index.keys())
    logger.info('number of mp_ids: {}'.format(len(mp_ids)))

    if args.dry_run:
        mp_ids = random.sample(mp_ids, 200)
    logger.info('Arguments: {}'.format(args))

    experiment_manager()
Example #15
def run_default():
    for _dataset in config.DATASETS:
        _from = pjoin(config.DATA_ROOT, _dataset)
        _to = pjoin(config.DATA_REGION_FEATURE_ROOT, _dataset)
        detect_region(_from, _to, __suffix__)
Example #16
def evaluate(gt, pred, f=None):
    _name = 'back_test_evaluation_{}_{}_{}.txt'.format(args.date, args.limit,
                                                       get_time_str())
    util.check_directory(config.RESULT_DIR)
    f = open(config.pjoin(config.RESULT_DIR, _name), 'wb')

    def count_values(_values):
        counter = Counter(_values)
        logger.info(range(__n_class__))
        res = [counter[_] for _ in range(__n_class__)]
        res.append(sum(res))
        logger.info(res)
        return res

    def ilog(content):
        logger.info(content)
        iwrite(f, content)

    def _evaluate(cur_gt, cur_pred):
        num_gt, num_pred = len(cur_gt), len(cur_pred)
        ilog('#gt: {}'.format(num_gt))
        ilog(range(__n_class__))
        ilog(num_samples_by_class)
        ilog('#pred: {}'.format(num_pred))
        counter_pred = count_values(cur_pred.values())
        # ilog(counter_pred)
        res = ResultHolder(
            hit=[0] * __n_class__,
            fp=[0] * __n_class__,
            miss=[0] * __n_class__
        )
        hit, fp, miss = res.hit, res.fp, res.miss
        for k in cur_pred:
            if k in cur_gt:
                _gt_k = cur_gt.pop(k)
                if _gt_k == cur_pred[k]:
                    hit[_gt_k] += 1
                else:
                    miss[_gt_k] += 1
                    fp[cur_pred[k]] += 1
        for v in cur_gt.values():
            miss[v] += 1
        ilog('common prediction: {}'.format(hit))
        ilog('false prediction: {}'.format(fp))
        ilog('miss prediction: {}'.format(miss))
        for _ in ('hit', 'fp', 'miss'):
            res.get(_).append(sum(res.get(_)))
        sum_hit, sum_fp, sum_miss = hit[-1], fp[-1], miss[-1]
        ilog('Total hit/false/miss: {}/{}/{}'.format(sum_hit, sum_fp, sum_miss))

        def calc_percentage(cur, base):
            return cur * 1.0 / base if base > 0 else -1

        res.set('precision',
                [calc_percentage(h, h + p) for h, p in zip(hit, fp)])
        res.set('recall', [calc_percentage(h, num) for h, num in
                           zip(hit, num_samples_by_class)])
        res.set('error_rate', [calc_percentage(f, num) for f, num in
                               zip(fp, num_samples_by_class)])
        ilog('class precision:\n{}'.format(str(res.precision)))
        ilog('class recall:\n{}'.format(str(res.recall)))
        ilog('class error_rate:\n{}'.format(str(res.error_rate)))

        ilog('Precision: {:.2f}/{:.2f}'.format(
            calc_percentage(sum_hit, sum_hit + sum_fp),
            calc_percentage(sum_hit, num_pred)))
        ilog('Recall: {:.2f}/{:.2f}'.format(
            calc_percentage(sum_hit, num_gt),
            calc_percentage(sum_hit, num_samples)))
        res.set('gt_class_count', num_samples_by_class)
        res.set('pred_class_count', counter_pred)
        return res

    all_evaluation = {}

    ilog('=' * 80)
    num_samples = len(pred)
    ilog('= Back Test Evaluation, #samples={}'.format(num_samples))
    ilog('=' * 80)
    for only_useable in (True, False):
        cur_gt = {k: v[0] for k, v in gt.items() if v[1] == 1} \
            if only_useable else {k: v[0] for k, v in gt.items()}
        _flag = 'Useable' if only_useable else 'All'
        num_samples_by_class = count_values(cur_gt.values())
        for threshold in thresholds:
            ilog('=' * 60)
            ilog('= Back Test[{}] with Predict Thres[{}]'.format(
                _flag, threshold))
            ilog('=' * 60)
            cur_pred = {k: v[0] for k, v in pred.items() if v[1] >= threshold}
            all_evaluation[threshold] = _evaluate(cur_gt.copy(), cur_pred)
        draw_result_curve(all_evaluation, _useable=only_useable)
    ilog('== All evaluation done ==')
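A tiny worked illustration of the hit/fp/miss bookkeeping inside _evaluate above, with made-up gt/pred dictionaries, three classes, and the useable flag and thresholds ignored:

gt = {1: 0, 2: 1, 3: 2, 4: 1}    # sample id -> ground-truth class
pred = {1: 0, 2: 2, 5: 1}        # sample id -> predicted class
hit, fp, miss = [0, 0, 0], [0, 0, 0], [0, 0, 0]
for k, p in pred.items():
    if k in gt:
        g = gt.pop(k)
        if g == p:
            hit[g] += 1          # sample 1: class 0 predicted correctly
        else:
            miss[g] += 1         # sample 2: class 1 missed...
            fp[p] += 1           # ...and counted as a false positive for class 2
for g in gt.values():
    miss[g] += 1                 # samples 3 and 4 never got a prediction
# hit == [1, 0, 0], fp == [0, 0, 1], miss == [0, 2, 1]; sample 5 (predicted but
# absent from gt) contributes nothing, exactly as in the loop above.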
Example #17
def get_api_predict_path():
    util.check_directory(config.RESULT_DIR)
    _name = 'api_predict_{}_{}.txt'.format(args.date, args.limit)
    return config.pjoin(config.RESULT_DIR, _name)
Example #18
    _keys, _rows = data_util.load_news_area_info(db_local_large)
    num_rows = len(_rows)
    slice_size = num_rows // 100
    logger.info('#{} rows got from {}'.format(num_rows, str(db_local_large)))

    cleaned_file = _file + '_cleaned'
    f_cleaned = open(cleaned_file, 'wb')
    count = 0
    for _mp_id, _words in load_segmentation(_file):
        count += 1
        f_cleaned.write(str(_mp_id))
        f_cleaned.write('\t')
        f_cleaned.write((' '.join(_words)).encode('utf-8'))
        f_cleaned.write(__line_sep__)
        if count % slice_size == 0:
            logger.info(
                '{:=6d}[{:3d}%] done.'.format(count, count / slice_size))
    f_cleaned.flush()
    f_cleaned.close()


if __name__ == '__main__':
    db_local_large = config.local_mp_online

    mp_segment_dir = config.pjoin(config.DATA_DIR, 'mp_news_segment')
    filenames = os.listdir(mp_segment_dir)

    for filename in filenames:
        segmentation_file = config.pjoin(mp_segment_dir, filename)
        clean_segmentation(segmentation_file)
Example #19
 def test_load_segmentation(self):
     _file = config.pjoin(config.DATA_DIR, 'test', 'test_file')
     logger.info(type(load_segmentation(_file)))
     for _line in load_segmentation(_file):
         logger.info(_line.rstrip())
Example #20

@util.walk
def detect_region(_from, _to):
    """
    >> ./h_affine.ln -haraff -i img1.ppm -o img1.haraff -thres 1000
    >> ./h_affine.ln -hesaff -i img1.ppm -o img1.hesaff -thres 500
    """
    subprocess.call([
        AFFINE_DETECTOR, '-{}'.format(__affine__), '-i', _from, '-o', _to,
        '-thres',
        str(__thres__)
    ])


def run_default():
    for _dataset in config.DATASETS:
        _from = pjoin(config.DATA_PPM_ROOT, _dataset)
        _to = pjoin(config.DATA_REGION_ROOT, _dataset)
        detect_region(_from, _to, __suffix__)


if __name__ == '__main__':
    __affine__, __thres__ = config.HESSIAN_AFFINE
    __suffix__ = __affine__
    if len(sys.argv) > 1:
        detect_region(pjoin(config.DATA_PPM_ROOT, 'test'),
                      pjoin(config.DATA_REGION_ROOT, 'test'), __suffix__)
    else:
        run_default()
Example #21
    output_file = pjoin(config.DATA_DIR, output_folder, output_filename)
    print 'Will output to:', output_file
    # def get_word_vector(_file, _model_name, _output_file, pooling=max):
    _extract_word2vec_representation(input_file, model_file, output_file,
                                     pooling=_pooling, norm=_norm)


if __name__ == '__main__':
    (index, model) = WORD2VEC_LIST[WORD2VEC_INDEX]
    # model_filename = 'model_{}_s{}_w{}_m{}_n{}_s{}.word2vec'.format(
    #     SG_VALUES[model.sg], model.vector_size, model.window,
    #     model.min_count, model.negative, '1e3')
    model_filename = config.FORMATTER_WORD2VEC_MODEL.format(
        SG_VALUES[model.sg], model.vector_size, model.window,
        model.min_count, model.negative, model.sample)
    model_file = config.pjoin(config.DATA_WORD2VEC_DIR, model_filename)
    ID_MAP = load_news_ids()
    num_samples = len(ID_MAP)
    id_list = [0 for _ in xrange(num_samples)]
    for _id in ID_MAP:
        id_list[ID_MAP[_id]] = _id

    # train_or_test_word2vec()

    for pooling in config.POOLING_METHODS:
        for norm in config.NORM_METHODS:
            for _type in ['content']:
                # for _type in ['title', 'brief', 'content']:
                extract_word2vec_representation(_type, pooling, norm)
                # extract_word2vec_representation('title', 'max', 'root')
                # extract_word2vec_representation('title', 'max', None)
Example #22
def experiment_manager():
    _type = config.TYPE_CONTENT
    _segment_type = config.SEGMENT_PRECISE
    _pooling = config.AVERAGE_POOLING
    _norm = config.L2_NORM
    _vector_size = 800
    if 'LR' in args.classifiers:
        _classifier = LogisticRegressionCV(n_jobs=8, cv=5)
    elif 'svc_linear' in args.classifiers:
        _classifier = SVC(kernel='linear', probability=True)
    else:
        _classifier = LogisticRegressionCV(n_jobs=8, cv=5)

    __sample_flag = 'sample' if args.sample else 'no_sample'
    __name = '1v1_{}_{}_{}_{}.txt'.format(__sample_flag,
                                          type(_classifier).__name__,
                                          date.today(), time())
    result_file = config.pjoin(config.RESULT_DIR, __name)
    logger.info('result file: {}'.format(result_file))
    f = open(result_file, 'wb')
    iwrite(f, '\nClassifier Info\n')
    iwrite(f, _classifier)

    global all_features
    all_features = load_all_feature(_type, _vector_size, _pooling,
                                    _norm, _segment_type)

    all_ids = np.array([id_map[_] for _ in mp_ids])
    all_labels = np.array([news_id_index[_].area_id for _ in all_ids])
    count_values(all_labels)
    all_labels = np.array([area_id_index[_].name for _ in all_labels])
    count_values(all_labels)
    label_encoder = get_label_encoder()
    all_labels = label_encoder.transform(all_labels)
    num_class = len(label_encoder.classes_)

    for i_class in range(num_class):
        cur_class_ids = [all_ids[_] for _ in xrange(len(all_labels)) if
                         all_labels[_] == i_class]
        train_id_cur, test_id_cur = experiment_util.split_train_test_set(
            cur_class_ids)
        rest_ids_all = list(set(all_ids) - set(cur_class_ids))
        # rest_ids_all = [_ for _ in all_ids if _ not in cur_class_ids]
        if args.sample:
            random.shuffle(rest_ids_all)
            rest_ids = rest_ids_all[:len(cur_class_ids)]
        else:
            rest_ids = rest_ids_all
        train_id_rest, _ = experiment_util.split_train_test_set(
            rest_ids)
        test_id_rest = list(set(rest_ids_all) - set(train_id_rest))
        # test_id_rest = [_ for _ in rest_ids_all if _ not in train_id_rest]
        train_ids = train_id_cur + train_id_rest
        test_ids = test_id_cur + test_id_rest
        train_labels = [0] * len(train_id_cur) + [1] * len(train_id_rest)
        test_labels = [0] * len(test_id_cur) + [1] * len(test_id_rest)
        logger.info('Train {}/{} || Test {}/{}'.format(len(train_id_cur),
                                                       len(train_id_rest),
                                                       len(test_id_cur),
                                                       len(test_id_rest)))
        _msg = 'Class: {}[{}]'.format(i_class,
                                      label_encoder.inverse_transform(i_class))
        logger.info(_msg)
        iwrite(f, _msg)
        train_samples = np.take(all_features, train_ids, axis=0)
        test_samples = np.take(all_features, test_ids, axis=0)
        evaluate(train_samples, train_labels, test_samples,
                 test_labels, _classifier, f)
Example #23
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# @filename  generate_id
# @author   [email protected]
# @date     2016-03-16 16:17

from os import linesep

from core.data.data_util import mysql_batch_job_wrapper
from core.data.get_word_vector import SampleIterator
from core.util import LoggerUtil
from core.util import config

logger = LoggerUtil.get_logger(__file__.split('/')[-1][:-3])

sample_file = config.pjoin(config.DATA_DIR, 'mp_news_segment',
                           'mp_news_title_precise_cleaned')
id_file = config.pjoin(config.DATA_DIR, 'news', 'news_info')

if False:  # disabled: flip to True to regenerate id_file from sample_file
    count = 0
    with open(id_file, 'wb') as f:
        for _id, _words in SampleIterator(sample_file):
            f.write(str(count))
            f.write('\t')
            f.write(str(_id))
            f.write(linesep)
            count += 1

id_map = {}
id_list = []
for line in open(id_file):
    # The original snippet is truncated here; the loop body below is an assumed
    # reconstruction based on the "<index>\t<id>" lines written above.
    index, _id = line.rstrip().split('\t')
    id_map[_id] = int(index)
    id_list.append(_id)
Example #24
    logger.info('calc TF-IDF done: {}'.format(end_time - start_time))

    # save statistics
    _result_dir = config.pjoin(config.DATA_STAT_DIR, os.path.basename(_file))
    util.check_directory(_result_dir)
    vectorizer_file = config.pjoin(_result_dir, 'vectorization')
    util.save_sparse_csr_matrix(vectorizer_file, vectorizer_result)
    logger.info('vectorization file saved: {}'.format(vectorizer_file))
    tfidf_file = config.pjoin(_result_dir, 'tfidf')
    util.save_sparse_csr_matrix(tfidf_file, tfidf)
    logger.info('tfidf file saved: {}'.format(tfidf_file))
    words_file = config.pjoin(_result_dir, 'all_words')
    with open(words_file, 'wb') as f:
        for word in all_words:
            f.write(word.encode('utf-8'))
            f.write(os.linesep)
    logger.info('words file saved: {}'.format(words_file))
    id_map_file = config.pjoin(_result_dir, 'id_map')
    util.save_map(news_id_map, id_map_file)
    logger.info('news id map saved: {}'.format(id_map_file))

    return news_id_map, vectorizer_result, all_words, tfidf


if __name__ == '__main__':
    db_local_large = config.local_mp_online

    segmentation_file = config.pjoin(config.DATA_DIR, 'mp_news_segment',
                                     'mp_news_content_precise')
    stat_tf_idf(segmentation_file)
Example #25
def stat_tf_idf(_file=None, _type=None, _segment_type=None):
    """
    For a given segmentation of one of [brief/content/title], vectorize
    all the corpus text and output TF-IDF results
    :param _file: segmentation file path, will be replaced with the
    auto-generated one if None is given
    :param _type: chosen from brief/content/title
    :param _segment_type: chosen from full/precise
    :return:
    """
    if not _file:
        if not _type or not _segment_type:
            logger.error('At least one way to specify the segmentation file.')
        _filename = 'mp_news_{}_{}'.format(_type, _segment_type)
        _file = config.pjoin(config.DATA_DIR, 'mp_news_segment', _filename)
    _keys, _rows = data_util.load_news_area_info(db_local_large)
    num_rows = len(_rows)
    slice_size = num_rows // 100
    logger.info('#{} rows got from {}'.format(num_rows, str(db_local_large)))

    news_id_map = {}
    count = 0
    _corpus = []
    for _mp_id, _words in load_segmentation(_file):
        news_id_map[_mp_id] = count
        count += 1
        _corpus.append(' '.join(_words))
        if count % slice_size == 0:
            logger.info(
                '{:=6d}[{:3d}%] done.'.format(count, count / slice_size))

    vectorizer = CountVectorizer(decode_error='ignore')
    transformer = TfidfTransformer()

    start_time = time.time()
    vectorizer_result = vectorizer.fit_transform(_corpus)
    end_time = time.time()
    logger.info('vectorization done: {}'.format(end_time - start_time))
    all_words = vectorizer.get_feature_names()
    logger.info('#{:d} words in total'.format(len(all_words)))

    start_time = time.time()
    tfidf = transformer.fit_transform(vectorizer_result)
    end_time = time.time()
    logger.info('calc TF-IDF done: {}'.format(end_time - start_time))

    # save statistics
    _result_dir = config.pjoin(config.DATA_STAT_DIR, os.path.basename(_file))
    util.check_directory(_result_dir)
    vectorizer_file = config.pjoin(_result_dir, 'vectorization')
    util.save_sparse_csr_matrix(vectorizer_file, vectorizer_result)
    logger.info('vectorization file saved: {}'.format(vectorizer_file))
    tfidf_file = config.pjoin(_result_dir, 'tfidf')
    util.save_sparse_csr_matrix(tfidf_file, tfidf)
    logger.info('tfidf file saved: {}'.format(tfidf_file))
    words_file = config.pjoin(_result_dir, 'all_words')
    with open(words_file, 'wb') as f:
        for word in all_words:
            f.write(word.encode('utf-8'))
            f.write(os.linesep)
    logger.info('words file saved: {}'.format(words_file))
    id_map_file = config.pjoin(_result_dir, 'id_map')
    util.save_map(news_id_map, id_map_file)
    logger.info('news id map saved: {}'.format(id_map_file))

    return news_id_map, vectorizer_result, all_words, tfidf
Example #26
def load_label_encoder(_name='encoder_20_2016-04-05.pkl'):
    _file = config.pjoin(config.DATA_CLASSIFIER_DIR, 'label_encoder', _name)
    return pickle.load(open(_file, 'rb'))
Example #27
def get_all_result_path():
    util.check_directory(config.RESULT_DIR)
    _name = 'back_test_result_{}_{}.pkl'.format(args.date, args.limit)
    return config.pjoin(config.RESULT_DIR, _name)
Example #28
def run_default():
    for _dataset in DATASETS:
        _from = pjoin(DATA_ROOT, _dataset)
        _to = pjoin(DATA_PPM_ROOT, _dataset)
        convert_image(_from, _to, 'ppm')
Example #29
def run_default():
    for _dataset in DATASETS:
        _from = pjoin(DATA_ROOT, _dataset)
        _to = pjoin(DATA_PPM_ROOT, _dataset)
        convert_image(_from, _to, 'ppm')
Example #30
from core.util.config import AFFINE_EXTRACTOR, pjoin

logger = LoggerUtil.get_logger(__file__.split('/')[-1][:-3])


@util.walk
def detect_region(_from, _to):
    """
    >> ./h_affine.ln -haraff -i img1.ppm -o img1.haraff -thres 1000
    >> ./h_affine.ln -hesaff -i img1.ppm -o img1.hesaff -thres 500
    """
    subprocess.call([AFFINE_EXTRACTOR, _from])


def run_default():
    for _dataset in config.DATASETS:
        _from = pjoin(config.DATA_ROOT, _dataset)
        _to = pjoin(config.DATA_REGION_FEATURE_ROOT, _dataset)
        detect_region(_from, _to, __suffix__)


if __name__ == '__main__':
    __affine__, __thres__ = config.HESSIAN_AFFINE
    __suffix__ = __affine__
    if len(sys.argv) > 1:
        detect_region(pjoin(config.DATA_ROOT, 'test'),
                      pjoin(config.DATA_REGION_FEATURE_ROOT, 'test'),
                      __suffix__)
    else:
        run_default()
Example #31
def get_label_encoder(_name='encoder_20_2016-04-05'):
    import pickle
    _file = '{}.pkl'.format(_name)
    _path = config.pjoin(config.DATA_CLASSIFIER_DIR, 'label_encoder', _file)
    return pickle.load(open(_path, 'rb'))
Example #32
import _init_paths
import subprocess
import sys

from core.util import LoggerUtil
from core.util import util
from core.util.config import pjoin, DATASETS, DATA_ROOT, DATA_PPM_ROOT

logger = LoggerUtil.get_logger(__file__.split('/')[-1][:-3])


@util.walk
def convert_image(_from, _to):
    subprocess.check_call(['convert', _from, _to])


def run_default():
    for _dataset in DATASETS:
        _from = pjoin(DATA_ROOT, _dataset)
        _to = pjoin(DATA_PPM_ROOT, _dataset)
        convert_image(_from, _to, 'ppm')


if __name__ == '__main__':
    if len(sys.argv) > 1:
        convert_image(pjoin(DATA_ROOT, 'test'), pjoin(DATA_PPM_ROOT, 'test'),
                      'ppm')
    else:
        run_default()
Example #33

@util.walk
def detect_region(_from, _to):
    """
    >> ./h_affine.ln -haraff -i img1.ppm -o img1.haraff -thres 1000
    >> ./h_affine.ln -hesaff -i img1.ppm -o img1.hesaff -thres 500
    """
    subprocess.call([AFFINE_DETECTOR, '-{}'.format(__affine__),
                           '-i', _from,
                           '-o', _to,
                           '-thres', str(__thres__)])


def run_default():
    for _dataset in config.DATASETS:
        _from = pjoin(config.DATA_PPM_ROOT, _dataset)
        _to = pjoin(config.DATA_REGION_ROOT, _dataset)
        detect_region(_from, _to, __suffix__)


if __name__ == '__main__':
    __affine__, __thres__ = config.HESSIAN_AFFINE
    __suffix__ = __affine__
    if len(sys.argv) > 1:
        detect_region(pjoin(config.DATA_PPM_ROOT, 'test'),
                      pjoin(config.DATA_REGION_ROOT, 'test'),
                      __suffix__)
    else:
        run_default()
Example #34
import _init_paths
import subprocess
import sys

from core.util import LoggerUtil
from core.util import util
from core.util.config import pjoin, DATASETS, DATA_ROOT, DATA_PPM_ROOT

logger = LoggerUtil.get_logger(__file__.split('/')[-1][:-3])


@util.walk
def convert_image(_from, _to):
    subprocess.check_call(['convert', _from, _to])


def run_default():
    for _dataset in DATASETS:
        _from = pjoin(DATA_ROOT, _dataset)
        _to = pjoin(DATA_PPM_ROOT, _dataset)
        convert_image(_from, _to, 'ppm')


if __name__ == '__main__':
    if len(sys.argv) > 1:
        convert_image(pjoin(DATA_ROOT, 'test'),
                      pjoin(DATA_PPM_ROOT, 'test'),
                      'ppm')
    else:
        run_default()