Example #1
    def load_mul_features(feature_pt, feature_names, rawset_name, will_save):
        index_begin = 0
        features = None
        for index in reversed(range(1, len(feature_names))):
            f_names_s = '|'.join(
                feature_names[0:index + 1]) + '|' + rawset_name
            f_names_md5 = hashlib.md5(f_names_s).hexdigest()
            if isfile('%s/md5_%s.smat.npz' % (feature_pt, f_names_md5)):
                index_begin = index
                features = Feature.load('%s/md5_%s.smat' %
                                        (feature_pt, f_names_md5))
                break
        LogUtil.log(
            'INFO',
            'load %s features from index(%d)' % (rawset_name, index_begin))

        if 1 > index_begin:
            features = Feature.load(
                '%s/%s.%s.smat' % (feature_pt, feature_names[0], rawset_name))
        for index in range(index_begin + 1, len(feature_names)):
            features = Feature.merge_col(
                features,
                Feature.load('%s/%s.%s.smat' %
                             (feature_pt, feature_names[index], rawset_name)))

        features = features.tocsr()

        if will_save and (index_begin < len(feature_names) - 1):
            f_names_s = '|'.join(feature_names) + '|' + rawset_name
            f_names_md5 = hashlib.md5(f_names_s).hexdigest()
            Feature.save(features,
                         '%s/md5_%s.smat' % (feature_pt, f_names_md5))
        return features
Example #2
 def load_smat(ft_fp):
     '''
     Load a feature file. The file format is:
     row_num col_num
     f1_index:f1_value f2_index:f2_value ...
     '''
     data = []
     indice = []
     indptr = [0]
     f = open(ft_fp)
     [row_num, col_num] = [int(num) for num in f.readline().strip().split()]
     for line in f:
         line = line.strip()
         subs = line.split()
         for sub in subs:
             [f_index, f_value] = sub.split(":")
             f_index = int(f_index)
             f_value = float(f_value)
             data.append(f_value)
             indice.append(f_index)
         indptr.append(len(data))
     f.close()
     features = csr_matrix((data, indice, indptr),
                           shape=(row_num, col_num),
                           dtype=float)
     LogUtil.log("INFO", "load smat feature file done (%s)" % ft_fp)
     return features
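
A minimal standalone sketch (not from the original project) that writes a toy file in the SMAT format described in the docstring and parses it the same way load_smat does; the file name and values are invented:

from scipy.sparse import csr_matrix

# toy file: a "row_num col_num" header, then one line per row of
# "index:value" pairs (an empty line is an all-zero row)
with open('toy.smat', 'w') as f:
    f.write('3 4\n')
    f.write('0:1.5 2:2.0\n')
    f.write('\n')
    f.write('3:0.5\n')

data, indice, indptr = [], [], [0]
with open('toy.smat') as f:
    row_num, col_num = (int(num) for num in f.readline().split())
    for line in f:
        for sub in line.split():
            f_index, f_value = sub.split(':')
            data.append(float(f_value))
            indice.append(int(f_index))
        indptr.append(len(data))

features = csr_matrix((data, indice, indptr), shape=(row_num, col_num))
print(features.toarray())
# [[1.5 0.  2.  0. ]
#  [0.  0.  0.  0. ]
#  [0.  0.  0.  0.5]]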
Example #3
 def save_smat(features, ft_pt):
     '''
     Save the feature file
     '''
     (row_num, col_num) = features.shape
     data = features.data
     indice = features.indices
     indptr = features.indptr
     f = open(ft_pt, 'w')
     f.write("%d %d\n" % (row_num, col_num))
     ind_indptr = 1
     begin_line = True
     for ind_data in range(len(data)):
         while ind_data == indptr[ind_indptr]:
             f.write('\n')
             begin_line = True
             ind_indptr += 1
         if (data[ind_data] < 1e-12) and (data[ind_data] > -1e-12):
             continue
         if (not begin_line) and (ind_data != indptr[ind_indptr - 1]):
             f.write(' ')
         f.write("%d:%s" % (indice[ind_data], data[ind_data]))
         begin_line = False
     while ind_indptr < len(indptr):
         f.write("\n")
         ind_indptr += 1
     LogUtil.log("INFO", "save smat feature file done (%s)" % ft_pt)
     f.close()
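
A small round-trip sketch for save_smat above (assumptions: scipy is installed, save_smat is reachable as a plain function even though other examples call it as Feature.save_smat, and a stand-in LogUtil is defined so the logging call works):

from scipy.sparse import csr_matrix

class LogUtil:                      # stand-in for the project's logger
    @staticmethod
    def log(level, msg):
        print('[%s] %s' % (level, msg))

m = csr_matrix([[1.5, 0.0, 2.0, 0.0],
                [0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.5]])
save_smat(m, 'toy.smat')
print(open('toy.smat').read())
# 3 4
# 0:1.5 2:2.0
#
# 3:0.5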
Example #4
 def save_smat(features, ft_pt):
     """
     save features to disk in SMAT format
     :param features: the matrix of features
     :param ft_pt: features file path
     :return: none
     """
     (row_num, col_num) = features.shape
     data = features.data
     indice = features.indices
     indptr = features.indptr
     f = open(ft_pt, 'w')
     f.write("%d %d\n" % (row_num, col_num))
     ind_indptr = 1
     begin_line = True
     for ind_data in range(len(data)):
         while ind_data == indptr[ind_indptr]:
             f.write('\n')
             begin_line = True
             ind_indptr += 1
         if (data[ind_data] < 1e-12) and (data[ind_data] > -1e-12):
             continue
         if (not begin_line) and (ind_data != indptr[ind_indptr - 1]):
             f.write(' ')
         f.write("%d:%s" % (indice[ind_data], data[ind_data]))
         begin_line = False
     while ind_indptr < len(indptr):
         f.write("\n")
         ind_indptr += 1
     LogUtil.log("INFO", "save smat feature file done (%s)" % ft_pt)
     f.close()
Example #5
    def run_gen_feature_extra(cf):
        """
        Generate extra training data
        :param cf: config parser
        :return:
        """
        # read the config file
        # cf = ConfigParser.ConfigParser()
        # cf.read(conf_fp)
        feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')
        feature_qp_names = Feature.get_feature_names_question_pair(cf)

        mc_indexs = FeatureProcessor.get_index_with_max_clique_size(cf, 'test', 4.)

        for f_name in feature_qp_names:
            feature_fp = '%s/%s.test.smat' % (feature_pt, f_name)
            feature_extra_fp = '%s/%s.train_extra.smat' % (feature_pt, f_name)

            has_extra = isfile(feature_extra_fp + ".npz")
            if not has_extra:
                features = Feature.load(feature_fp)
                features_extra = Feature.sample_row(features, mc_indexs)
                Feature.save_smat(features_extra, feature_extra_fp)
                LogUtil.log('INFO', '%s generate extra feature done' % f_name)
            else:
                LogUtil.log('INFO', '%s already has extra feature' % f_name)
Example #6
    def run_gen_feature_with_swap(cf, argv):
        """
        Generate offline feature files, including the swapped part
        :return:
        """
        # read the config file
        # cf = ConfigParser.ConfigParser()
        # cf.read(conf_fp)
        feature_pt = cf.get('DEFAULT', 'feature_question_pair_pt')

        feature_qp_names = Feature.get_feature_names_question_pair(cf)
        rawset_name = argv[0]

        for f_name in feature_qp_names:
            feature_fp = '%s/%s.%s.smat' % (feature_pt, f_name, rawset_name)
            feature_swap_fp = '%s/%s.%s_swap.smat' % (feature_pt, f_name, rawset_name)
            feature_with_swap_fp = '%s/%s.%s_with_swap.smat' % (feature_pt, f_name, rawset_name)

            has_with_swap = isfile(feature_with_swap_fp + '.npz')

            if not has_with_swap:
                features = Feature.load(feature_fp)
                features_swap = Feature.load(feature_swap_fp)
                features_with_swap = Feature.merge_row(features, features_swap)
                Feature.save(features_with_swap, feature_with_swap_fp)
                LogUtil.log('INFO', '%s generate with_swap feature done' % f_name)
            else:
                LogUtil.log('INFO', '%s already has with_swap feature' % f_name)
Example #7
def generate(config, argv):
    data_name = argv[0]
    LogUtil.log('INFO', 'data_name=%s' % data_name)

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                      config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]

        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
    else:
        source_data = None

    feature_file_path = '%s/instance_fs_length.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(feature_file_path, 'w')

    feature_file.write('%d %d\n' % (len(source_data), 4))
    for line in source_data:
        qid, tc, tw, dc, dw = parse_question_set(line)
        feature = list()
        feature.append(len(tc))
        feature.append(len(tw))
        feature.append(len(dc))
        feature.append(len(dw))
        Feature.save_feature(feature, feature_file)

    feature_file.close()
Example #8
    def load_all(feature_pt, feature_names, rawset_name, will_save=False):
        index_begin = 0
        features = None
        for index in reversed(range(1, len(feature_names))):
            f_names_s = '|'.join(
                feature_names[0:index + 1]) + '|' + str(rawset_name)
            f_names_md5 = hashlib.md5(f_names_s.encode("utf8")).hexdigest()
            if isfile('%s/md5_%s.smat.npz' % (feature_pt, f_names_md5)):
                index_begin = index
                features = Feature.load('%s/md5_%s.smat' %
                                        (feature_pt, f_names_md5))
                break
        LogUtil.log(
            'INFO', 'load %s features [%s, %s)' %
            (rawset_name, feature_names[0], feature_names[index_begin]))

        if 1 > index_begin:
            features = Feature.load(
                '%s/%s.%s.smat' % (feature_pt, feature_names[0], rawset_name))
        for index in range(index_begin + 1, len(feature_names)):
            features = Feature.merge_col(
                features,
                Feature.load('%s/%s.%s.smat' %
                             (feature_pt, feature_names[index], rawset_name)))
        features = features.tocsr()

        return features
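
The caching above keys the merged matrix by an md5 of the joined feature names plus the set name; a short sketch of how that key is formed (feature names borrowed from other examples on this page):

import hashlib

feature_names = ['fs_btm_tw_cw', 'instance_fs_length']
rawset_name = 'train'
f_names_s = '|'.join(feature_names) + '|' + str(rawset_name)
f_names_md5 = hashlib.md5(f_names_s.encode('utf8')).hexdigest()
print('md5_%s.smat.npz' % f_names_md5)   # the cache file load_all checks for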
Example #9
    def save_all_question2wids():
        """
        Convert the sentences in train.csv and test.csv into word_id lists
        :return:
        """
        LogUtil.log('INFO', 'BEGIN: save all question2wids')
        # read the config file
        cf = ConfigParser.ConfigParser()
        cf.read("../conf/python.conf")

        # build file paths
        qid2question_question_fp = '%s/qid2question.all.question' % cf.get('DEFAULT', 'devel_pt')
        w2id_fp = '/home/houjianpeng/BTM/output/train_100_50/voca.txt'
        all_question_wids_fp = '/home/houjianpeng/BTM/output/train_100_50/all_doc_wids.txt'

        # load the word-to-id vocabulary
        w2id = BTM.load_w2id(w2id_fp)

        all_question_f = open(qid2question_question_fp, 'r')
        all_question_wids_f = open(all_question_wids_fp, 'w')
        for line in all_question_f:
            ws = line.strip().split()
            wids = [w2id[w] for w in ws if w in w2id]
            print >> all_question_wids_f, ' '.join(map(str, wids))
        all_question_f.close()
        all_question_wids_f.close()
        LogUtil.log('INFO', 'END: save all question2wids')
Example #10
 def load_npz(ft_fp):
     loader = np.load('%s.npz' % ft_fp)
     features = csr_matrix(
         (loader['data'], loader['indices'], loader['indptr']),
         shape=loader['shape'])
     LogUtil.log("INFO", "load npz feature file done (%s)" % ft_fp)
     return features
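
load_npz expects an .npz archive holding the three CSR arrays plus the shape; a minimal sketch of the matching save side (matrix and file name are made up), mirroring the scheme rather than calling the project's own Feature.save:

import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix([[0.0, 1.0],
                [2.0, 0.0]])
# store the CSR components under the key names load_npz reads back
np.savez('toy_feature.smat.npz', data=m.data, indices=m.indices,
         indptr=m.indptr, shape=m.shape)

loader = np.load('toy_feature.smat.npz')
restored = csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])
print(restored.toarray())   # [[0. 1.] [2. 0.]]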
Example #11
 def generate_batch(self, batch_size):
     n_batch = int(self.length / batch_size)
     if self.length % batch_size != 0:
         n_batch += 1
     LogUtil.log('INFO','{} {}'.format(n_batch, batch_size))
     slices = np.split(np.arange(n_batch * batch_size), n_batch)
     slices[-1] = slices[-1][:(self.length - batch_size * (n_batch - 1))]
     return slices
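
A worked example of the same slicing logic, standalone, with hypothetical length and batch_size in place of the instance attributes:

import numpy as np

length, batch_size = 10, 4
n_batch = int(length / batch_size)
if length % batch_size != 0:
    n_batch += 1                                 # 3 batches
slices = np.split(np.arange(n_batch * batch_size), n_batch)
slices[-1] = slices[-1][:(length - batch_size * (n_batch - 1))]
print([s.tolist() for s in slices])
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]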
Example #12
 def merge_col(features_1, features_2):
     '''
     Merge feature matrices by column, i.e. add features to each instance
     '''
     features = hstack([features_1, features_2])
     (row_num, col_num) = features.shape
     LogUtil.log("INFO",
                 "merge col done, shape=(%d,%d)" % (row_num, col_num))
     return features
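
A toy illustration of the hstack-based merge (the matrices are made up): two feature blocks describing the same two instances become one wider matrix:

from scipy.sparse import csr_matrix, hstack

a = csr_matrix([[1, 0],
                [0, 2]])            # 2 x 2 feature block
b = csr_matrix([[3],
                [4]])               # 2 x 1 feature block for the same rows
merged = hstack([a, b])
print(merged.shape)                 # (2, 3)
print(merged.toarray())
# [[1 0 3]
#  [0 2 4]]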
Example #13
def generate(config, argv):
    # load valid dataset index
    valid_index_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                              config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
    valid_index = DataUtil.load_vector(valid_index_fp, 'int')
    valid_index = [num - 1 for num in valid_index]

    # load topic btm vec
    topic_btm_vec = load_topic_btm_vec(config)

    # offline / online
    data_name = argv[0]

    dis_func_names = ["cosine",
                      "cityblock",
                      "jaccard",
                      "canberra",
                      "euclidean",
                      "minkowski",
                      "braycurtis"]

    btm_dis_feature_fn = ['vote_fs_btm_dis_%s' % dis_func_name for dis_func_name in dis_func_names]
    btm_dis_feature_f = [open('%s/%s.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'),
                                                fn,
                                                data_name), 'w') for fn in btm_dis_feature_fn]

    if 'offline' == data_name:
        btm_tw_cw_features = load_features_from_file(config, 'fs_btm_tw_cw', data_name, valid_index)
        LogUtil.log('INFO', 'load_features_from_file, len=%d' % len(btm_tw_cw_features))
        for line_id in range(len(btm_tw_cw_features)):
            doc_vec = btm_tw_cw_features[line_id]
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))
    else:
        btm_vec_fp = '%s/fs_btm_tw_cw.%s.csv' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
        btm_vec_f = open(btm_vec_fp, 'r')
        for line in btm_vec_f:
            doc_vec = np.nan_to_num(parse_feature_vec(line))
            for dis_id in range(len(dis_func_names)):
                vec = [0.] * 1999
                for topic_id in range(1999):
                    topic_vec = topic_btm_vec[topic_id]
                    if 'minkowski' == dis_func_names[dis_id]:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec, 3)
                    else:
                        vec[topic_id] = eval(dis_func_names[dis_id])(doc_vec, topic_vec)
                btm_dis_feature_f[dis_id].write('%s\n' % ','.join([str(num) for num in vec]))

    for f in btm_dis_feature_f:
        f.close()
Example #14
 def load_index(fp):
     '''
     Load a feature index file
     '''
     f = open(fp)
     indexs = [int(line) for line in f.readlines()]
     LogUtil.log("INFO", "load index done, len(index)=%d" % (len(indexs)))
     f.close()
     return indexs
Example #15
 def get_labels(df):
     """
     Get labels of data set
     :param df: original data set
     :return: label list of data set
     """
     labels = df['is_duplicate'].tolist()
     LogUtil.log("INFO", "num(1)=%d, num(0)=%d" % (sum(labels), len(labels) - sum(labels)))
     return labels
Example #16
 def sample_row(features, indexs):
     '''
     Sample rows by index
     '''
     features_sampled = features[indexs, :]
     (row_num, col_num) = features_sampled.shape
     LogUtil.log("INFO",
                 "row sample done, shape=(%d,%d)" % (row_num, col_num))
     return features_sampled
Example #17
 def stat_dul_question(df):
     """
     Make statistics to duplication of questions
     :param df: original data set
     :return: none
     """
     questions = df['question1'].tolist() + df['question2'].tolist()
     len_questions = len(questions)
     len_uniq_questions = len(set(questions))
     LogUtil.log("INFO", "len(questions)=%d, len(unique_questions)=%d, rate=%f" % (
         len_questions, len_uniq_questions, 1.0 * len_uniq_questions / len_questions))
Example #18
 def test_mytaobao(self):
     driver = self.driver
     self.test_login(driver)
     try:
         main_page = page.MainPage(driver)
         main_page.goto_profile_page()
     except Exception as e:
         self.testCaseInfo.errorinfo = repr(e)
         LogUtil.log(('Got error: ' + repr(e)))
     else:
         self.testCaseInfo.result = 'Pass'
Example #19
    def init_powerful_word_oside(pword, thresh_num, thresh_rate):
        pword_oside = []
        pword = filter(lambda x: x[1][0] * x[1][3] >= thresh_num, pword)

        pword_oside.extend(
            map(lambda x: x[0], filter(lambda x: x[1][4] >= thresh_rate,
                                       pword)))
        LogUtil.log(
            'INFO', 'One side power words(%d): %s' %
            (len(pword_oside), str(pword_oside)))
        return pword_oside
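
A hypothetical illustration of the two filters above; the shape of each pword entry is inferred from the code (stats[0] * stats[3] used as a support count, stats[4] as the one-sided rate) and the numbers are invented:

pword = [
    ('apple',  [100, 0, 0, 0.5, 0.95]),   # support 50, rate 0.95 -> kept
    ('banana', [10,  0, 0, 0.1, 0.99]),   # support 1, dropped by thresh_num
    ('cherry', [200, 0, 0, 0.5, 0.30]),   # rate 0.30, dropped by thresh_rate
]
kept = [w for w, s in pword if s[0] * s[3] >= 20 and s[4] >= 0.9]
print(kept)   # ['apple']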
Example #20
    def setUp(self):
        self.driver = webdriver.Chrome()
        self.base_url = 'http://www.taobao.com'
        self.testCaseInfo = TestCaseInfo(id='1',
                                         name=self.__str__(),
                                         owner='Oliver')
        self.testReport = TestReport()
        LogUtil.create_logger_file(__name__)

        self.testCaseInfo.starttime = common.get_current_time()
        LogUtil.log('Open base url: %s' % self.base_url)
Example #21
 def run(cf, argv):
     cmd = argv[0]
     if 'run_gen_feature_with_swap' == cmd:
         FeatureProcessor.run_gen_feature_swap(cf, argv[1:])
         FeatureProcessor.run_gen_feature_with_swap(cf, argv[1:])
     elif 'run_gen_feature_extra' == cmd:
         FeatureProcessor.run_gen_feature_extra(cf)
     elif 'run_gen_feature_with_extra' == cmd:
         FeatureProcessor.run_gen_feature_with_extra(cf, argv[1:])
     else:
         LogUtil.log('WARNING', 'NO CMD (%s)' % cmd)
Example #22
 def generateBOW(df_features,vocab_size):
     now = datetime.datetime.now()
     print(now.strftime('%Y-%m-%d %H:%M:%S'))        
     LogUtil.log("INFO", "Start to generate attribute BOW!")
     BagOfWordsExtractor = CountVectorizer(max_features=vocab_size,
                                         analyzer='word',
                                         lowercase=True)
     bow_features = BagOfWordsExtractor.fit_transform(df_features)
     now = datetime.datetime.now()  # refresh the timestamp for the end-of-run log
     print(now.strftime('%Y-%m-%d %H:%M:%S'))
     LogUtil.log("INFO", "End to generate attribute BOW!")
     return bow_features.toarray()
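
A standalone sketch of the same CountVectorizer call on a toy corpus (documents and vocab_size are invented):

from sklearn.feature_extraction.text import CountVectorizer

docs = ['red apple pie', 'green apple', 'red red car']
bow = CountVectorizer(max_features=5, analyzer='word', lowercase=True)
X = bow.fit_transform(docs)
print(X.toarray())
# columns are the alphabetically sorted vocabulary: apple, car, green, pie, red
# [[1 0 0 1 1]
#  [1 0 1 0 0]
#  [0 1 0 0 2]]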
Example #23
def entropy_loss(labels, preds):
    epsilon = 1e-15
    s = 0.
    for idx, l in enumerate(labels):
        assert l == 1 or l == 0
        score = preds[idx]
        score = max(epsilon, score)
        score = min(1 - epsilon, score)
        s += - l * math.log(score) - (1. - l) * math.log(1 - score)
    s /= len(labels)
    LogUtil.log('INFO', 'Entropy loss : %f' % (s))
    return s
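
A quick numeric check of the loss above on made-up labels and predictions, repeating the same clipping and averaging without the LogUtil dependency:

import math

labels = [1, 0, 1]
preds = [0.9, 0.2, 0.6]
epsilon = 1e-15
s = 0.
for l, p in zip(labels, preds):
    p = min(1 - epsilon, max(epsilon, p))
    s += -l * math.log(p) - (1. - l) * math.log(1 - p)
print(s / len(labels))   # ~0.2798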
Example #24
 def merge_col(features_1, features_2):
     """
     merge two feature matrices by column
     :param features_1: the first part of features
     :param features_2: the second part of features
     :return: feature matrix
     """
     features = hstack([features_1, features_2])
     (row_num, col_num) = features.shape
     LogUtil.log("INFO",
                 "merge col done, shape=(%d,%d)" % (row_num, col_num))
     return features
Example #25
 def test_repeat_next(self):
     try:
         driver = self.driver
         main_page = page.MainPage(driver)
         main_page.open(self.base_url)
         assert 'Selenium with Python' in main_page.page_source()
         main_page.repeat_next()
     except Exception as e:
         self.testCaseInfo.errorinfo = repr(e)
         LogUtil.log(('Got error: ' + repr(e)))
     else:
         self.testCaseInfo.result = 'Pass'
Example #26
    def setUp(self):
        self.driver = webdriver.Chrome()
        # self.driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
        self.base_url = "http://selenium-python.readthedocs.io/"
        self.testCaseInfo = TestCaseInfo(id='3',
                                         name=self.__str__(),
                                         owner='Oliver')
        self.testReport = TestReport()
        LogUtil.create_logger_file(__name__)

        self.testCaseInfo.starttime = common.get_current_time()
        LogUtil.log('Open base url: %s' % self.base_url)
Example #27
 def merge_row(features_1, features_2):
     """
     Merge feature matrices by row, i.e. concatenate two data sets
     :param features_1:
     :param features_2:
     :return:
     """
     features = vstack([features_1, features_2])
     (row_num, col_num) = features.shape
     LogUtil.log("INFO",
                 "merge row done, shape=(%d,%d)" % (row_num, col_num))
     return features
Example #28
 def sample_col(features, indexs):
     """
     Sample columns by index
     :param features:
     :param indexs:
     :return:
     """
     features_sampled = features[:, indexs]
     (row_num, col_num) = features_sampled.shape
     LogUtil.log("INFO",
                 "col sample done, shape=(%d,%d)" % (row_num, col_num))
     return features_sampled
Example #29
def train_sgcn(hidden_size, label_size, n_nodes, sgcn_layer, id_list, all_label, batch_size, model_path,
               step_save_model=50, lr=0.001, epoch=10, window=4, gpu_id=0):
    path = ''
    import tensorflow as tf
    with tf.device('/cpu:0'):
    #with tf.device('/device:GPU:%d' % gpu_id):
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=False,
                log_device_placement=False)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                model = SGCN(hidden_size, n_nodes, label_size, sgcn_layer, 0)
                global_step = tf.Variable(0, name="global_step", trainable=False)
                optimizer = tf.train.AdamOptimizer(lr)
                grads_and_vars = optimizer.compute_gradients(model.loss)
                train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
                best_eval_accuracy = 0.0
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=3)
                init = tf.global_variables_initializer()
                sess.run(init)
                for epoch in range(epoch):
                    LogUtil.log("INFO", "epoch:{}".format(epoch))
                    LogUtil.log("INFO", "start training: {}".format(datetime.datetime.now()))
                    x_train_class = SGCNData(id_list, all_label, window)
                    slices = x_train_class.generate_batch(batch_size)
                    for step in range(len(slices)):
                        LogUtil.log('INFO','Training at step:{} '.format(step))
                        i = slices[step]
                        alias_inputs, A, items, node_masks, targets = x_train_class.get_slice(i)
                        targets_onehot = label_to_onehot(targets, label_size)
                        feed_dict = {
                            model.items: items,
                            model.A: A,
                            model.alias_input: alias_inputs,
                            model.node_masks: node_masks,
                            model.labels: targets_onehot,
                            model.dropout: 0.5
                        }
                        _, step, loss, accuracy = sess.run([train_op, global_step, model.loss, model.acc],
                                                           feed_dict)
                        current_step = tf.train.global_step(sess, global_step)
                        if current_step % step_save_model == 0:
                            time_str = datetime.datetime.now().isoformat()
                            LogUtil.log('INFO',
                                "{}: Training step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                            if accuracy > best_eval_accuracy:
                                best_eval_accuracy = accuracy
                                path = saver.save(sess, model_path, global_step=current_step)
                                LogUtil.log('INFO',"Saved model checkpoint to {}\n".format(path))
    return path
Example #30
def generate(config, argv):
    data_name = argv[0]

    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (
            config.get('DIRECTORY', 'index_pt'),
            config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]

        source_file_path = config.get('DIRECTORY',
                                      'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path,
                                              valid_index_off)

        features = valid_index_off
    elif 'online' == data_name:

        source_file_path = config.get('DIRECTORY',
                                      'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()

        features = range(len(source_data))
    else:
        source_data = None
        features = None

    id_feature_file_path = '%s/instance_fs_id.%s.smat' % (config.get(
        'DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(id_feature_file_path, 'w')

    feature_file.write('%d %d\n' % (len(source_data), 1))
    for id_num in features:
        feature = list()

        feature.append(id_num % 100000)

        Feature.save_feature(feature, feature_file)

    feature_file.close()