Example #1
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    is_train = FLAGS.is_train
    is_tuning_hyper_para = FLAGS.is_tuning_hyper_para

    if is_train:
        ks = FLAGS.ks
        k_list = [int(k.strip()) for k in ks.split(',')]

        compute_prior_posterior_prob(k_list=k_list,
                                     opt_hyper_para=is_tuning_hyper_para)
    else:
        model_dir = FLAGS.model_dir
        k = FLAGS.pred_k

        output_file = FLAGS.output_file
        top_k = FLAGS.top_k

        model_type = FLAGS.model_type
        batch_size = FLAGS.batch_size
        num_readers = FLAGS.num_readers
        feature_names = FLAGS.feature_names
        feature_sizes = FLAGS.feature_sizes

        test_data_pattern = FLAGS.test_data_pattern
        reader = get_reader(model_type, feature_names, feature_sizes)
        test_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=test_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

        train_data_pattern = FLAGS.train_data_pattern
        inner_reader = get_reader(model_type, feature_names, feature_sizes)
        train_data_pipeline = DataPipeline(reader=inner_reader,
                                           data_pattern=train_data_pattern,
                                           batch_size=batch_size,
                                           num_readers=num_readers)

        pred_obj = Predict(train_data_pipeline, model_dir, k=k)
        pred_obj.make_predictions(test_data_pipeline, output_file, top_k=top_k)
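Example #1 dispatches on FLAGS.is_train between training (compute_prior_posterior_prob) and prediction (Predict). As a hedged sketch only, such flags would typically be declared at module level with TensorFlow 1.x's flags API before tf.app.run() dispatches to main; the flag names below mirror the snippet, while the defaults and help strings are assumptions rather than the project's actual values:

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS

# Flag names mirror Example #1; defaults and help strings are illustrative only.
tf.app.flags.DEFINE_boolean('is_train', True, 'Train (True) or predict (False).')
tf.app.flags.DEFINE_boolean('is_tuning_hyper_para', False, 'Tune k on a validate set.')
tf.app.flags.DEFINE_string('ks', '8,16,32', 'Comma-separated candidate values of k.')
tf.app.flags.DEFINE_integer('pred_k', 8, 'Value of k used at prediction time.')
tf.app.flags.DEFINE_integer('top_k', 20, 'Number of top predictions to output.')
tf.app.flags.DEFINE_string('model_dir', '/tmp/knn', 'Where model files are stored.')
tf.app.flags.DEFINE_string('output_file', '/tmp/predictions.csv', 'Prediction output file.')
# tf.app.run() would then parse these flags and call the main shown above.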
Example #2
def _read_text_from_file(self, file):
    '''
    Read the given file using the matching reader from readers.py.
    '''
    filetype = os.path.splitext(file)[1]
    reader = get_reader(filetype)
    try:
        return reader(file).read()
    except NotImplementedError:
        self._errors.append(
            '"{}" files are not supported.'.format(filetype))
        return ''
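Example #2 assumes that get_reader maps a file extension to a reader class whose read() method may raise NotImplementedError for unsupported types. The sketch below is a hypothetical registry illustrating that contract; the class names and supported extensions are assumptions, not the project's actual readers.py:

# Hypothetical sketch of the extension-to-reader dispatch assumed by Example #2.
class TxtReader:
    def __init__(self, path):
        self.path = path

    def read(self):
        with open(self.path, encoding='utf-8') as f:
            return f.read()


class UnsupportedReader:
    def __init__(self, path):
        self.path = path

    def read(self):
        # Signals the caller (see Example #2) that this file type is not handled.
        raise NotImplementedError


_READERS = {'.txt': TxtReader, '.md': TxtReader}


def get_reader(filetype):
    # Fall back to a reader whose read() raises NotImplementedError.
    return _READERS.get(filetype.lower(), UnsupportedReader)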
Example #3
def main(unused_argv):
    logging.set_verbosity(tf.logging.INFO)
    reader = get_reader(FLAGS.feature_names,
                        FLAGS.feature_sizes,
                        FLAGS.frame_features)
    if FLAGS.output_file == "":
        raise ValueError("'output_file' was not specified. "
                         "Unable to continue with inference.")
    if FLAGS.test_data_pattern == "":
        raise ValueError("'test_data_pattern' was not specified. "
                         "Unable to continue with inference.")
    inference(reader, FLAGS.train_dir, FLAGS.test_data_pattern,
              FLAGS.output_file, FLAGS.batch_size, FLAGS.top_k)
Example #4
def load_data(fn, options, max_chars=None):
    read = get_reader(options.input_format)
    texts, labels = [], []
    with open(fn) as f:
        for ln, (text, text_labels) in enumerate(read(f, fn), start=1):
            if options.multiclass and not text_labels:
                raise ValueError(f'missing label on line {ln} in {fn}: {text!r}')
            elif options.multiclass and len(text_labels) > 1:
                raise ValueError(f'multiple labels on line {ln} in {fn}: {text_labels}')
            texts.append(text[:max_chars])
            labels.append(text_labels)
    print(f'loaded {len(texts)} examples from {fn}', file=sys.stderr)
    return texts, labels
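load_data treats the object returned by get_reader(options.input_format) as a callable that takes an open file handle plus its name and yields (text, labels) pairs. The reader below is a hypothetical tab-separated example of that interface, not the project's actual implementation:

def read_tsv(f, fn):
    # Hypothetical reader matching the interface used in load_data:
    # yields (text, labels), where labels is a list of label strings.
    # fn is accepted for parity with the call site but unused here.
    for line in f:
        fields = line.rstrip('\n').split('\t')
        text = fields[0]
        labels = fields[1].split(',') if len(fields) > 1 and fields[1] else []
        yield text, labels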
Example #5
def main(_):
    logging.set_verbosity(logging.INFO)
    # Where training checkpoints are stored.
    train_model_dirs = FLAGS.train_model_dirs
    out_file_location = FLAGS.output_file
    top_k = FLAGS.top_k
    test_data_pattern = FLAGS.test_data_pattern
    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    train_model_dirs_list = [e.strip() for e in train_model_dirs.split(',')]
    # Get test data.
    test_data_pipeline = DataPipeline(reader=reader, data_pattern=test_data_pattern,
                                      batch_size=batch_size, num_readers=num_readers)

    # Run inference.
    inference = BootstrapInference(train_model_dirs_list)
    inference.transform(test_data_pipeline, out_file_location, top_k=top_k)
Example #6
def main(unused_argv):
    env = json.loads(os.environ.get("TF_CONFIG", "{}"))
    cluster_data = env.get("cluster", None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get("task", None) or {"type": "master", "index": 0}
    task = type("TaskSpec", (object, ), task_data)
    logging.set_verbosity(tf.logging.INFO)
    logging.info(f"{str_task(task)}: Tensorflow version: {tf.__version__}.")
    if not cluster or task.type == "master" or task.type == "worker":
        model = find_class_by_name(FLAGS.model,
                                   [frame_level_models, video_level_models])()
        reader = get_reader(FLAGS.feature_names, FLAGS.feature_sizes,
                            FLAGS.frame_features)
        model_exporter = ModelExporter(frame_features=FLAGS.frame_features,
                                       model=model,
                                       reader=reader)
        Trainer(cluster, task, FLAGS.train_dir, model, reader, model_exporter,
                FLAGS.log_device_placement, FLAGS.export_model_steps,
                FLAGS.max_steps).run(start_new_model=FLAGS.start_new_model)
    elif task.type == "ps":
        ParameterServer(cluster, task).run()
    else:
        raise ValueError(f"{str_task(task)}: Invalid task_type: {task.type}.")
Example #7
def main(unused_argv):
    """
    Training.
    init_learning_rate: Initial learning rate.
    decay_steps: Number of training steps between learning rate decays.
    decay_rate: Multiplicative factor applied at each learning rate decay.
    l2_reg_rate: L2 regularization rate.
    epochs: Maximum number of passes over the training data.
    """
    logging.set_verbosity(logging.INFO)

    output_dir = FLAGS.output_dir
    start_new_model = FLAGS.start_new_model

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers
    init_with_linear_clf = FLAGS.init_with_linear_clf
    is_bootstrap = FLAGS.is_bootstrap

    # Increase num_readers.
    validate_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=validate_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Sample validate set for line search in linear classifier or logistic regression early stopping.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline)
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    start_new_model = start_new_model or (not tf.gfile.Exists(output_dir))

    # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels in training set, numpy float format to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # num_neg / num_pos, assuming neg_weights == 1.0.
        pos_weights = np.sqrt(
            (float(NUM_TRAIN_EXAMPLES) - train_sum_labels) / train_sum_labels)
        logging.info(
            'Computed pos_weights from sum_labels in the train set.')
    except IOError:
        logging.error('Cannot load train sum_labels. Using default value.')
        pos_weights = None
    finally:
        logging.info('pos_weights disabled.')
        # Set to None to disable pos_weights.
        pos_weights = None

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)
    if start_new_model:
        # Load train data mean and std.
        train_features_mean, train_features_var = load_features_mean_var(
            reader)

        tr_data_fn = standard_scale
        tr_data_paras = {
            'mean': train_features_mean,
            'variance': train_features_var,
            'reshape': False,
            'size': None
        }

        if init_with_linear_clf:
            # ...Start linear classifier...
            # Compute weights and biases of linear classifier using normal equation.
            # Line search helps little.
            linear_clf = LinearClassifier(
                logdir=path_join(output_dir, 'linear_classifier'))
            linear_clf.fit(data_pipeline=train_data_pipeline,
                           tr_data_fn=tr_data_fn,
                           tr_data_paras=tr_data_paras,
                           l2_regs=[
                               0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
                               1.0, 10.0, 100.0, 1000.0
                           ],
                           validate_set=(validate_data, validate_labels),
                           line_search=True)
            linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases

            logging.info(
                'linear classifier weights and biases with shape {}, {}'.
                format(linear_clf_weights.shape, linear_clf_biases.shape))
            logging.debug(
                'linear classifier weights and {} biases: {}.'.format(
                    linear_clf_weights, linear_clf_biases))
            # ...Exit linear classifier...
        else:
            linear_clf_weights, linear_clf_biases = None, None
    else:
        linear_clf_weights, linear_clf_biases = None, None
        tr_data_fn = None
        tr_data_paras = None

    # Run logistic regression.
    log_reg = LogisticRegression(logdir=path_join(output_dir, 'log_reg'))
    log_reg.fit(train_data_pipeline,
                start_new_model=start_new_model,
                tr_data_fn=tr_data_fn,
                tr_data_paras=tr_data_paras,
                validate_set=(validate_data, validate_labels),
                validate_fn=gap_fn,
                bootstrap=is_bootstrap,
                init_learning_rate=init_learning_rate,
                decay_steps=decay_steps,
                decay_rate=decay_rate,
                epochs=train_epochs,
                l2_reg_rate=l2_reg_rate,
                pos_weights=pos_weights,
                initial_weights=linear_clf_weights,
                initial_biases=linear_clf_biases)
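Example #7 passes tr_data_fn = standard_scale together with the training mean and variance so that features are standardized before the linear classifier and logistic regression are fit. The project's standard_scale presumably builds TensorFlow ops inside the data pipeline; the numpy sketch below only illustrates the arithmetic it is expected to perform, with epsilon as an assumed safeguard against zero variance:

import numpy as np

def standard_scale_np(data, mean, variance, epsilon=1e-8, **unused_kwargs):
    # Zero-mean, unit-variance scaling per feature dimension.
    return (data - mean) / np.sqrt(variance + epsilon)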
Example #8
def compute_prior_posterior_prob(k_list=[8],
                                 smooth_para=1.0,
                                 opt_hyper_para=False):
    if (not opt_hyper_para) and (len(k_list) != 1):
        raise ValueError('Only one k is needed. Check your argument.')

    model_dir = FLAGS.model_dir

    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)

    train_data_pattern = FLAGS.train_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    # Step 1. Compute prior probabilities and store the results.
    start_time = time.time()
    sum_labels, accum_num_videos, labels_prior_prob = compute_prior_prob(
        train_data_pipeline, smooth_para=smooth_para)
    logging.info('Computing prior probability took {} s.'.format(time.time() -
                                                                 start_time))
    save_prior_prob(sum_labels, accum_num_videos, labels_prior_prob, model_dir)

    # Step 2. Compute posterior probabilities; more precisely, the likelihood (sampling distribution).
    # Total number of classes.
    num_classes = reader.num_classes
    range_num_classes = range(num_classes)

    max_k = max(k_list)
    # For each class, keep a count array and a counter_count array.
    # The posterior probability is estimated by counting, for a given label l, the training examples that have
    # exactly j (0 <= j <= k) nearest neighbors with label l, and then normalizing.
    # Here, j is treated as a random variable.
    count_list, counter_count_list = [], []
    for k in k_list:
        count_list.append(np.zeros([k + 1, num_classes], dtype=np.float32))
        counter_count_list.append(
            np.zeros([k + 1, num_classes], dtype=np.float32))

    with tf.Graph().as_default() as g:
        global_step = tf.Variable(0,
                                  trainable=False,
                                  dtype=tf.int64,
                                  name='global_step')
        global_step_inc_op = global_step.assign_add(1)

        video_id_batch, video_batch, video_labels_batch, num_frames_batch = (
            get_input_data_tensors(train_data_pipeline,
                                   num_epochs=1,
                                   name_scope='outer_loop'))

        tf.summary.scalar('global_step', global_step)

        summary_op = tf.summary.merge_all()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

    sess = tf.Session(graph=g)
    sess.run(init_op)

    writer = tf.summary.FileWriter(model_dir, graph=sess.graph)

    inner_reader = get_reader(model_type, feature_names, feature_sizes)

    # Be careful not to get blocked by the input queue.
    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    tol_num_examples_processed = 0

    try:
        while not coord.should_stop():
            # Run one step of the outer loop.
            start_time = time.time()
            video_id_batch_val, video_batch_val, video_labels_batch_val, global_step_val, summary = sess.run(
                [
                    video_id_batch, video_batch, video_labels_batch,
                    global_step_inc_op, summary_op
                ])

            writer.add_summary(summary, global_step=global_step_val)

            logging.info(
                'video_id_batch shape: {}, video_batch shape: {}'.format(
                    video_id_batch_val.shape, video_batch_val.shape))
            # Smaller batch size and fewer readers.
            _train_data_pipeline = DataPipeline(
                reader=inner_reader,
                data_pattern=train_data_pattern,
                batch_size=batch_size,
                num_readers=num_readers)
            # Pass values instead of tensors.
            top_max_k_video_ids, top_max_k_labels = find_k_nearest_neighbors(
                video_id_batch_val,
                video_batch_val,
                _train_data_pipeline,
                is_train=True,
                k=max_k)
            logging.info(
                'Finding k nearest neighbors needs {} s.'.format(time.time() -
                                                                 start_time))
            # logging.debug('topk_video_ids: {}\ntopk_labels: {}'.format(topk_video_ids, topk_labels))

            # Update count_list and counter_count_list.
            for idx, k in enumerate(k_list):
                topk_labels = top_max_k_labels[:, :k]
                # batch_size * delta.
                deltas = topk_labels.astype(np.int32).sum(axis=1)
                # Update count and counter_count for each example.
                for delta, video_labels_val in zip(deltas,
                                                   video_labels_batch_val):
                    inc = video_labels_val.astype(np.float32)
                    count_list[idx][delta, range_num_classes] += inc
                    counter_count_list[idx][delta,
                                            range_num_classes] += 1 - inc

                # logging.debug('count: {}\ncounter_count: {}'.format(count_list[idx], counter_count_list[idx]))

            now = time.time()
            tol_num_examples_processed += video_id_batch_val.shape[0]
            logging.info(
                'Batch processing step {}, elapsed {} s, processed {} examples in total'
                .format(global_step_val, now - start_time,
                        tol_num_examples_processed))

            # Save results regularly.
            if global_step_val % 4 == 0:
                # Save models parameters.
                for k, count, counter_count in zip(k_list, count_list,
                                                   counter_count_list):
                    # Compute posterior probabilities.
                    pos_prob_positive = (smooth_para +
                                         count) / (smooth_para *
                                                   (k + 1) + count.sum(axis=0))
                    pos_prob_negative = (smooth_para + counter_count) / (
                        smooth_para * (k + 1) + counter_count.sum(axis=0))

                    # Write to files for future use.
                    save_posterior_prob(count, counter_count,
                                        pos_prob_positive, pos_prob_negative,
                                        k, model_dir)

    except tf.errors.OutOfRangeError:
        logging.info('Done training -- one epoch limit reached.')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()

    # Wait for threads to finish.
    coord.join(threads)
    sess.close()

    # Save models parameters after passing all examples.
    for k, count, counter_count in zip(k_list, count_list, counter_count_list):
        # Compute posterior probabilities.
        pos_prob_positive = (smooth_para + count) / (smooth_para * (k + 1) +
                                                     count.sum(axis=0))
        pos_prob_negative = (smooth_para + counter_count) / (
            smooth_para * (k + 1) + counter_count.sum(axis=0))

        # Write to files for future use.
        save_posterior_prob(count, counter_count, pos_prob_positive,
                            pos_prob_negative, k, model_dir)

    # Output the best k in validate set.
    if opt_hyper_para:
        validate_data_pattern = FLAGS.validate_data_pattern

        validate_data_pipeline = DataPipeline(
            reader=reader,
            data_pattern=validate_data_pattern,
            batch_size=batch_size,
            num_readers=num_readers)
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline)
        best_k = None
        best_validate_gap = np.NINF
        for k in k_list:
            pred_obj = Predict(train_data_pipeline, model_dir, k=k)
            num_validate_videos = validate_data.shape[0]
            split_indices = np.linspace(
                0,
                num_validate_videos + 1,
                num=max(num_validate_videos // batch_size + 1, 2),
                dtype=np.int32)

            validate_gaps = []
            for i in range(len(split_indices) - 1):
                start_ind = split_indices[i]
                end_ind = split_indices[i + 1]
                ith_predictions = pred_obj.make_batch_predictions(
                    None, validate_data[start_ind:end_ind])
                ith_validate_gap = gap_fn(ith_predictions,
                                          validate_labels[start_ind:end_ind])

                validate_gaps.append(ith_validate_gap * (end_ind - start_ind))

            validate_gap = sum(validate_gaps) / num_validate_videos

            logging.info('k: {}, validate gap: {}'.format(k, validate_gap))
            if validate_gap > best_validate_gap:
                best_k = k
                best_validate_gap = validate_gap
        print('Best k: {}, with validate gap {}'.format(
            best_k, best_validate_gap))
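The posterior update in Example #8 is a Laplace-smoothed frequency estimate: for each class, count[j] holds how many positive examples had exactly j of their k nearest neighbors carrying that label, and smooth_para adds one pseudo-count per possible j. The toy snippet below reproduces the same arithmetic for a single class with made-up counts:

import numpy as np

k, smooth_para = 3, 1.0
# count[j] = number of positive training examples with exactly j of their
# k nearest neighbors sharing the label (toy numbers for one class).
count = np.array([5., 10., 20., 15.])           # shape [k + 1]
pos_prob_positive = (smooth_para + count) / (smooth_para * (k + 1) + count.sum())
print(pos_prob_positive)   # sums to 1.0: a smoothed distribution over j = 0..k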
Example #9
inputlabels = list(map(readers.rmtxt, map(os.path.basename, args.inputfile)))

# if not len(inputlabels)==len(args.inputfile):
#     raise ValueError("Number of labels must match number of inputfiles.")

embeddings_list = []
labels = []
if args.onsimplex:
    embeddings_list_simplex = []

bool_vocabulary = args.vocabulary

words_set = None
if bool_vocabulary:
    reader = readers.get_reader(args.inputfile[0])
    (dictionary_size, dictionary, reversed_dictionary) = \
            reader.read_dictionary(args.inputfile[0])
    words_set = set(dictionary.keys())

for inputname, inputlabel in zip(args.inputfile, inputlabels):
    print("processing %s\n" % inputname)
    reader = readers.get_reader(inputname)
    dictionary_size, dictionary, reversed_dictionary, u_embeddings, v_embeddings = \
        reader.read_embeddings(inputname, vecsize, consideronlyfirstvec, words_set=words_set)

    outputbasename = readers.rmtxt(os.path.basename(inputname))
    outputpcafolder, outputdistsfolder, outputdistribfolder = make_folders(
        inputlabel)

    outputdistsbasename = outputdistsfolder + '/' + outputbasename
Example #10
def main(unused_argv):
    """
    Train the rbf network.
    """
    logging.set_verbosity(logging.INFO)

    start_new_model = FLAGS.start_new_model
    output_dir = FLAGS.output_dir

    # The ratio of examples to sample as centers (prototypes).
    num_centers_ratio = FLAGS.num_centers_ratio
    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    # distance metric, cosine or euclidean.
    dist_metric = FLAGS.dist_metric
    init_with_linear_clf = FLAGS.init_with_linear_clf

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    train_epochs = FLAGS.train_epochs
    l1_reg_rate = FLAGS.l1_reg_rate
    l2_reg_rate = FLAGS.l2_reg_rate

    # ....Start rbf network...
    logging.info('Entering rbf network...')
    # Validate set is not stored in the graph or metadata. Re-create it anyway.
    # Sample validate set for logistic regression early stopping.
    validate_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=validate_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Sample validate set.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline,
            name_scope='sample_validate')
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    # DataPipeline consists of reader, batch size, no. of readers and data pattern.
    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    # If starting a new model or the output dir does not exist, truly start a new model.
    start_new_model = start_new_model or (not tf.gfile.Exists(output_dir))

    if start_new_model:
        # PHASE ONE - selecting prototypes c, computing scaling factors sigma.
        # num_centers = FLAGS.num_centers
        # num_centers_ratio = float(num_centers) / NUM_TRAIN_EXAMPLES

        # The metric is euclidean or cosine. If cosine, alpha=1.0; otherwise it can be less than 1.0.
        if 'cosine' == dist_metric:
            # 200 will lead to decreasing drastically and increasing slowly.
            alpha = 1.0
        else:
            alpha = 1.0
        centers, sigmas = initialize(num_centers_ratio,
                                     data_pipeline=train_data_pipeline,
                                     method='kmeans',
                                     metric=dist_metric,
                                     scaling_method=4,
                                     alpha=alpha)

        # PHASE TWO - computing linear regression weights and biases.
        num_centers = centers.shape[0]
        # Compute mean and variance after data transform.
        tr_data_fn = rbf_transform
        tr_data_paras = {
            'centers': centers,
            'sigmas': sigmas,
            'metric': dist_metric,
            'reshape': True,
            'size': num_centers
        }
        """
        # Include standard scale to rbf transform.
        tr_data_mean, tr_data_var = compute_data_mean_var(train_data_pipeline,
                                                          tr_data_fn=tr_data_fn,
                                                          tr_data_paras=tr_data_paras)
        logging.debug('tr_data_mean: {}\ntr_data_var: {}'.format(tr_data_mean, tr_data_var))
        tr_data_paras.update({'mean': tr_data_mean, 'variance': tr_data_var})
        """
        if init_with_linear_clf:
            # Call linear classification to get a good initial values of weights and biases.
            linear_clf = LinearClassifier(
                logdir=path_join(output_dir, 'linear_classifier'))
            linear_clf.fit(data_pipeline=train_data_pipeline,
                           tr_data_fn=tr_data_fn,
                           tr_data_paras=tr_data_paras,
                           l2_regs=[
                               0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
                               1.0, 10.0, 100.0, 1000.0
                           ],
                           validate_set=(validate_data, validate_labels),
                           line_search=True)
            linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases
        else:
            linear_clf_weights, linear_clf_biases = None, None

        # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
        try:
            # Load sum_labels in training set, numpy float format to compute pos_weights.
            train_sum_labels = load_sum_labels()
            # num_neg / num_pos, assuming neg_weights == 1.0.
            pos_weights = np.sqrt(
                float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0)
            logging.info(
                'Computed pos_weights from sum_labels in the train set.')
        except IOError:
            logging.error('Cannot load train sum_labels. Using default value.')
            pos_weights = None
        finally:
            pos_weights = None
    else:
        linear_clf_weights, linear_clf_biases = None, None
        tr_data_fn, tr_data_paras = None, None
        pos_weights = None

    # PHASE THREE - fine tuning prototypes c, scaling factors sigma and weights and biases.
    log_reg_clf = LogisticRegression(logdir=path_join(output_dir, 'log_reg'))
    log_reg_clf.fit(train_data_pipeline=train_data_pipeline,
                    start_new_model=start_new_model,
                    tr_data_fn=tr_data_fn,
                    tr_data_paras=tr_data_paras,
                    validate_set=(validate_data, validate_labels),
                    validate_fn=gap_fn,
                    init_learning_rate=init_learning_rate,
                    decay_steps=decay_steps,
                    decay_rate=decay_rate,
                    epochs=train_epochs,
                    l1_reg_rate=l1_reg_rate,
                    l2_reg_rate=l2_reg_rate,
                    pos_weights=pos_weights,
                    initial_weights=linear_clf_weights,
                    initial_biases=linear_clf_biases)

    # ....Exit rbf network...
    logging.info('Exit rbf network.')
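Example #10 swaps the transform for tr_data_fn = rbf_transform, so each example is re-encoded by its kernel similarity to the selected centers before the same linear/logistic pipeline runs. The actual rbf_transform builds TF ops and also supports a cosine metric and the scaling chosen by initialize(); under those caveats, the numpy sketch below only illustrates a plain euclidean Gaussian-kernel variant:

import numpy as np

def rbf_transform_np(data, centers, sigmas, **unused_kwargs):
    # data: [batch, dim], centers: [num_centers, dim], sigmas: [num_centers].
    sq_dists = ((data[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
    # Gaussian kernel similarity to each center; output shape [batch, num_centers].
    return np.exp(-sq_dists / (2.0 * sigmas ** 2))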
Example #11
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    start_new_model = FLAGS.start_new_model
    output_dir = FLAGS.output_dir

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l1_reg_rate = FLAGS.l1_reg_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    is_bootstrap = FLAGS.is_bootstrap
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Increase num_readers.
        validate_data_pipeline = DataPipeline(
            reader=reader,
            data_pattern=validate_data_pattern,
            batch_size=batch_size,
            num_readers=num_readers)

        # Sample validate set.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline,
            name_scope='sample_validate')
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    model_save_path = path_join(output_dir, 'mlp_fuse')
    if start_new_model and tf.gfile.Exists(model_save_path):
        logging.info('Starting a new model...')
        # Start new model, delete existing checkpoints.
        try:
            tf.gfile.DeleteRecursively(model_save_path)
        except tf.errors.OpError:
            logging.error('Failed to delete dir {}.'.format(model_save_path))
        else:
            logging.info(
                'Successfully deleted train dir {}.'.format(model_save_path))

    # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels in training set, numpy float format to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # num_neg / num_pos, assuming neg_weights == 1.0.
        pos_weights = np.sqrt(
            float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0)
        logging.info(
            'Computed pos_weights from sum_labels in the train set.')
    except IOError:
        logging.error('Cannot load train sum_labels. Using default value.')
        pos_weights = None
    finally:
        logging.warning('Positive weights disabled.')
        pos_weights = None

    train(train_data_pipeline,
          epochs=train_epochs,
          pos_weights=pos_weights,
          l1_reg_rate=l1_reg_rate,
          l2_reg_rate=l2_reg_rate,
          init_learning_rate=init_learning_rate,
          bootstrap=is_bootstrap,
          validate_set=(validate_data, validate_labels),
          validate_fn=gap_fn,
          logdir=model_save_path)
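Examples #7, #10 and #11 all derive per-class pos_weights as sqrt(num_neg / num_pos), written either as sqrt((NUM_TRAIN_EXAMPLES - sum_labels) / sum_labels) or as sqrt(NUM_TRAIN_EXAMPLES / sum_labels - 1.0); the two forms are algebraically identical. A toy check with made-up class counts:

import numpy as np

NUM_TRAIN_EXAMPLES = 1000.0
train_sum_labels = np.array([10.0, 250.0, 900.0])   # toy per-class positive counts
w1 = np.sqrt((NUM_TRAIN_EXAMPLES - train_sum_labels) / train_sum_labels)
w2 = np.sqrt(NUM_TRAIN_EXAMPLES / train_sum_labels - 1.0)
print(np.allclose(w1, w2))   # True: both are sqrt(num_neg / num_pos) per class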
Example #12
outputfolder = args.outputfolder
if outputfolder.endswith('/'):
    outputfolder = outputfolder[:-1]

outputdatafolder = outputfolder + '/' + inputlabel
os.makedirs(outputdatafolder, exist_ok=True)

tolerance = args.tolerance
tolstr = ''
if tolerance:
    tolstr = '-tol'

analogiesoutputname = outputdatafolder + '/analogies' + tolstr + '.txt'
analogieslogger = init_logger(analogiesoutputname)

reader = readers.get_reader("glove")
words_set = None
if args.vocabulary:
    (dictionary_size, dictionary, reversed_dictionary) = \
        reader.read_dictionary(args.vocabulary)
    words_set = set(dictionary.keys())

# consideronlyfirstvec=None
# if args.subcommand=='linear':
consideronlyfirstvec = True
# elif args.subcommand=='sphere':
#     consideronlyfirstvec=False

# u_biases and v_biases are not returned at the moment, since it is not yet clear how they would be used when evaluating word analogies.
dictionary_size, dictionary, reversed_dictionary, u_embeddings, v_embeddings = \
    reader.read_embeddings(args.inputfile, args.vecsize, consideronlyfirstvec, words_set=words_set)