Example #1
def main():
    parser = argparse.ArgumentParser(description="Retrieve petitions from We The People")
    parser.add_argument(
        "-m",
        "--max",
        metavar="INTEGER",
        dest="max",
        type=int,
        default=None,
        help="maximum number of petitions to retrieve",
    )
    parser.add_argument(
        "-s",
        "--start",
        metavar="INTEGER",
        dest="start",
        type=int,
        default=1,
        help="starting page, 20 per page, default is 1",
    )
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error("How can I scrape less than one petition? You make no sense! --max must be one or greater.")

    if args.start < 1:
        parser.error("--start must be one or greater.")

    log("Found %i petitions" % (petitions(args.start, args.max)))

    # write log
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2), "log-wh-" + scrapelog["begin"] + ".json", log_dir())
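
A quick way to sanity-check the two flags above is to hand parse_args an explicit argument list. The sketch below rebuilds only the argparse portion of this example (petitions(), log(), write() and the scrapelog bookkeeping are not needed for it):

import argparse

parser = argparse.ArgumentParser(description="Retrieve petitions from We The People")
parser.add_argument("-m", "--max", metavar="INTEGER", dest="max", type=int, default=None)
parser.add_argument("-s", "--start", metavar="INTEGER", dest="start", type=int, default=1)

# equivalent to invoking the script with: --max 40 --start 2
args = parser.parse_args(["--max", "40", "--start", "2"])
print(args.max, args.start)  # -> 40 2
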
Example #2
def main():
    parser = argparse.ArgumentParser(description="Retrieve petitions from We The People")
    parser.add_argument("-m", "--max", metavar="INTEGER", dest="max", type=int, default=None,
                        help="maximum pages of petitions to retrieve, default is 10, 100 per page")
    parser.add_argument("-s", "--start", metavar="INTEGER", dest="start", type=int, default=1,
                        help="starting page, 100 per page, default is 1")
    parser.add_argument("-q", "--query", metavar="STRING", dest="query", type=str, default="whitehouse+petition",
                        help="The query for searching twitter for petition links, default is 'whitehouse+petition'")
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error("How can I scrape less than one pages of twitter results? You make no sense! --max must be one or greater.")

    if args.start < 1:
        parser.error("--start must be one or greater.")

    if len(sys.argv) <= 1:
        log('Running with default values. Use --help to see options.')

    search(args.query, args.start, args.max)

    #write log
    scrapelog["query"] = args.query
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2), "log-tw-" + scrapelog["begin"] + ".json", log_dir())
    log("Done. Found total %i petitions" % (len(scrapelog["signatures"])))
Example #3
def main():
    parser = argparse.ArgumentParser(
        description="Retrieve petitions from We The People")
    parser.add_argument("-m",
                        "--max",
                        metavar="MAX",
                        dest="max",
                        type=int,
                        default=None,
                        help="maximum number of petitions to retrieve")
    parser.add_argument("-s",
                        "--start",
                        metavar="START",
                        dest="start",
                        type=int,
                        default=1,
                        help="starting page, 20 per page, default is 1")
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error(
            "How can I scrape less than one petition? You make no sense! --max must be one or greater."
        )

    if args.start < 1:
        parser.error("--start must be one or greater.")

    log("Found %i petitions" % (petitions(args.start, args.max)))

    #write log
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2),
          "log-wh-" + scrapelog["begin"] + ".json", log_dir())
Example #4
def main():
    # loads a lot of default parser values from the 'parser' file
    parser = get_parser()

    # get args from parser as an object
    args = parser.parse_args()
    args.device = 'cuda' if args.cuda else 'cpu'

    # initialize seeds
    utils.init_seed(args.seed)

    # print('loader stuff', args)
    loader = Loader.IncrementalLoader(args, seed=args.seed)
    # print('loader stuff after after', args)
    n_inputs, n_outputs, n_tasks = loader.get_dataset_info()

    # setup logging
    # logging is from 'misc_utils.py' from 'utils' folder
    timestamp = utils.get_date_time()  # this line is redundant because log_dir already takes care of it
    args.log_dir, args.tf_dir = utils.log_dir(args, timestamp) # stores args into "training_parameters.json"

    # create the neural-net model
    model = Model.Net(n_inputs, n_outputs, n_tasks, args, innerlr=args.opt_lr, outerlr=args.alpha_init)
    
    # move the model to CUDA if available
    model.net.to(args.device)            

    # for all the CL baselines
    result_val_t, result_val_a, result_test_t, result_test_a, spent_time = life_experience(model, loader, args)

    # save results in files or print on terminal
    save_results(args, result_val_t, result_val_a, result_test_t, result_test_a, model, spent_time)
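
get_parser() is loaded from the project's 'parser' file, so its actual flag set is not shown in this example. A hypothetical sketch of the subset this main() relies on (flag names inferred from the attribute accesses above; defaults are assumed, not taken from the project):

import argparse

def get_parser():
    parser = argparse.ArgumentParser(description="incremental / continual learning run")
    parser.add_argument("--seed", type=int, default=0,
                        help="random seed handed to utils.init_seed")
    parser.add_argument("--cuda", action="store_true",
                        help="if set, args.device becomes 'cuda'")
    parser.add_argument("--opt_lr", type=float, default=0.01,
                        help="inner-loop learning rate (passed as innerlr)")
    parser.add_argument("--alpha_init", type=float, default=0.1,
                        help="outer-loop learning rate (passed as outerlr)")
    return parser
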
Example #5
def train(train_data, test_data=None):
    features, label_map, \
        train_nodes, valid_nodes, test_nodes, \
        train_adj, train_weight_adj, train_column_adj, \
        test_adj, test_weight_adj, test_column_adj = train_data

    # if isinstance(list(class_map.values())[0], list):
    #     num_classes = len(list(class_map.values())[0])
    # else:
    #     num_classes = len(set(class_map.values()))

    num_classes = label_map.shape[1]
    feats_dim = features.shape[1]

    # Not sure appending a zero row actually does anything useful here?
    if features is not None:
        # pad with dummy zero vector
        features = np.vstack([features, np.zeros((feats_dim, ))])
    # Unclear why this is Variable(constant(), trainable=False); it looks odd
    features_info = tf.Variable(tf.constant(features, dtype=tf.float32),
                                trainable=False)

    #context_pairs = train_data[3] if FLAGS.random_context else None
    placeholders = construct_placeholders(num_classes, feats_dim)
    minibatch = NodeMinibatchIterator(
        placeholders,
        #   features,
        #   id_map,
        #   weight_map,
        label_map,
        #   weight_dict,
        supervised_info=[train_nodes, valid_nodes, test_nodes],
        batch_size=FLAGS.batch_size,
        max_degree=FLAGS.max_degree)

    # Note: these are placeholders, and they hold the full-size adjacency data
    # TODO: the shape also carries data info, (, train_adj.shape)
    adj_info_ph = tf.placeholder(tf.int32, shape=train_adj.shape)
    weight_adj_info_ph = tf.placeholder(tf.float32,
                                        shape=train_weight_adj.shape)
    column_adj_info_ph = tf.placeholder(tf.int32, shape=train_column_adj.shape)

    adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")
    weight_adj_info = tf.Variable(weight_adj_info_ph,
                                  trainable=False,
                                  name='weight_adj_info')
    column_adj_info = tf.Variable(column_adj_info_ph,
                                  trainable=False,
                                  name='column_adj_info')

    # Nothing is assigned yet; these are only the assign ops to be run later
    train_adj_info = tf.assign(adj_info, train_adj)
    val_adj_info = tf.assign(adj_info, test_adj)

    train_weight_adj_info = tf.assign(weight_adj_info, train_weight_adj)
    val_weight_adj_info = tf.assign(weight_adj_info, test_weight_adj)

    train_column_adj_info = tf.assign(column_adj_info, train_column_adj)
    val_column_adj_info = tf.assign(column_adj_info, test_column_adj)
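    # (Note) The Variables above are created from placeholders so the large
    # adjacency arrays are not baked into the graph as constants; they get their
    # values from the feed_dict passed to global_variables_initializer further
    # down. The tf.assign ops built here do nothing until they are run: executing
    # the train_* ops restores the training adjacency, and executing the val_*
    # ops swaps in the test adjacency around evaluation.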

    # sampling
    # TODO: the features data still flows in from here
    # TODO: it should be pulled out of the sampler
    sampler = UniformNeighborSampler(features_info, adj_info, weight_adj_info,
                                     column_adj_info)

    # === build model ===
    if FLAGS.model == 'graphsage_mean':
        # Create model
        sampler = UniformNeighborSampler(adj_info, weight_adj_info,
                                         column_adj_info)
        # 16, 8
        if FLAGS.samples_3 != 0:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2),
                SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2)
            ]
        elif FLAGS.samples_2 != 0:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)
            ]
        else:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1)
            ]
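        # Each SAGEInfo above describes one aggregation layer as
        # (layer name, neighbor sampler, neighbors sampled per node, output dim),
        # so a non-zero samples_2 / samples_3 simply adds deeper sampling layers.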

        model = SupervisedGraphsage(num_classes,
                                    placeholders,
                                    features,
                                    adj_info,
                                    minibatch.deg,
                                    layer_infos,
                                    concat=True,
                                    model_size=FLAGS.model_size,
                                    sigmoid_loss=FLAGS.sigmoid,
                                    identity_dim=FLAGS.identity_dim,
                                    logging=True)

    elif FLAGS.model == 'gcn':
        # Create model
        sampler = UniformNeighborSampler(adj_info, weight_adj_info,
                                         column_adj_info)
        layer_infos = [
            SAGEInfo("node", sampler, FLAGS.samples_1, 2 * FLAGS.dim_1),
            SAGEInfo("node", sampler, FLAGS.samples_2, 2 * FLAGS.dim_2)
        ]

        model = SupervisedGraphsage(num_classes,
                                    placeholders,
                                    features,
                                    adj_info,
                                    minibatch.deg,
                                    layer_infos=layer_infos,
                                    aggregator_type="gcn",
                                    model_size=FLAGS.model_size,
                                    concat=False,
                                    sigmoid_loss=FLAGS.sigmoid,
                                    identity_dim=FLAGS.identity_dim,
                                    logging=True)

    elif FLAGS.model == 'geniepath':
        sampler = UniformNeighborSampler(adj_info, weight_adj_info,
                                         column_adj_info)
        layer_infos = [
            SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
            SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)
        ]

        model = SupervisedGraphsage(num_classes,
                                    placeholders,
                                    features,
                                    adj_info,
                                    minibatch.deg,
                                    layer_infos=layer_infos,
                                    aggregator_type="geniepath",
                                    model_size=FLAGS.model_size,
                                    concat=False,
                                    sigmoid_loss=FLAGS.sigmoid,
                                    identity_dim=FLAGS.identity_dim,
                                    logging=True)

    elif FLAGS.model == 'cross':
        # Create model
        # if FLAGS.samples_3 != 0:
        #     layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
        #                    SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2),
        #                    SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2)]
        # elif FLAGS.samples_2 != 0:
        #     layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
        #                    SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]
        # else:
        #     layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1)]
        layer_infos = [
            SAGEInfo("node", FLAGS.samples_1, FLAGS.dim_1),
            SAGEInfo("node", FLAGS.samples_2, FLAGS.dim_2)
        ]
        model = SupervisedGraphsage(
            placeholders,
            feats_dim,
            num_classes,
            sampler,
            # features,
            # adj_info, # variable
            # minibatch.deg,
            layer_infos=layer_infos,
            aggregator_type='cross',  # the extra argument compared with the other branches
            concat=True,
            model_size=FLAGS.model_size,
            sigmoid_loss=FLAGS.sigmoid,
            identity_dim=FLAGS.identity_dim,
            logging=True)

    else:
        raise Exception('Error: model name unrecognized.')

    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION
    config.allow_soft_placement = True

    # Initialize session
    sess = tf.Session(config=config)
    # merged = tf.summary.merge_all()
    # summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)

    # Init variables
    sess.run(tf.global_variables_initializer(),
             feed_dict={
                 adj_info_ph: train_adj,
                 weight_adj_info_ph: train_weight_adj,
                 column_adj_info_ph: train_column_adj
             })

    # === Train model ===
    total_steps = 0
    avg_time = 0.0
    epoch_val_costs = []

    for epoch in range(FLAGS.train_epochs):
        minibatch.shuffle()

        iter = 0
        print('\n### Epoch: %04d ###' % (epoch + 1))
        epoch_val_costs.append(0)
        while not minibatch.end():
            # Construct feed dictionary
            # the full feed dict comes in on every iteration
            feed_dict, labels = minibatch.next_minibatch_feed_dict()
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            # t = time.time()
            outs = sess.run([model.opt_op, model.loss, model.preds],
                            feed_dict=feed_dict)
            # outs = sess.run([merged, model.opt_op, model.loss, model.preds],
            #                 feed_dict=feed_dict)
            # outs = sess.run([merged, model.loss, model.preds],feed_dict=feed_dict)
            # train_cost = outs[2]

            # if iter % FLAGS.validate_iter == 0:
            #     # Validation
            #     # do the assign operation
            #     sess.run([val_adj_info.op, val_weight_adj_info.op, val_column_adj_info.op])

            #     # if a validation sample count has been set
            #     if FLAGS.validate_batch_size == -1:
            #         val_cost, val_f1_mic, val_f1_mac,report, duration, _ = incremental_evaluate(
            #             sess, model, minibatch, FLAGS.batch_size)
            #     else:
            #         val_cost, val_f1_mic, val_f1_mac, duration = evaluate(
            #             sess, model, minibatch, FLAGS.validate_batch_size)

            #     sess.run([train_adj_info.op, train_weight_adj_info.op, train_column_adj_info.op])

            #     epoch_val_costs[-1] += val_cost

            # if total_steps % FLAGS.print_every == 0:
            #     summary_writer.add_summary(outs[0], total_steps)

            # Print results
            # avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1)
            # print("train_time=", "{:.5f}".format(avg_time))

            # if total_steps % FLAGS.print_every == 0:
            # train_f1_mic, train_f1_mac = calc_f1(labels, outs[-1])
            # train_accuracy = calc_acc(labels,outs[-1])
            # report = classification_report(labels,outs[-1])
            # print("Iter:", '%04d' % iter,
            #       "train_loss=", "{:.5f}".format(train_cost),
            #       "train_f1_mic=", "{:.5f}".format(train_f1_mic),
            #       "train_f1_mac=", "{:.5f}".format(train_f1_mac),
            #       "val_loss=", "{:.5f}".format(val_cost),
            #       "val_f1_mic=", "{:.5f}".format(val_f1_mic),
            #       "val_f1_mac=", "{:.5f}".format(val_f1_mac),
            #       "time=", "{:.5f}".format(avg_time))
            #print(report)

            iter += 1
            total_steps += 1

            if total_steps > FLAGS.max_total_steps:
                break

        # when each epoch ends
        # show the F1 report
        if epoch % 1 == 0:  # i.e. every epoch

            # sess.run([val_adj_info.op, val_weight_adj_info.op, val_column_adj_info.op])
            sess.run([val_adj_info, val_weight_adj_info, val_column_adj_info])

            # val_cost, val_f1_mic, val_f1_mac, report, duration = incremental_evaluate(
            #     sess, model, minibatch, FLAGS.batch_size)
            # area = my_incremental_evaluate(
            #     sess, model, minibatch, FLAGS.batch_size)
            # # precision, recall, thresholds = precision_recall_curve(
            # #     val_labels[:, 1], val_preds[:, 1])
            # # area2 = auc(recall, precision)

            # print("Full validation stats:",
            #       "loss=", "{:.5f}".format(val_cost),
            #       "f1_micro=", "{:.5f}".format(val_f1_mic),
            #       "f1_macro=", "{:.5f}".format(val_f1_mac),
            #       "time=", "{:.5f}".format(duration))

            # print(report)
            # print('AUC',area)

            test_cost, test_f1_mic, test_f1_mac, report, duration = incremental_evaluate(
                sess, model, minibatch, FLAGS.batch_size, test=True)
            area = my_incremental_evaluate(sess,
                                           model,
                                           minibatch,
                                           FLAGS.batch_size,
                                           test=True)
            # precision, recall, thresholds = precision_recall_curve(
            #     test_labels[:, 1], test_preds[:, 1])
            # area2 = auc(recall, precision)

            print("Full Test stats:", "loss=", "{:.5f}".format(test_cost),
                  "f1_micro=", "{:.5f}".format(test_f1_mic), "f1_macro=",
                  "{:.5f}".format(test_f1_mac), "time=",
                  "{:.5f}".format(duration))
            print(report)
            print('AUC', area)

            # once AUC exceeds 0.83, save the model
            if area > 0.83:
                model.save(sess)
                print('AUC gotcha! model saved.')

                # np.save('../data/'+FLAGS.model+'aggr'+'_precision',precision)
                # np.save('../data/'+FLAGS.model+'aggr'+'_recall',recall)

        # should really add early stopping here
        if total_steps > FLAGS.max_total_steps:
            break

    # model.save(sess)
    print("Optimization Finished!")

    sess.run([val_adj_info.op, val_weight_adj_info.op, val_column_adj_info.op])
    # val_cost, val_f1_mic, val_f1_mac, report, duration, area = incremental_evaluate(
    #     sess, model, minibatch, FLAGS.batch_size)
    # area = my_incremental_evaluate(
    #     sess, model, minibatch, FLAGS.batch_size)
    # precision, recall, thresholds = precision_recall_curve(
    #     val_labels[:, 1], val_preds[:, 1])
    # area = auc(recall, precision)

    # np.save('../data/val_preds.npy', val_preds)
    # np.save('../data/val_labels.npy', val_labels)
    # np.save('../data/val_cost.npy', v_cost)

    # print("Full validation stats:",
    #       "loss=", "{:.5f}".format(val_cost),
    #       "f1_micro=", "{:.5f}".format(val_f1_mic),
    #       "f1_macro=", "{:.5f}".format(val_f1_mac),
    #       "time=", "{:.5f}".format(duration))
    # print(report)
    # print('AUC', area)

    # with open(log_dir() + "val_stats.txt", "w") as fp:
    #     fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f} time={:.5f}".
    #              format(val_cost, val_f1_mic, val_f1_mac, duration))

    test_cost, test_f1_mic, test_f1_mac, report, duration = incremental_evaluate(
        sess, model, minibatch, FLAGS.batch_size, test=True)
    area = my_incremental_evaluate(sess,
                                   model,
                                   minibatch,
                                   FLAGS.batch_size,
                                   test=True)
    # precision, recall, thresholds = precision_recall_curve(
    #     test_labels[:, 1], test_preds[:, 1])
    # area = auc(recall, precision)

    # np.save('../data/test_preds.npy', test_preds)
    # np.save('../data/test_labels.npy', test_labels)
    # np.save('../data/test_cost.npy', t_cost) # prevent from override

    print("Full Test stats:", "loss=", "{:.5f}".format(test_cost), "f1_micro=",
          "{:.5f}".format(test_f1_mic), "f1_macro=",
          "{:.5f}".format(test_f1_mac), "time=", "{:.5f}".format(duration))
    print(report)
    print('AUC:', area)

    with open(log_dir() + "test_stats.txt", "w") as fp:
        fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f}".format(
            test_cost, test_f1_mic, test_f1_mac))
Example #6
    def fit(self, X, y, n_epochs=100, X_valid=None, y_valid=None):
        """훈련 세트에 모델을 훈련시킵니다. X_valid와 y_valid가 주어지면 조기 종료를 적용합니다."""
        self.close_session()
        tf.summary.merge_all()


        # Get n_inputs and n_outputs from the training set.
        n_inputs = X.shape[1]
        self.classes_ = np.unique(y)
        n_outputs = len(self.classes_)

        # Convert the label vector into a vector of sorted class indices,
        # containing integers from 0 to n_outputs - 1.
        # For example, if y is [8, 8, 9, 5, 7, 6, 6, 6], the sorted class
        # labels (self.classes_) are [5, 6, 7, 8, 9] and the label vector
        # is converted to [3, 3, 4, 0, 2, 1, 1, 1].
        self.class_to_index_ = {label: index
                                for index, label in enumerate(self.classes_)}
        y = np.array([self.class_to_index_[label]
                      for label in y], dtype=np.int32)

        self._graph = tf.Graph()
        with self._graph.as_default():
            self._build_graph(n_inputs, n_outputs)
            # extra ops for batch normalization: the UPDATE_OPS collection holds
            # the moving-average updates that must be run alongside the training op
            extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # for early stopping
        max_checks_without_progress = 20
        checks_without_progress = 0
        best_loss = np.inf
        best_params = None

        # training

        file_writer = tf.summary.FileWriter(log_dir("board_log"), graph=self._graph)

        self._session = tf.Session(graph=self._graph)
        with self._session.as_default() as sess:
            self._init.run()
            for epoch in range(n_epochs):
                rnd_idx = np.random.permutation(len(X))
                for rnd_indices in np.array_split(rnd_idx, len(X) // self.batch_size):
                    X_batch, y_batch = X[rnd_indices], y[rnd_indices]
                    feed_dict = {self._X: X_batch, self._y: y_batch}
                    if self._training is not None:
                        feed_dict[self._training] = True
                    sess.run(self._training_op, feed_dict=feed_dict)
                    if extra_update_ops:
                        sess.run(extra_update_ops, feed_dict=feed_dict)
                if X_valid is not None and y_valid is not None:
                    loss_val, acc_val, loss_str, acc_str = sess.run([self._loss, self._accuracy, self._loss_str, self._acc_str],
                                                 feed_dict={self._X: X_valid,
                                                            self._y: y_valid})
                    if loss_val < best_loss:
                        best_params = self._get_model_params()
                        best_loss = loss_val
                        checks_without_progress = 0
                    else:
                        checks_without_progress += 1
                    print("{}\t검증 세트 손실: {:.6f}\t최선의 손실: {:.6f}\t정확도: {:.2f}%".format(
                        epoch, loss_val, best_loss, acc_val * 100))
                    file_writer.add_summary(summary=acc_str, global_step=epoch)
                    file_writer.add_summary(summary=loss_str, global_step=epoch)
                    if checks_without_progress > max_checks_without_progress:
                        print("조기 종료!")
                        break

                else:
                    loss_train, acc_train = sess.run([self._loss, self._accuracy],
                                                     feed_dict={self._X: X_batch,
                                                                self._y: y_batch})
                    print("{}\t마지막 훈련 배치 손실: {:.6f}\tAccuracy: {:.2f}%".format(
                        epoch, loss_train, acc_train * 100))
            # If early stopping was triggered, roll back to the best model found.

            if best_params:
                self._restore_model_params(best_params)


            return self
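
A minimal usage sketch, assuming fit() belongs to a scikit-learn-style wrapper class (called DNNClassifier here only for illustration) whose constructor sets batch_size and whose _build_graph defines the loss/accuracy tensors and summaries used above:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_train, X_valid, y_train, y_valid = train_test_split(
    X.astype("float32"), y, test_size=0.2, random_state=42)

clf = DNNClassifier(batch_size=50)  # hypothetical wrapper class containing the fit() above
clf.fit(X_train, y_train, n_epochs=100, X_valid=X_valid, y_valid=y_valid)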