Example #1
def GENERATE_RANDOM_EXPANDERS(K, size_H, EPSILON, samples):
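    # Generate `samples` random K-regular graphs on size_H nodes as adjacency
    # list matrices, average their second-highest eigenvalues (computed by
    # helpers.generate_eigenvalue) and write the average with helpers.write_result.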

    NAME = '[RANDOM' + str(K) + ']'

    print NAME + " Generating " + str(samples) + " H (adjacency list matrices) of size " + str(size_H) + " x " + str(K) + " ... "
    print "\n"

    eigenvalue = 0

    for sampling in range(samples):
        print NAME + " ##  " + str(sampling) + "  //  " + str(samples) + "  ## "

        H = generate_expander(K, size_H)
        eigenvalue_aux = helpers.generate_eigenvalue(H, size_H, K, EPSILON,
                                                     NAME)

        eigenvalue += eigenvalue_aux

    eigenvalue = eigenvalue / samples

    print NAME + " Calculated average of second highest eigenvalue for " + str(samples) + " matrices H."

    helpers.write_result(NAME, size_H, K, eigenvalue)
    helpers.cleanup(".aux")
Example #2
def GENERATE_ANGLUIN_EXPANDERS(size, A_indices, n, EPSILON):
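    # Build H, the adjacency list matrix of an Angluin-style bipartite expander:
    # side A holds the n*n pairs (x, y) over Z_n x Z_n (indices 0 .. size-1),
    # side B holds the same pairs shifted by `size`, and every A-node is joined
    # to the B-nodes (x, y), (x + y, y) and (y + 1, -x).  Only columns 0-2 of H
    # are written, so the module-level constant K is expected to be 3 here.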
    size_H = 2 * size

    print NAME + " Generating H (adjacency list matrix) of size " + str(size_H) + " x " + str(K) + " ... "

    H = numpy.empty(
        shape=(size_H, K),
        dtype=numpy.int32)  # Generate H, empty adjacency list matrix

    for row in A_indices:
        for element_index in row:  # Get the tuple index from the matrix of indices (A)
            x0 = element_index / n  # Grab first value
            y0 = element_index % n  # Grab second value

            i = element_index  # Grab the index of the (x0, y0) element

            # connect to (x, y) in B
            x = x0
            y = y0
            j = (x * n + y % n) + size  # add the shift in the H indexing

            H[i][0] = j  # node with index i is connected to node with index j
            H[j][0] = i  # vice-versa

            # connect to (x + y, y) in B
            x = (x0 + y0) % n
            y = y0
            j = (x * n + y % n) + size

            H[i][1] = j
            H[j][1] = i

            # connect to (y + 1, -x) in B
            x = (y0 + 1) % n
            y = (-x0) % n
            j = (x * n + y % n) + size

            H[i][2] = j
            H[j][2] = i

    print NAME + " Generated adjacency list matrix H."

    print NAME + " Calculating second highest eigenvalue of H ... "

    eigenvalue = helpers.generate_eigenvalue(H, size_H, K, EPSILON, NAME)

    print NAME + " Calculated second highest eigenvalue of H."

    helpers.write_result(NAME, size_H, K, eigenvalue)
    helpers.cleanup(".aux")
Example #3
os.makedirs(os.path.join(args.data_path, args.dataset, args.now))

# copying validation set to new path
val_df = pd.read_csv('{}/{}/val.csv'.format(args.data_path, args.dataset), header=None, names=['text', 'label'])
val_df.to_csv('{}/{}/{}/val.csv'.format(args.data_path, args.dataset, args.now), header=False, index=False)
del val_df

for avg_iter in range(args.num_avg):
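    # each avg_iter is one independent repetition of the experiment; the
    # args.num_avg runs are presumably averaged when the results are analysed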
    
    print('\nRun {}\n'.format(avg_iter))
    
    # setting datapath to original datasets
    args.datapath = os.path.join(args.data_path, args.dataset)
    
    filename = os.path.join(args.result_path, '{}_{}_{}.csv'.format(args.method, args.dataset, avg_iter))
    helpers.write_result(filename, 'w', ['Train Size', 'loss', 'accuracy', 'total {}'.format(args.method), 'al time', 'train time'], args)
    total = 0
    time_ = 0
    
    for al_iter in range(args.rounds):
    
        # defining text and label fields
        text_field = data.Field(lower=True)#, init_token='<bos>', eos_token='<eos>', tokenize="spacy")
        label_field = data.Field(sequential=False)

        # load data
        print('\nLoading {} data ...\n'.format(args.dataset))
        train_set, val_set, test_set = data_loaders.ds_loader(text_field, label_field, args)
        text_field.build_vocab(train_set)
        label_field.build_vocab(train_set)
        args.train_size, args.val_size, args.test_size = len(train_set), len(val_set), len(test_set) # sizes
Example #4
File: al.py  Project: tinaolivia/lstm_al
def al(init_model, avg_iter, args):
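    # Active-learning loop: evaluate the initial fastai model, then for args.rounds
    # rounds pick new instances from the test pool with the configured query method,
    # move them into the training set, retrain the language model and classifier,
    # and append each round's metrics to the result CSV via helpers.write_result.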

    train_df = pd.read_csv(args.path / 'train.csv',
                           header=None,
                           names=args.names)
    test_df = pd.read_csv(args.path / 'test.csv',
                          header=None,
                          names=args.names)

    print('\nEvaluating initial model ...')
    preds = init_model.validate()

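    # the first call writes the CSV header row ('w' mode); the second appends the
    # initial model's metrics before any active-learning round ('a' mode)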
    helpers.write_result(
        args.save_dir /
        '{}_{}_{}.csv'.format(args.dataset, args.method, avg_iter), 'w', [
            'Train Size', 'loss', 'accuracy', 'total {}'.format(args.method),
            'al time', 'train time'
        ], args)
    helpers.write_result(
        args.save_dir /
        '{}_{}_{}.csv'.format(args.dataset, args.method, avg_iter), 'a',
        [len(train_df['text']), preds[0], preds[1].numpy(), 0, 0, 0], args)

    model = init_model

    print('\nStarting active learning loop ...')
    for al_loop in range(args.rounds):

        print('\nRound {}'.format(al_loop))
        start = time.time()
        # selecting new instances to add to train
        if args.method == 'random':
            if args.cluster:
                subset = methods.random_w_clustering(len(test_df['text']),
                                                     test_df['text'], args)
            else:
                subset = methods.random(len(test_df['text']), args)
            total = 0

        elif args.method == 'entropy':
            subset, total = methods.entropy(model, args, df=test_df['text'])

        elif args.method == 'margin':
            subset, total = methods.margin(model, args, df=test_df['text'])

        elif args.method == 'variation_ratio':
            subset, total = methods.variation_ratio(model,
                                                    args,
                                                    df=test_df['text'])

        elif args.method == 'dropout_variability':
            subset, total = methods.dropout_variability(model,
                                                        args,
                                                        df=test_df['text'])

        elif args.method == 'dropout_entropy':
            subset, total = methods.dropout_entropy(model,
                                                    args,
                                                    df=test_df['text'])

        elif args.method == 'dropout_margin':
            subset, total = methods.dropout_margin(model,
                                                   args,
                                                   df=test_df['text'])

        elif args.method == 'dropout_variation_ratio':
            subset, total = methods.dropout_variation(model,
                                                      args,
                                                      df=test_df['text'])

        end = time.time()
        al_time = end - start

        print('Round {}: {} instances selected according to {}'.format(
            al_loop, len(subset), args.method))

        # updating datasets
        print('\nUpdating datasets ...')
        #helpers.update_datasets(train, test, subset, args)
        helpers.update_datasets(train_df, test_df, subset, args)

        # reload data as DataBunch and retrain the model
        print('\nReloading data ...')
        train_df = pd.read_csv(args.path / args.now / 'train_up.csv',
                               header=None,
                               names=args.names)
        valid_df = pd.read_csv(args.path / args.now / 'val.csv',
                               header=None,
                               names=args.names)
        test_df = pd.read_csv(args.path / args.now / 'test_up.csv',
                              header=None,
                              names=args.names)
        if args.dropout:
            test_df = helpers.check_batch_size(test_df, len(test_df['text']),
                                               args)
        data_lm = TextLMDataBunch.from_df(args.path / args.now,
                                          train_df=train_df,
                                          valid_df=valid_df,
                                          test_df=test_df,
                                          text_cols=args.cols[0],
                                          label_cols=args.cols[1])
        data_clas = TextClasDataBunch.from_df(args.path / args.now,
                                              train_df=train_df,
                                              valid_df=valid_df,
                                              test_df=test_df,
                                              text_cols=args.cols[0],
                                              label_cols=args.cols[1],
                                              vocab=data_lm.train_ds.vocab,
                                              bs=args.bs)

        print('\nRetraining model ...')
        # fine tuning language model
        start = time.time()
        helpers.language_model(data_lm, args)
        # create a classifier
        model = helpers.classifier(data_clas, args)
        end = time.time()
        train_time = end - start

        print('\nEvaluating ...')
        preds = model.validate()
        helpers.write_result(
            args.save_dir /
            '{}_{}_{}.csv'.format(args.dataset, args.method, avg_iter), 'a', [
                len(train_df), preds[0], preds[1].numpy(), total, al_time,
                train_time
            ], args)