def packet_sequences_only(X):
    """Flatten every packet sequence in X and zero-pad them to equal length.

    Each X[i] is treated as a sequence of packets, and each packet as an
    iterable of values; the values of all packets are concatenated into one
    flat list. Shorter sequences are then right-padded with zeros, two zeros
    per missing packet (this assumes every packet carries exactly two
    values -- TODO confirm against the loader).

    NOTE: X is modified in place (every X[i] is replaced by its flattened,
    padded version).

    Returns:
        np.array(X) built from the flattened, padded sequences.
    """
    # Find maximum packet sequence length (stays 0 when X is empty)
    maximum = 0
    for sequence in X:
        maximum = max(maximum, len(sequence))

    # Report the length actually used for padding, not the length of
    # whichever sequence happened to be visited last.
    log('Padding each sequence to {}'.format(maximum))

    # Pad to maximum length, in place
    for i, sequence in enumerate(X):
        flattened = [value for packet in sequence for value in packet]
        # Padding is computed from the number of packets, not from the
        # number of flattened values.
        padding = [0] * (2 * (maximum - len(sequence)))
        X[i] = flattened + padding

    return np.array(X)
def split_training_test(D, Y, train_size, test_size):
    """Splits distances D and labels Y into training and test sets as
    explained below.

    Note: train_size + test_size <= len(D)

    The test set of size test_size is first sampled. Then, a training set
    of size train_size is sampled from the remaining examples. This allows
    to vary the size of the training set, keeping a fixed test set.
    If train_size + test_size = len(D), the function simply splits (D, Y)
    into a training and test set using all the examples.

    Both splits are stratified on the labels and use the module-level
    args.seed as random state.

    Args:
        D: square pairwise distance matrix; must support D[rows, :] and
            D[:, cols] indexing with lists of indexes (presumably a 2-D
            numpy array -- confirm against the caller).
        Y: labels, indexable with a list of indexes, one per row of D.
        train_size: size of the training set, forwarded to
            train_test_split.
        test_size: size of the test set, forwarded to train_test_split.

    Returns:
        (Dtrain, Ytrain, Dtest, Ytest), where Dtrain (resp. Dtest) keeps
        only the rows AND columns of D belonging to the training (resp.
        test) examples.
    """
    n = len(D)
    # NOTE: we need to remove both rows AND columns from
    # the distance matrix.
    log('Splitting into training/test set keeping uniform labels')

    # First get the test set
    I = range(n)    # Indexes
    Iother, Itest = train_test_split(I, test_size=test_size, stratify=Y,
                                     random_state=args.seed)
    Ytest = Y[Itest]
    # Need to do this in two steps: rows first, then columns of the
    # row-reduced matrix (indexing D again here would discard the row
    # selection and return all n rows).
    Dtest = D[Itest, :]
    Dtest = Dtest[:, Itest]

    # Now the training set
    if train_size < len(Iother):
        log('Reduced training set')
        # Now sample train_size instances from Iother to create the
        # training set
        Itrain, _ = train_test_split(Iother, train_size=train_size,
                                     stratify=Y[Iother],
                                     random_state=args.seed)
    else:
        # Nothing to subsample: use every example left after the test split
        Itrain = Iother
    log('Training set has size {}'.format(len(Itrain)))

    Ytrain = Y[Itrain]
    # Need to do this in two steps
    Dtrain = D[Itrain, :]
    Dtrain = Dtrain[:, Itrain]

    return Dtrain, Ytrain, Dtest, Ytest
def run(traces, outfname):
    """Compute pairwise Levenshtein distances for a dataset and store them.

    Loads the dataset from `traces`, encodes it with encode_sizes, computes
    the pairwise Levenshtein distances between the encoded items, and
    serialises the result with dill into `outfname`.

    The stored object is a dict with keys:
        'webpage-id': third value returned by load_dataset,
        'label': the labels as a numpy array,
        'pairdist': the pairwise distances.

    Args:
        traces: dataset location, forwarded to load_dataset.
        outfname: path of the output file (opened in binary write mode).
    """
    packets, labels, webpage_ids, _, _ = load_dataset(traces)
    encoded = encode_sizes(packets)

    log('Computing pairwise distances')
    distances = pairwise_levenshtein_distances(encoded)

    # NOTE(review): no subtraction step is performed in this function; the
    # message below is kept unchanged.
    log('Computing subtractions')

    log('Storing distances into {}'.format(outfname))
    payload = {}
    payload['webpage-id'] = webpage_ids
    payload['label'] = np.array(labels)
    payload['pairdist'] = distances

    with open(outfname, 'wb') as out:
        dill.dump(payload, out)
action='store_true', help='Compute on packet sequences.', required=False, default=False) parser.add_argument('--out', type=str, help='Distance file (.distances).', required=True) args = parser.parse_args() if not args.sequences: X, Y, W, _, _ = load_features(args.features) else: X, Y, W, _, _ = load_dataset(args.features) X = packet_sequences_only(X) log('Computing pairwise distances') D = pairwise_distances(X) log('Computing subtractions') log('Storing distances into {}'.format(args.out)) data = { 'webpage-id': W, 'label': np.array(Y), 'pairdist': D, } with open(args.out, 'wb') as f: dill.dump(data, f)
default=0) parser.add_argument('--bootstrap', help='Use bootstrap.', action='store_true', default=False) parser.add_argument('--target', help='Target page for 1 vs All setting.', required=False, type=int) parser.add_argument('--out', type=str, help='Results file (.json).', required=True) args = parser.parse_args() log('Loading distances from {}'.format(args.distances)) with open(args.distances, 'rb') as f: data = dill.load(f) D = data['pairdist'] Y = np.array(data['label']) if args.target is not None: log('One-vs-all setting using {} as target'.format(args.target)) log('Reducing the dataset for one-vs-all') D, Y = one_vs_all_setting(D, Y, args.target) log('Seed is {}'.format(args.seed)) # (Maybe) apply bootstrap if args.bootstrap:
help='Percentage (or number) of test instances.', required=True) parser.add_argument('--seed', type=int, help='PRNG seed (default: 0).', required=False, default=0) parser.add_argument('--out', type=str, help='Output file (.json).', required=True) args = parser.parse_args() X, Y, _, Npages, Nloads = load_features(args.features) log('Seed is {}'.format(args.seed)) n = len(X) # Get training/test set size if args.train > 1: train_size = int(args.train) else: train_size = int(args.train * n) if args.test > 1: test_size = int(args.test) else: test_size = int(args.test * n) log('Training set size: {}. Test set size: {}.'.format( train_size, test_size)) if train_size + test_size != n: