if shuffle: idxs = random.sample(range(len(X)), len(X)) for i, idx in enumerate(idxs): tmpx, tmpy = X[idx], y[idx] X[idx], y[idx] = X[i], y[i] X[i], y[i] = tmpx, tmpy dev_X, dev_y = X[-1 * val_size:], y[-1 * val_size:] X, y = X[:-1 * val_size], y[:-1 * val_size] return X, y, dev_X, dev_y # Load data trainset = resources.read_relations( "conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-zh-01-08-2016-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) devset = resources.read_relations( "conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-zh-01-08-2016-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) testset = resources.read_relations( "conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-zh-01-08-2016-test/", ignore_types=["Explicit", "AltLex"]) """ trainset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) devset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) testset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-test/", ignore_types=["Explicit", "AltLex"]) """ max_len = 256 # Maximum input sequence length
idxs = random.sample(range(len(X)), len(X)) for i, idx in enumerate(idxs): tmpx, tmpy = X[idx], y[idx] X[idx], y[idx] = X[i], y[i] X[i], y[i] = tmpx, tmpy dev_X, dev_y = X[-1 * val_size:], y[-1 * val_size:] X, y = X[:-1 * val_size], y[:-1 * val_size] return X, y, dev_X, dev_y ## Load data # Chinese trainset = resources.read_relations("conll16st-zh-01-08-2016-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) devset = resources.read_relations("conll16st-zh-01-08-2016-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) testset = resources.read_relations("conll16st-zh-01-08-2016-test/", ignore_types=["Explicit", "AltLex"]) blindset = resources.read_relations("conll16st-zh-04-27-2016-blind-test/", ignore_types=["Explicit", "AltLex"]) # English #trainset = resources.read_relations("conll16st-en-03-29-16-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) #devset = resources.read_relations("conll16st-en-03-29-16-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) #testset = resources.read_relations("conll16st-en-03-29-16-test/", ignore_types=["Explicit", "AltLex"]) #blindset = resources.read_relations("conll15st-en-03-29-16-blind-test/", ignore_types=["Explicit", "AltLex"]) max_len = 256 # Maximum input sequence length
X, y = np.concatenate((dev_X,X)), np.concatenate((dev_y,y)) if shuffle: idxs = random.sample(range(len(X)), len(X)) for i, idx in enumerate(idxs): tmpx, tmpy = X[idx], y[idx] X[idx], y[idx] = X[i], y[i] X[i], y[i] = tmpx, tmpy dev_X, dev_y = X[-1*val_size:], y[-1*val_size:] X, y = X[:-1*val_size], y[:-1*val_size] return X, y, dev_X, dev_y ## Load data trainset = resources.read_relations("data/en.test/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) devset = resources.read_relations("data/en.test/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) testset = resources.read_relations("data/en.test/", ignore_types=["Explicit", "AltLex"]) """ trainset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-train/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) devset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-dev/", ignore_types=["Explicit", "AltLex"], partial_sampling=True) testset = resources.read_relations("conll16st-en-zh-dev-train-test_LDC2016E50/conll16st-en-03-29-16-test/", ignore_types=["Explicit", "AltLex"]) """ max_len = 256 # Maximum input sequence length # Set maximum input sequence length as percentile of actual lengths #max_perc = 98.0 #max_len = int(np.percentile([len(smpl[0]) for smpl in trainset+devset+testset], max_perc)) print ("Maximum sequence length", max_len)