def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
             slice2roi_path=None, full_batch=False, random=True, infinite=True, min_slices=5, **kwargs):
    if patient_ids:
        patient_paths = []
        for pid in patient_ids:
            patient_paths.append(data_path + '/%s/study/' % pid)
    else:
        patient_paths = glob.glob(data_path + '/*/study/')

    # map patient id -> sorted SAX slice paths, plus the 2ch/4ch views when present
    self.pid2sax_slice_paths = defaultdict(list)
    self.pid2ch2_path, self.pid2ch4_path = {}, {}
    for p in patient_paths:
        pid = int(utils.get_patient_id(p))
        spaths = sorted(glob.glob(p + '/sax_*.pkl'),
                        key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
        if len(spaths) > min_slices:
            self.pid2sax_slice_paths[pid] = spaths
            ch2_path = glob.glob(p + '/2ch_*.pkl')
            self.pid2ch2_path[pid] = ch2_path[0] if ch2_path else None
            ch4_path = glob.glob(p + '/4ch_*.pkl')
            self.pid2ch4_path[pid] = ch4_path[0] if ch4_path else None

    self.patient_ids = list(self.pid2sax_slice_paths.keys())
    self.nsamples = len(self.patient_ids)
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.batch_size = batch_size
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.transformation_params = transform_params
    self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
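# Hedged usage sketch (not part of the original code): constructing an iterator with the
# __init__ above. The class name `PatientsDataGenerator` and all path/parameter values are
# illustrative assumptions; only the keyword names come from the signature above.
train_data_iterator = PatientsDataGenerator(
    data_path='/data/train_pkl',                 # assumed layout: <data_path>/<pid>/study/
    batch_size=4,
    transform_params={'patch_size': (64, 64)},   # contents depend on the project's transforms
    labels_path='/data/train_labels.csv',
    slice2roi_path=None,
    full_batch=True,
    random=True,
    infinite=True,
    min_slices=5,                                # patients with 5 or fewer SAX slices are skipped
)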
def main(args):
    print(args)

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                       key=lambda t: t[0]))

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))

    if n is not None:
        dev_data = dev_data[:n]

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    semiring = \
        MaxPlusSemiring if args.maxplus else (
            LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
        )

    if args.use_rnn:
        rnn = Rnn(word_dim, args.hidden_dim, cell_type=LSTM, gpu=args.gpu)
    else:
        rnn = None

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings,
                                  vocab, semiring, args.bias_scale_param, args.gpu, rnn=rnn,
                                  pre_computed_patterns=None)

    if args.gpu:
        print("Cuda!")
        model.to_cuda(model)
        state_dict = torch.load(args.input_model)
    else:
        state_dict = torch.load(args.input_model, map_location=lambda storage, loc: storage)

    # Loading model
    model.load_state_dict(state_dict)

    interpret_documents(model, args.batch_size, dev_data, dev_text, args.ofile, args.max_doc_len)

    return 0
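# Worked example (added for clarity) of the pattern_specs parsing used above and in the
# other soft_patterns entry points below: a spec string like "5-50_4-50_2-10" reads as
# {pattern length: number of patterns}, sorted by length, and the padding width is derived
# from the longest pattern.
from collections import OrderedDict

example_patterns = "5-50_4-50_2-10"  # example value for args.patterns
example_specs = OrderedDict(
    sorted(([int(y) for y in x.split("-")] for x in example_patterns.split("_")), key=lambda t: t[0]))
assert example_specs == OrderedDict([(2, 10), (4, 50), (5, 50)])
assert max(list(example_specs.keys())) - 1 == 4  # num_padding_tokens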
def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
             slice2roi_path=None, full_batch=False, random=True, infinite=False, view='sax',
             data_prep_fun=data.transform_norm_rescale, **kwargs):
    if patient_ids:
        self.patient_paths = []
        for pid in patient_ids:
            self.patient_paths.append(data_path + '/%s/study/' % pid)
    else:
        self.patient_paths = glob.glob(data_path + '/*/study/')

    self.slice_paths = [sorted(glob.glob(p + '/%s_*.pkl' % view)) for p in self.patient_paths]
    self.slice_paths = list(itertools.chain(*self.slice_paths))
    self.slicepath2pid = {}
    for s in self.slice_paths:
        self.slicepath2pid[s] = int(utils.get_patient_id(s))

    self.nsamples = len(self.slice_paths)
    self.batch_size = batch_size
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.transformation_params = transform_params
    self.data_prep_fun = data_prep_fun
    self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
def main():
    n = None
    mlp_hidden_dim = 25
    num_mlp_layers = 2

    validation_data_file = "./soft_patterns/data/test.data"
    dev_vocab = vocab_from_text(validation_data_file)
    print("Dev vocab size:", len(dev_vocab))

    embedding_file = "./soft_patterns/glove.6B.50d.txt"
    vocab, embeddings, word_dim = read_embeddings(embedding_file, dev_vocab)

    seed = 100
    torch.manual_seed(seed)
    np.random.seed(seed)

    patterns = "5-50_4-50_3-50_2-50"
    pattern_specs = OrderedDict(
        sorted(([int(y) for y in x.split("-")] for x in patterns.split("_")), key=lambda t: t[0]))
    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(validation_data_file, vocab, num_padding_tokens=num_padding_tokens)
    validation_label_file = "./soft_patterns/data/test.labels"
    dev_labels = read_labels(validation_label_file)
    dev_data = list(zip(dev_input, dev_labels))

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    # Probability semiring: matches the ProbSemiring option used in the argparse-driven variants above.
    semiring = Semiring(zeros, ones, torch.add, torch.mul, sigmoid, identity)
    rnn = None

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings,
                                  vocab, semiring, 0.1, False, rnn, None, False, 0, False, None, None)

    input_model = "./soft_patterns/output/model_9.pth"
    state_dict = torch.load(input_model, map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict)

    test_acc = evaluate_accuracy(model, dev_data, 1, False)
    print("Test accuracy: {:>8,.3f}%".format(100 * test_acc))

    return 0
def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
             slice2roi_path=None, full_batch=False, random=True, infinite=True, min_slices=0,
             data_prep_fun=data.transform_norm_rescale, **kwargs):
    if patient_ids:
        patient_paths = []
        for pid in patient_ids:
            patient_paths.append(data_path + '/%s/study/' % pid)
    else:
        patient_paths = glob.glob(data_path + '/*/study/')

    self.pid2slice_paths = defaultdict(list)
    nslices = []
    for p in patient_paths:
        pid = int(utils.get_patient_id(p))
        spaths = sorted(glob.glob(p + '/sax_*.pkl'),
                        key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
        # consider only patients with more than min_slices SAX slices
        if len(spaths) > min_slices:
            self.pid2slice_paths[pid] = spaths
            nslices.append(len(spaths))

    # take the maximum number of slices across the kept patients
    self.nslices = int(np.max(nslices))

    self.patient_ids = list(self.pid2slice_paths.keys())
    self.nsamples = len(self.patient_ids)
    self.data_path = data_path
    self.id2labels = data.read_labels(labels_path) if labels_path else None
    self.batch_size = batch_size
    self.rng = np.random.RandomState(42)
    self.full_batch = full_batch
    self.random = random
    self.infinite = infinite
    self.transformation_params = transform_params
    self.data_prep_fun = data_prep_fun
    self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
def main(args):
    print(args)

    n = args.num_train_instances

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab:", len(dev_vocab))
    train_vocab = vocab_from_text(args.td)
    print("Train vocab:", len(train_vocab))
    dev_vocab |= train_vocab

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    np.random.shuffle(dev_data)

    train_input, _ = read_docs(args.td, vocab, num_padding_tokens=num_padding_tokens)
    train_labels = read_labels(args.tl)
    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    # truncate data (to debug faster)
    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)

    if n is not None:
        train_data = train_data[:n]
        dev_data = dev_data[:n]

    dropout = None if args.td is None else args.dropout

    # TODO: GRU doesn't work yet
    cell_type = LSTM  # GRU if args.gru else LSTM

    model = AveragingRnnClassifier(args.hidden_dim, args.mlp_hidden_dim, args.num_mlp_layers,
                                   num_classes, embeddings, cell_type=cell_type, gpu=args.gpu)

    if args.gpu:
        model.to_cuda(model)

    model_file_prefix = 'model'
    # Loading model
    if args.input_model is not None:
        state_dict = torch.load(args.input_model)
        model.load_state_dict(state_dict)
        model_file_prefix = 'model_retrained'

    model_save_dir = args.model_save_dir
    if model_save_dir is not None:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)

    print("Training with", model_file_prefix)

    train(train_data, dev_data, model, num_classes, model_save_dir, args.num_iterations,
          model_file_prefix, args.learning_rate, args.batch_size, args.scheduler,
          gpu=args.gpu, clip=args.clip, debug=args.debug, dropout=dropout,
          word_dropout=args.word_dropout, patience=args.patience)
def main(args):
    print(args)

    pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                       key=lambda t: t[0]))

    pre_computed_patterns = None

    if args.pre_computed_patterns is not None:
        pre_computed_patterns = read_patterns(args.pre_computed_patterns, pattern_specs)
        pattern_specs = OrderedDict(sorted(pattern_specs.items(), key=lambda t: t[0]))

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))
    train_vocab = vocab_from_text(args.td)
    print("Train vocab size:", len(train_vocab))
    dev_vocab |= train_vocab

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    np.random.shuffle(dev_data)

    num_iterations = args.num_iterations

    train_input, _ = read_docs(args.td, vocab, num_padding_tokens=num_padding_tokens)
    train_labels = read_labels(args.tl)
    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    # truncate data (to debug faster)
    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)

    if n is not None:
        train_data = train_data[:n]
        dev_data = dev_data[:n]

    if args.use_rnn:
        rnn = Rnn(word_dim, args.hidden_dim, cell_type=LSTM, gpu=args.gpu)
    else:
        rnn = None

    semiring = \
        MaxPlusSemiring if args.maxplus else (
            LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
        )

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings,
                                  vocab, semiring, args.bias_scale_param, args.gpu, rnn,
                                  pre_computed_patterns, args.no_sl, args.shared_sl, args.no_eps,
                                  args.eps_scale, args.self_loop_scale)

    if args.gpu:
        model.to_cuda(model)

    model_file_prefix = 'model'
    # Loading model
    if args.input_model is not None:
        state_dict = torch.load(args.input_model)
        model.load_state_dict(state_dict)
        model_file_prefix = 'model_retrained'

    model_save_dir = args.model_save_dir

    if model_save_dir is not None:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)

    print("Training with", model_file_prefix)

    train(train_data, dev_data, model, num_classes, model_save_dir, num_iterations, model_file_prefix,
          args.learning_rate, args.batch_size, args.scheduler, args.gpu, args.clip, args.max_doc_len,
          args.debug, args.dropout, args.word_dropout, args.patience)

    return 0
import random

import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20

# project functions
from data import setup, read_labels, read_samples
from preprocessor import purge_HTML, html2txt, count_links, avg_sentence_len

# Setup (dataDir is expected to be defined by the surrounding script/config)
setup(dataDir)
nTrain = 700
nTest = 500

# Build dataset
labels = read_labels()
pos = [k for k, v in labels.items() if v]
neg = [k for k, v in labels.items() if not v]
random.shuffle(pos)
random.shuffle(neg)
balanced_labels = {k: True for k in pos[:nTrain]}
balanced_labels.update({k: False for k in neg[:nTest]})

trainId, testId = train_test_split(np.array(list(balanced_labels.keys())),
                                   train_size=nTrain, test_size=nTest, random_state=10)

trainRaw = read_samples(trainId)
testRaw = read_samples(testId)
trainY = [labels[id] for id in trainId]
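# Hedged continuation sketch (not part of the original excerpt): turn the raw samples into
# simple numeric features with the helpers imported above and fit a baseline classifier.
# The feature helpers' exact signatures are assumptions here: each is taken to accept one
# raw document string and return a number.
from sklearn.linear_model import LogisticRegression

def doc_features(raw):
    text = html2txt(purge_HTML(raw))  # assumed: strip markup, then extract plain text
    return [count_links(raw), avg_sentence_len(text)]

trainX = np.array([doc_features(doc) for doc in trainRaw])
testX = np.array([doc_features(doc) for doc in testRaw])
testY = [labels[id] for id in testId]

clf = LogisticRegression(max_iter=1000).fit(trainX, trainY)
print(confusion_matrix(testY, clf.predict(testX)))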
def main(args):
    with open(args.work_dir + "/train.data", encoding="ISO-8859-1") as ifh:
        wordcount = Counter(ifh.read().split())

    total = np.sum(list(wordcount.values()))  # total token count (avoids shadowing built-in sum)
    wordcount = {k: float(wordcount[k]) / int(total) for k in wordcount.keys()}

    words = {k: Word(k, wordcount[k], args.fh, args.fc) for k in wordcount.keys()}

    patterns = dict()

    with open(args.work_dir + "/train.data", encoding='ISO-8859-1') as input_file:
        train_docs = [line.rstrip().split() for line in input_file]
    with open(args.work_dir + "/dev.data", encoding='ISO-8859-1') as input_file:
        dev_docs = [line.rstrip().split() for line in input_file]
    with open(args.work_dir + "/test.data", encoding='ISO-8859-1') as input_file:
        test_docs = [line.rstrip().split() for line in input_file]

    train_labels = read_labels(args.work_dir + "/train.labels")
    dev_labels = read_labels(args.work_dir + "/dev.labels")
    test_labels = read_labels(args.work_dir + "/test.labels")

    # First pass: collect pattern counts from the training documents.
    for doc in train_docs:
        add_patterns(doc, words, patterns, args.max_pattern_len, args.use_CW_tokens,
                     args.min_pattern_length)

    # Frequency threshold: relative if < 1, absolute otherwise.
    if args.min_pattern_frequency < 1:
        thr = args.min_pattern_frequency * len(train_docs)
    else:
        thr = args.min_pattern_frequency

    print("Read", len(patterns), "patterns")
    patterns = {k: patterns[k] for k in patterns.keys() if patterns[k] >= thr}

    s = 0
    for p in patterns.keys():
        p.set_freq(patterns[p])
        s += patterns[p]

    pattern_keys = list(patterns.keys())

    print("Read", len(patterns), "patterns", s)

    trie = build_trie(pattern_keys)

    # Second pass: fill sparse document-by-pattern feature matrices.
    train_features = lil_matrix((len(train_docs), len(patterns)), dtype=np.int8)
    dev_features = lil_matrix((len(dev_docs), len(patterns)))
    test_features = lil_matrix((len(test_docs), len(patterns)))

    for (i, doc) in enumerate(train_docs):
        add_patterns(doc, words, patterns, args.max_pattern_len, args.use_CW_tokens,
                     args.min_pattern_length, trie, train_features, i)
    for (i, doc) in enumerate(dev_docs):
        add_patterns(doc, words, patterns, args.max_pattern_len, args.use_CW_tokens,
                     args.min_pattern_length, trie, dev_features, i)
    for (i, doc) in enumerate(test_docs):
        add_patterns(doc, words, patterns, args.max_pattern_len, args.use_CW_tokens,
                     args.min_pattern_length, trie, test_features, i)

    clf = train(train_features, train_labels, dev_features, dev_labels)

    gen_salient_patterns(train_features, clf, pattern_keys, args.n_salient_features)

    if args.model_ofile is not None:
        print("Saving best model to", args.model_ofile)
        pickle.dump(clf, open(args.model_ofile, 'wb'))

    test_predicted_labels = clf.predict(test_features)
    test_acc = evaluate(test_predicted_labels, test_labels)

    print("Test accuracy: {}".format(test_acc))

    return 0
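# Why lil_matrix above: a hedged mini-demo (separate from the pipeline) showing that scipy's
# list-of-lists sparse format supports cheap incremental writes, which is what add_patterns
# needs when filling one document row at a time.
import numpy as np
from scipy.sparse import lil_matrix

demo = lil_matrix((2, 4), dtype=np.int8)  # 2 docs x 4 patterns
demo[0, 1] = 1                            # pattern 1 seen once in doc 0
demo[1, 3] = 2                            # pattern 3 seen twice in doc 1
print(demo.toarray())                     # [[0 1 0 0] [0 0 0 2]]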
def main(args):
    print(args)

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    if args.dan or args.bilstm:
        num_padding_tokens = 1
    elif args.cnn:
        num_padding_tokens = args.window_size - 1
    else:
        pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                           key=lambda t: t[0]))
        num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))

    if n is not None:
        dev_data = dev_data[:n]

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    if args.dan:
        model = DanClassifier(mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, args.gpu)
    elif args.bilstm:
        cell_type = LSTM
        model = AveragingRnnClassifier(args.hidden_dim, mlp_hidden_dim, num_mlp_layers, num_classes,
                                       embeddings, cell_type=cell_type, gpu=args.gpu)
    elif args.cnn:
        model = PooledCnnClassifier(args.window_size, args.num_cnn_layers, args.cnn_hidden_dim,
                                    num_mlp_layers, mlp_hidden_dim, num_classes, embeddings,
                                    pooling=max_pool_seq, gpu=args.gpu)
    else:
        semiring = \
            MaxPlusSemiring if args.maxplus else (
                LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
            )

        if args.use_rnn:
            rnn = Rnn(word_dim, args.hidden_dim, cell_type=LSTM, gpu=args.gpu)
        else:
            rnn = None

        model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes,
                                      embeddings, vocab, semiring, args.bias_scale_param, args.gpu,
                                      rnn, None, args.no_sl, args.shared_sl, args.no_eps,
                                      args.eps_scale, args.self_loop_scale)

    if args.gpu:
        state_dict = torch.load(args.input_model)
    else:
        state_dict = torch.load(args.input_model, map_location=lambda storage, loc: storage)

    model.load_state_dict(state_dict)

    if args.gpu:
        model.to_cuda(model)

    test_acc = evaluate_accuracy(model, dev_data, args.batch_size, args.gpu)

    print("Test accuracy: {:>8,.3f}%".format(100 * test_acc))

    return 0
# This is the network definition proposed in the paper
network = TextureNet(n_classes=2)

# Loss function - Softmax function is included
cross_entropy = nn.CrossEntropyLoss()

# Optimizer to control step size in gradient descent
optimizer = torch.optim.Adam(network.parameters())

# Transfer model to gpu
if USE_GPU and torch.cuda.is_available():
    network = network.cuda()

# Load the data cube and labels
data, data_info = read_segy(join(ROOT_PATH, INPUT_VOXEL))
train_class_imgs, train_coordinates = read_labels(join(ROOT_PATH, TRAIN_MASK), data_info)
val_class_imgs, _ = read_labels(join(ROOT_PATH, VAL_MASK), data_info)

# Plot training/validation data with labels
if LOG_TENSORBOARD:
    for class_img in train_class_imgs + val_class_imgs:
        logger.log_images(
            class_img[1] + "_" + str(class_img[2]),
            get_slice(data, data_info, class_img[1], class_img[2]),
            cm="gray",
        )
        logger.log_images(
            class_img[1] + "_" + str(class_img[2]) + "_true_class",
            class_img[0],
        )

# Training loop
for i in range(5000):
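    # Hedged sketch of one training step (the original loop body is not shown in this excerpt).
    # `get_random_batch` is a hypothetical helper standing in for whatever batch sampler the
    # project actually uses; the batch size of 32 is likewise just an assumed value.
    batch, batch_labels = get_random_batch(data, train_coordinates, 32)
    batch = torch.as_tensor(batch, dtype=torch.float32)
    batch_labels = torch.as_tensor(batch_labels, dtype=torch.long)
    if USE_GPU and torch.cuda.is_available():
        batch, batch_labels = batch.cuda(), batch_labels.cuda()

    optimizer.zero_grad()
    output = network(batch)                     # forward pass
    loss = cross_entropy(output, batch_labels)  # CrossEntropyLoss applies softmax internally
    loss.backward()                             # backpropagation
    optimizer.step()                            # parameter update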
def main():
    patterns = "5-50_4-50_3-50_2-50"
    pattern_specs = OrderedDict(
        sorted(([int(y) for y in x.split("-")] for x in patterns.split("_")), key=lambda t: t[0]))
    pre_computed_patterns = None

    n = None
    mlp_hidden_dim = 25
    num_mlp_layers = 2

    seed = 100
    # Sets the seed for generating random numbers.
    torch.manual_seed(seed)
    # Seeds NumPy's global random state.
    np.random.seed(seed)

    validation_data_file = "./soft_patterns/data/dev.data"
    dev_vocab = vocab_from_text(validation_data_file)
    print("Dev vocab size:", len(dev_vocab))

    train_data_file = "./soft_patterns/data/train.data"
    train_vocab = vocab_from_text(train_data_file)
    print("Train vocab size:", len(train_vocab))
    dev_vocab |= train_vocab

    embedding_file = './soft_patterns/glove.6B.50d.txt'
    vocab, embeddings, word_dim = read_embeddings(embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(validation_data_file, vocab, num_padding_tokens=num_padding_tokens)
    validation_label_file = "./soft_patterns/data/dev.labels"
    dev_labels = read_labels(validation_label_file)
    dev_data = list(zip(dev_input, dev_labels))
    np.random.shuffle(dev_data)

    num_iterations = 10

    train_input, _ = read_docs(train_data_file, vocab, num_padding_tokens=num_padding_tokens)
    train_labels_file = "./soft_patterns/data/train.labels"
    train_labels = read_labels(train_labels_file)
    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))
    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)
    print("num_classes:", num_classes)

    rnn = None
    # Probability semiring: matches the ProbSemiring option used in the argparse-driven variants above.
    semiring = Semiring(zeros, ones, torch.add, torch.mul, sigmoid, identity)

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings,
                                  vocab, semiring, 0.1, False, rnn, pre_computed_patterns,
                                  False, 0, False, None, None)

    model_file_prefix = "model"
    model_save_dir = "./soft_patterns/output/"

    print("Training with", model_file_prefix)

    # Positional arguments follow the same order as the argparse-driven train(...) call above:
    # learning_rate=0.001, batch_size=1, scheduler=False, gpu=False, clip=None, max_doc_len=-1,
    # debug=0, dropout=0, word_dropout=0, patience=30.
    train(train_data, dev_data, model, num_classes, model_save_dir, num_iterations, model_file_prefix,
          0.001, 1, False, False, None, -1, 0, 0, 0, 30)

    return 0