Example #1
    def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
                 slice2roi_path=None, full_batch=False, random=True, infinite=True, min_slices=5, **kwargs):

        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2sax_slice_paths = defaultdict(list)
        self.pid2ch2_path, self.pid2ch4_path = {}, {}
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            spaths = sorted(glob.glob(p + '/sax_*.pkl'), key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            if len(spaths) > min_slices:
                self.pid2sax_slice_paths[pid] = spaths

                ch2_path = glob.glob(p + '/2ch_*.pkl')
                self.pid2ch2_path[pid] = ch2_path[0] if ch2_path else None
                ch4_path = glob.glob(p + '/4ch_*.pkl')
                self.pid2ch4_path[pid] = ch4_path[0] if ch4_path else None

        self.patient_ids = list(self.pid2sax_slice_paths.keys())
        self.nsamples = len(self.patient_ids)

        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
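
The numeric sort key in the constructor above matters because a plain lexicographic sort would place sax_10 before sax_2. A small self-contained illustration of that ordering (the paths are made up for the example):

# Standalone illustration of the slice ordering used above: sax_*.pkl paths are
# sorted by their numeric index, not lexicographically (example paths are made up).
import re

paths = ['/d/1/study/sax_10.pkl', '/d/1/study/sax_2.pkl', '/d/1/study/sax_1.pkl']
paths = sorted(paths, key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
print(paths)  # ['/d/1/study/sax_1.pkl', '/d/1/study/sax_2.pkl', '/d/1/study/sax_10.pkl']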
Example #2
def main(args):
    print(args)

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                       key=lambda t: t[0]))

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    if n is not None:
        dev_data = dev_data[:n]

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    semiring = \
        MaxPlusSemiring if args.maxplus else (
            LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
        )

    if args.use_rnn:
        rnn = Rnn(word_dim,
                  args.hidden_dim,
                  cell_type=LSTM,
                  gpu=args.gpu)
    else:
        rnn = None

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                  semiring, args.bias_scale_param, args.gpu, rnn=rnn, pre_computed_patterns=None)

    if args.gpu:
        print("Cuda!")
        model.to_cuda(model)
        state_dict = torch.load(args.input_model)
    else:
        state_dict = torch.load(args.input_model, map_location=lambda storage, loc: storage)

    # Loading model
    model.load_state_dict(state_dict)

    interpret_documents(model, args.batch_size, dev_data, dev_text, args.ofile, args.max_doc_len)

    return 0
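
Because main(args) above only reads attributes from args, it can be driven without the project's command-line parser by building an argparse.Namespace by hand. A minimal sketch: the attribute names are exactly those accessed in the example, while the concrete paths and values are placeholders borrowed from the other examples on this page, not the script's real defaults.

from argparse import Namespace

# Placeholder values; only the attribute names are taken from the example above.
args = Namespace(seed=100,
                 patterns="5-50_4-50_3-50_2-50",
                 num_train_instances=None,
                 mlp_hidden_dim=25,
                 num_mlp_layers=2,
                 vd="./soft_patterns/data/dev.data",
                 vl="./soft_patterns/data/dev.labels",
                 embedding_file="./soft_patterns/glove.6B.50d.txt",
                 maxplus=False,
                 maxtimes=False,
                 use_rnn=False,
                 hidden_dim=100,
                 bias_scale_param=0.1,
                 gpu=False,
                 input_model="./soft_patterns/output/model_9.pth",
                 batch_size=1,
                 ofile="./soft_patterns/output/interpreted.txt",
                 max_doc_len=-1)
main(args)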
Example #3
    def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
                 slice2roi_path=None, full_batch=False, random=True, infinite=False, view='sax',
                 data_prep_fun=data.transform_norm_rescale, **kwargs):

        if patient_ids:
            self.patient_paths = []
            for pid in patient_ids:
                self.patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            self.patient_paths = glob.glob(data_path + '/*/study/')

        self.slice_paths = [sorted(glob.glob(p + '/%s_*.pkl' % view)) for p in self.patient_paths]
        self.slice_paths = list(itertools.chain(*self.slice_paths))
        self.slicepath2pid = {}
        for s in self.slice_paths:
            self.slicepath2pid[s] = int(utils.get_patient_id(s))

        self.nsamples = len(self.slice_paths)
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
Example #4
def main():
    n = None
    mlp_hidden_dim = 25
    num_mlp_layers = 2

    validation_data_file = "./soft_patterns/data/test.data"
    dev_vocab = vocab_from_text(validation_data_file)
    print("Dev vocab size:", len(dev_vocab))

    embedding_file = "./soft_patterns/glove.6B.50d.txt"
    vocab, embeddings, word_dim = read_embeddings(embedding_file, dev_vocab)

    seed = 100
    torch.manual_seed(seed)
    np.random.seed(seed)

    patterns = "5-50_4-50_3-50_2-50"
    pattern_specs = OrderedDict(
        sorted(([int(y) for y in x.split("-")] for x in patterns.split("_")),
               key=lambda t: t[0]))
    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(validation_data_file,
                             vocab,
                             num_padding_tokens=num_padding_tokens)
    validation_label_file = "./soft_patterns/data/test.labels"
    dev_labels = read_labels(validation_label_file)
    dev_data = list(zip(dev_input, dev_labels))

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    semiring = Semiring(zeros, ones, torch.add, torch.mul, sigmoid, identity)

    rnn = None

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim,
                                  num_mlp_layers, num_classes, embeddings,
                                  vocab, semiring, 0.1, False, rnn, None,
                                  False, 0, False, None, None)

    input_model = "./soft_patterns/output/model_9.pth"
    state_dict = torch.load(input_model,
                            map_location=lambda storage, loc: storage)

    model.load_state_dict(state_dict)

    test_acc = evaluate_accuracy(model, dev_data, 1, False)

    print("Test accuracy: {:>8,.3f}%".format(100 * test_acc))

    return 0
Example #5
    def __init__(self,
                 data_path,
                 batch_size,
                 transform_params,
                 patient_ids=None,
                 labels_path=None,
                 slice2roi_path=None,
                 full_batch=False,
                 random=True,
                 infinite=True,
                 min_slices=0,
                 data_prep_fun=data.transform_norm_rescale,
                 **kwargs):

        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2slice_paths = defaultdict(list)
        nslices = []
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            spaths = sorted(
                glob.glob(p + '/sax_*.pkl'),
                key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            # keep only patients with more than min_slices slices
            if len(spaths) > min_slices:
                self.pid2slice_paths[pid] = spaths
                nslices.append(len(spaths))

        # take max number of slices
        self.nslices = int(np.max(nslices))

        self.patient_ids = list(self.pid2slice_paths.keys())
        self.nsamples = len(self.patient_ids)

        self.data_path = data_path
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(
            slice2roi_path) if slice2roi_path else None
Example #6
    def __init__(self,
                 data_path,
                 batch_size,
                 transform_params,
                 patient_ids=None,
                 labels_path=None,
                 slice2roi_path=None,
                 full_batch=False,
                 random=True,
                 infinite=True,
                 min_slices=5,
                 **kwargs):

        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2sax_slice_paths = defaultdict(list)
        self.pid2ch2_path, self.pid2ch4_path = {}, {}
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            spaths = sorted(
                glob.glob(p + '/sax_*.pkl'),
                key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            if len(spaths) > min_slices:
                self.pid2sax_slice_paths[pid] = spaths

                ch2_path = glob.glob(p + '/2ch_*.pkl')
                self.pid2ch2_path[pid] = ch2_path[0] if ch2_path else None
                ch4_path = glob.glob(p + '/4ch_*.pkl')
                self.pid2ch4_path[pid] = ch4_path[0] if ch4_path else None

        self.patient_ids = list(self.pid2sax_slice_paths.keys())
        self.nsamples = len(self.patient_ids)

        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.slice2roi = utils.load_pkl(
            slice2roi_path) if slice2roi_path else None
Example #7
    def __init__(self,
                 data_path,
                 batch_size,
                 transform_params,
                 patient_ids=None,
                 labels_path=None,
                 slice2roi_path=None,
                 full_batch=False,
                 random=True,
                 infinite=False,
                 view='sax',
                 data_prep_fun=data.transform_norm_rescale,
                 **kwargs):

        if patient_ids:
            self.patient_paths = []
            for pid in patient_ids:
                self.patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            self.patient_paths = glob.glob(data_path + '/*/study/')

        self.slice_paths = [
            sorted(glob.glob(p + '/%s_*.pkl' % view))
            for p in self.patient_paths
        ]
        self.slice_paths = list(itertools.chain(*self.slice_paths))
        self.slicepath2pid = {}
        for s in self.slice_paths:
            self.slicepath2pid[s] = int(utils.get_patient_id(s))

        self.nsamples = len(self.slice_paths)
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(
            slice2roi_path) if slice2roi_path else None
Example #8
    def __init__(self, data_path, batch_size, transform_params, patient_ids=None, labels_path=None,
                 slice2roi_path=None, full_batch=False, random=True, infinite=True, min_slices=0,
                 data_prep_fun=data.transform_norm_rescale,
                 **kwargs):

        if patient_ids:
            patient_paths = []
            for pid in patient_ids:
                patient_paths.append(data_path + '/%s/study/' % pid)
        else:
            patient_paths = glob.glob(data_path + '/*/study/')

        self.pid2slice_paths = defaultdict(list)
        nslices = []
        for p in patient_paths:
            pid = int(utils.get_patient_id(p))
            spaths = sorted(glob.glob(p + '/sax_*.pkl'), key=lambda x: int(re.search(r'/sax_(\d+)\.pkl$', x).group(1)))
            # keep only patients with more than min_slices slices
            if len(spaths) > min_slices:
                self.pid2slice_paths[pid] = spaths
                nslices.append(len(spaths))

        # take max number of slices
        self.nslices = int(np.max(nslices))

        self.patient_ids = list(self.pid2slice_paths.keys())
        self.nsamples = len(self.patient_ids)

        self.data_path = data_path
        self.id2labels = data.read_labels(labels_path) if labels_path else None
        self.batch_size = batch_size
        self.rng = np.random.RandomState(42)
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.transformation_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.slice2roi = utils.load_pkl(slice2roi_path) if slice2roi_path else None
Example #9
def main(args):
    print(args)
    n = args.num_train_instances
    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab:", len(dev_vocab))
    train_vocab = vocab_from_text(args.td)
    print("Train vocab:", len(train_vocab))
    dev_vocab |= train_vocab

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = 1

    dev_input, dev_text = read_docs(args.vd,
                                    vocab,
                                    num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))

    np.random.shuffle(dev_data)
    train_input, _ = read_docs(args.td,
                               vocab,
                               num_padding_tokens=num_padding_tokens)
    train_labels = read_labels(args.tl)

    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    # truncate data (to debug faster)
    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)

    if n is not None:
        train_data = train_data[:n]
        dev_data = dev_data[:n]

    dropout = None if args.td is None else args.dropout

    # TODO: GRU doesn't work yet
    cell_type = LSTM  # GRU if args.gru else LSTM

    model = AveragingRnnClassifier(args.hidden_dim,
                                   args.mlp_hidden_dim,
                                   args.num_mlp_layers,
                                   num_classes,
                                   embeddings,
                                   cell_type=cell_type,
                                   gpu=args.gpu)

    if args.gpu:
        model.to_cuda(model)

    model_file_prefix = 'model'
    # Loading model
    if args.input_model is not None:
        state_dict = torch.load(args.input_model)
        model.load_state_dict(state_dict)
        model_file_prefix = 'model_retrained'

    model_save_dir = args.model_save_dir

    if model_save_dir is not None:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)

    print("Training with", model_file_prefix)
    train(train_data,
          dev_data,
          model,
          num_classes,
          model_save_dir,
          args.num_iterations,
          model_file_prefix,
          args.learning_rate,
          args.batch_size,
          args.scheduler,
          gpu=args.gpu,
          clip=args.clip,
          debug=args.debug,
          dropout=dropout,
          word_dropout=args.word_dropout,
          patience=args.patience)
Example #10
def main(args):
    print(args)

    pattern_specs = OrderedDict(
        sorted(
            ([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
            key=lambda t: t[0]))

    pre_computed_patterns = None

    if args.pre_computed_patterns is not None:
        pre_computed_patterns = read_patterns(args.pre_computed_patterns,
                                              pattern_specs)
        pattern_specs = OrderedDict(
            sorted(pattern_specs.items(), key=lambda t: t[0]))

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))
    train_vocab = vocab_from_text(args.td)
    print("Train vocab size:", len(train_vocab))
    dev_vocab |= train_vocab

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(args.vd,
                             vocab,
                             num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))

    np.random.shuffle(dev_data)
    num_iterations = args.num_iterations

    train_input, _ = read_docs(args.td,
                               vocab,
                               num_padding_tokens=num_padding_tokens)
    train_labels = read_labels(args.tl)

    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    # truncate data (to debug faster)
    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)

    if n is not None:
        train_data = train_data[:n]
        dev_data = dev_data[:n]

    if args.use_rnn:
        rnn = Rnn(word_dim, args.hidden_dim, cell_type=LSTM, gpu=args.gpu)
    else:
        rnn = None

    semiring = \
        MaxPlusSemiring if args.maxplus else (
            LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
        )

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim,
                                  num_mlp_layers, num_classes, embeddings,
                                  vocab, semiring, args.bias_scale_param,
                                  args.gpu, rnn, pre_computed_patterns,
                                  args.no_sl, args.shared_sl, args.no_eps,
                                  args.eps_scale, args.self_loop_scale)

    if args.gpu:
        model.to_cuda(model)

    model_file_prefix = 'model'
    # Loading model
    if args.input_model is not None:
        state_dict = torch.load(args.input_model)
        model.load_state_dict(state_dict)
        model_file_prefix = 'model_retrained'

    model_save_dir = args.model_save_dir

    if model_save_dir is not None:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)

    print("Training with", model_file_prefix)

    train(train_data, dev_data, model, num_classes, model_save_dir,
          num_iterations, model_file_prefix, args.learning_rate,
          args.batch_size, args.scheduler, args.gpu, args.clip,
          args.max_doc_len, args.debug, args.dropout, args.word_dropout,
          args.patience)

    return 0
Example #11
import random

import numpy as np
from sklearn import cross_validation  # removed in scikit-learn 0.20+; see sklearn.model_selection
from sklearn.metrics import confusion_matrix

# project functions

from data import setup, read_labels, read_samples
from preprocessor import purge_HTML, html2txt, count_links, avg_sentence_len

# Setup

setup(dataDir)

nTrain = 700
nTest  = 500

# Build dataset
labels = read_labels()
pos = [k for k, v in labels.items() if v]
neg = [k for k, v in labels.items() if not v]
random.shuffle(pos)
random.shuffle(neg)
balanced_labels = {k: True for k in pos[:nTrain]}
balanced_labels.update({k: False for k in neg[:nTest]})

trainId, testId = cross_validation.train_test_split(
    np.array(list(balanced_labels.keys())), 
    train_size=nTrain, 
    test_size=nTest, 
    random_state=10)
trainRaw  = read_samples(trainId)
testRaw   = read_samples(testId)
trainY    = [labels[id] for id in trainId]
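
sklearn.cross_validation was removed in scikit-learn 0.20, so on current versions the split above needs sklearn.model_selection instead. A minimal equivalent sketch (not part of the original example):

from sklearn.model_selection import train_test_split

trainId, testId = train_test_split(np.array(list(balanced_labels.keys())),
                                   train_size=nTrain,
                                   test_size=nTest,
                                   random_state=10)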
Example #12
def main(args):
    with open(args.work_dir + "/train.data", encoding="ISO-8859-1") as ifh:
        wordcount = Counter(ifh.read().split())

    total = np.sum(list(wordcount.values()))  # avoid shadowing the built-in sum()

    # print(total)

    wordcount = {k: float(wordcount[k]) / int(total) for k in wordcount.keys()}

    words = {
        k: Word(k, wordcount[k], args.fh, args.fc)
        for k in wordcount.keys()
    }

    patterns = dict()

    with open(args.work_dir + "/train.data",
              encoding='ISO-8859-1') as input_file:
        train_docs = [line.rstrip().split() for line in input_file]

    with open(args.work_dir + "/dev.data",
              encoding='ISO-8859-1') as input_file:
        dev_docs = [line.rstrip().split() for line in input_file]

    with open(args.work_dir + "/test.data",
              encoding='ISO-8859-1') as input_file:
        test_docs = [line.rstrip().split() for line in input_file]

    train_labels = read_labels(args.work_dir + "/train.labels")
    dev_labels = read_labels(args.work_dir + "/dev.labels")
    test_labels = read_labels(args.work_dir + "/test.labels")

    for doc in train_docs:
        add_patterns(doc, words, patterns, args.max_pattern_len,
                     args.use_CW_tokens, args.min_pattern_length)
        # sys.exit(-1)

    if args.min_pattern_frequency < 1:
        thr = args.min_pattern_frequency * len(train_docs)
    else:
        thr = args.min_pattern_frequency

    print("Read", len(patterns), "patterns")
    patterns = {k: patterns[k] for k in patterns.keys() if patterns[k] >= thr}

    s = 0
    for p in patterns.keys():
        p.set_freq(patterns[p])
        s += patterns[p]

    pattern_keys = list(patterns.keys())

    print("Read", len(patterns), "patterns", s)

    trie = build_trie(pattern_keys)

    # print(trie)

    # sys.exit(-1)

    # print([x.__str__() for x in patterns if x.size() >= 3])

    train_features = lil_matrix((len(train_docs), len(patterns)),
                                dtype=np.int8)
    dev_features = lil_matrix((len(dev_docs), len(patterns)))
    test_features = lil_matrix((len(test_docs), len(patterns)))

    for (i, doc) in enumerate(train_docs):
        add_patterns(doc, words, patterns, args.max_pattern_len,
                     args.use_CW_tokens, args.min_pattern_length, trie,
                     train_features, i)

    for (i, doc) in enumerate(dev_docs):
        add_patterns(doc, words, patterns, args.max_pattern_len,
                     args.use_CW_tokens, args.min_pattern_length, trie,
                     dev_features, i)

    for (i, doc) in enumerate(test_docs):
        add_patterns(doc, words, patterns, args.max_pattern_len,
                     args.use_CW_tokens, args.min_pattern_length, trie,
                     test_features, i)

    # print([x.__str__() for x in patterns.keys()])
    # print("df",dev_features)
    # print("tf", train_features)

    clf = train(train_features, train_labels, dev_features, dev_labels)

    gen_salient_patterns(train_features, clf, pattern_keys,
                         args.n_salient_features)

    if args.model_ofile is not None:
        print("Saving best model to", args.model_ofile)
        pickle.dump(clf, open(args.model_ofile, 'wb'))

    test_predicted_labels = clf.predict(test_features)
    test_acc = evaluate(test_predicted_labels, test_labels)

    print("Test accuracy: {}".format(test_acc))

    return 0
Example #13
def main(args):
    print(args)

    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers

    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))

    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)

    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

    if args.dan or args.bilstm:
        num_padding_tokens = 1
    elif args.cnn:
        num_padding_tokens = args.window_size - 1
    else:
        pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                           key=lambda t: t[0]))
        num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    if n is not None:
        dev_data = dev_data[:n]

    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)

    if args.dan:
        model = DanClassifier(mlp_hidden_dim,
                              num_mlp_layers,
                              num_classes,
                              embeddings,
                              args.gpu)
    elif args.bilstm:
        cell_type = LSTM

        model = AveragingRnnClassifier(args.hidden_dim,
                                       mlp_hidden_dim,
                                       num_mlp_layers,
                                       num_classes,
                                       embeddings,
                                       cell_type=cell_type,
                                       gpu=args.gpu)
    elif args.cnn:
        model = PooledCnnClassifier(args.window_size,
                                    args.num_cnn_layers,
                                    args.cnn_hidden_dim,
                                    num_mlp_layers,
                                    mlp_hidden_dim,
                                    num_classes,
                                    embeddings,
                                    pooling=max_pool_seq,
                                    gpu=args.gpu)
    else:
        semiring = \
            MaxPlusSemiring if args.maxplus else (
                LogSpaceMaxTimesSemiring if args.maxtimes else ProbSemiring
            )

        if args.use_rnn:
            rnn = Rnn(word_dim,
                      args.hidden_dim,
                      cell_type=LSTM,
                      gpu=args.gpu)
        else:
            rnn = None

        model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                      semiring, args.bias_scale_param, args.gpu, rnn, None, args.no_sl, args.shared_sl,
                                      args.no_eps, args.eps_scale, args.self_loop_scale)

    if args.gpu:
        state_dict = torch.load(args.input_model)
    else:
        state_dict = torch.load(args.input_model, map_location=lambda storage, loc: storage)

    model.load_state_dict(state_dict)

    if args.gpu:
        model.to_cuda(model)

    test_acc = evaluate_accuracy(model, dev_data, args.batch_size, args.gpu)

    print("Test accuracy: {:>8,.3f}%".format(100*test_acc))

    return 0
Example #14
# This is the network definition proposed in the paper
network = TextureNet(n_classes=2)

# Loss function - Softmax function is included
cross_entropy = nn.CrossEntropyLoss()

# Optimizer to control step size in gradient descent
optimizer = torch.optim.Adam(network.parameters())

# Transfer model to gpu
if USE_GPU and torch.cuda.is_available():
    network = network.cuda()

# Load the data cube and labels
data, data_info = read_segy(join(ROOT_PATH, INPUT_VOXEL))
train_class_imgs, train_coordinates = read_labels(join(ROOT_PATH, TRAIN_MASK), data_info)
val_class_imgs, _ = read_labels(join(ROOT_PATH, VAL_MASK), data_info)

# Plot training/validation data with labels
if LOG_TENSORBOARD:
    for class_img in train_class_imgs + val_class_imgs:
        logger.log_images(
            class_img[1] + "_" + str(class_img[2]), get_slice(data, data_info, class_img[1], class_img[2]), cm="gray",
        )
        logger.log_images(
            class_img[1] + "_" + str(class_img[2]) + "_true_class", class_img[0],
        )

# Training loop
for i in range(5000):
Example #15
import random

import numpy as np
from sklearn import cross_validation  # removed in scikit-learn 0.20+; see sklearn.model_selection
from sklearn.metrics import confusion_matrix

# project functions

from data import setup, read_labels, read_samples
from preprocessor import purge_HTML, html2txt, count_links, avg_sentence_len

# Setup

setup(dataDir)

nTrain = 700
nTest = 500

# Build dataset
labels = read_labels()
pos = [k for k, v in labels.items() if v]
neg = [k for k, v in labels.items() if not v]
random.shuffle(pos)
random.shuffle(neg)
balanced_labels = {k: True for k in pos[:nTrain]}
balanced_labels.update({k: False for k in neg[:nTest]})

trainId, testId = cross_validation.train_test_split(np.array(
    list(balanced_labels.keys())),
                                                    train_size=nTrain,
                                                    test_size=nTest,
                                                    random_state=10)
trainRaw = read_samples(trainId)
testRaw = read_samples(testId)
trainY = [labels[id] for id in trainId]
Example #16
def main():
    patterns = "5-50_4-50_3-50_2-50"
    pattern_specs = OrderedDict(
        sorted(([int(y) for y in x.split("-")] for x in patterns.split("_")),
               key=lambda t: t[0]))

    pre_computed_patterns = None
    n = None
    mlp_hidden_dim = 25
    num_mlp_layers = 2

    seed = 100
    # Set the seed for PyTorch's random number generators.
    torch.manual_seed(seed)
    # Seed NumPy's global RandomState as well.
    np.random.seed(seed)

    validation_data_file = "./soft_patterns/data/dev.data"
    dev_vocab = vocab_from_text(validation_data_file)
    # print(dev_vocab.index)
    print("Dev vocab size:", len(dev_vocab))
    # exit(0)
    train_data_file = "./soft_patterns/data/train.data"
    train_vocab = vocab_from_text(train_data_file)
    print("Train vocab size:", len(train_vocab))
    dev_vocab |= train_vocab

    embedding_file = './soft_patterns/glove.6B.50d.txt'
    vocab, embeddings, word_dim = read_embeddings(embedding_file, dev_vocab)

    num_padding_tokens = max(list(pattern_specs.keys())) - 1

    dev_input, _ = read_docs(validation_data_file, vocab, num_padding_tokens=num_padding_tokens)
    validation_label_file = "./soft_patterns/data/dev.labels"
    dev_labels = read_labels(validation_label_file)
    dev_data = list(zip(dev_input, dev_labels))

    np.random.shuffle(dev_data)
    num_iterations = 10

    train_input, _ = read_docs(train_data_file, vocab, num_padding_tokens=num_padding_tokens)
    train_labels_file = "./soft_patterns/data/train.labels"
    train_labels = read_labels(train_labels_file)

    print("training instances:", len(train_input))

    num_classes = len(set(train_labels))

    train_data = list(zip(train_input, train_labels))
    np.random.shuffle(train_data)

    print("num_classes:", num_classes)
    rnn = None
    semiring = Semiring(zeros, ones, torch.add, torch.mul, sigmoid, identity)

    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers,
                                  num_classes, embeddings, vocab, semiring, 0.1,
                                  False, rnn, pre_computed_patterns, False, 0,
                                  False, None, None)

    model_file_prefix = "model"
    model_save_dir = "./soft_patterns/output/"

    print("Training with", model_file_prefix)

    train(train_data, dev_data, model, num_classes, model_save_dir,
          num_iterations, model_file_prefix, 0.001, 1, False, False, None,
          -1, 0, 0, 0, 30)

    return 0
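
The positional train(...) call above is hard to read. Rewritten with keyword arguments it would look roughly as below; the parameter names are inferred from the keyword-style train(...) calls in Examples #9 and #10, so treat the mapping as an assumption about the signature rather than a documented fact.

# Same call as above, with positionals mapped to parameter names taken from
# Examples #9 and #10 (an inferred signature, not a documented one).
train(train_data, dev_data, model, num_classes, model_save_dir,
      num_iterations, model_file_prefix,
      learning_rate=0.001, batch_size=1, scheduler=False, gpu=False,
      clip=None, max_doc_len=-1, debug=0, dropout=0, word_dropout=0,
      patience=30)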