Example #1
def filter3():
    dir3 = "%s/FSUBTEST/3" % params.JADER_OUT
    utils.ensure_dir(dir3)

    dDrug1Se = dict()
    dDrug2Se = dict()
    fin1 = open("%s/FSUBTEST/1/1.txt" % params.JADER_OUT)
    fin2 = open("%s/FSUBTEST/2/2.txt" % params.JADER_OUT)

    while True:
        line = fin1.readline()
        if line == "":
            break
        parts = line.strip().split("\t")
        drug = parts[0]
        ses = set(parts[1].split(","))
        dDrug1Se[drug] = ses
    fin1.close()

    while True:
        line = fin2.readline()
        if line == "":
            break
        parts = line.strip().split("\t")
        drug = parts[0]
        ses = set(parts[1].split(","))
        dDrug2Se[drug] = ses
    fin2.close()

    fin = open("%s/SUB/3" % params.JADER_OUT)
    fout = open("%s/SUB/F3" % params.JADER_OUT, "w")
    while True:
        line = fin.readline()
        if line == "":
            break
        line = line.strip()
        parts = line.split("$")
        dDrug = parts[0].split(",")
        ses = parts[1].split(",")
        invalidSes = set()
        for drug in dDrug:
            sD = utils.get_dict(dDrug1Se, drug, set())
            for s in sD:
                invalidSes.add(s)
        drugS = sorted(dDrug)
        drugPairs = []
        for i in range(len(drugS)):
            for j in range(i + 1, len(drugS)):
                pair = "%s,%s" % (drugS[i], drugS[j])
                drugPairs.append(pair)
        for pair in drugPairs:
            sD = utils.get_dict(dDrug2Se, pair, set())
            for s in sD:
                invalidSes.add(s)
        validSes = []
        for se in ses:
            if se not in invalidSes:
                validSes.append(se)
        fout.write("%s$%s\n" % (parts[0], ",".join(validSes)))
    fout.close()
    fin.close()
Example #2
def builder():
    args = _get_parser()
    check_file(args.infile)
    ensure_dir(args.output)

    A = ahocorasick.Automaton()
    origin, annotation = list(), list()

    infile = open(args.infile, 'r', encoding='utf-8')
    for line in infile:
        line = line.rstrip()
        if not line:
            continue
        phrase, means = line.split(':::')
        if not phrase or not means:
            continue
        origin.append(phrase)
        annotation.append(means)

    infile.close()
    assert len(origin) == len(annotation)

    for idx, phrase in enumerate(origin):
        A.add_word(phrase, (idx, phrase))

    A.make_automaton()

    ac_name = os.path.join(args.output, args.ac_name)
    means = os.path.join(args.output, args.mean_name)
    with open(ac_name, 'wb') as outfile:
        pickle.dump(A, outfile, protocol=pickle.HIGHEST_PROTOCOL)
    with open(means, 'wb') as outfile:
        pickle.dump(annotation, outfile, protocol=pickle.HIGHEST_PROTOCOL)
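
For context, a hedged sketch of how the pickled automaton built above might later be queried; it assumes the standard pyahocorasick iter() API, and the file paths are illustrative placeholders rather than the names produced by builder().

import pickle

# Hypothetical lookup step: load the automaton and its annotations, then scan a text.
with open('phrases.ac', 'rb') as f:      # placeholder path
    A = pickle.load(f)
with open('means.pkl', 'rb') as f:       # placeholder path
    annotation = pickle.load(f)

text = 'some input text'
for end_idx, (idx, phrase) in A.iter(text):  # pyahocorasick yields (end_index, stored_value)
    print(phrase, '->', annotation[idx])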
Example #3
    def __init__(self, model="FFNN"):
        resetRandomSeed()

        self.data = None

        utils.ensure_dir("%s/logs" % params.C_DIR)

        PREX = "FDA"
        if model == "FFNN":
            self.model = FFNNModel()
        elif model == "MILI":
            self.model = MILIModel()
        elif model == "MIL":
            self.model = MILModel()
        logPath = "%s/logs/%s_%s_%s" % (params.C_DIR, PREX, self.model.name,
                                        utils.getCurrentTimeString())

        self.logger = MyLogger(logPath)
        self.model.setLogger(self.logger)
        self.logger.infoAll((self.model.name))
        # self.logger.infoAll(self.model.model.named_parameters())
        self.logger.infoAll(
            ("LAYERS, EMBEDDING_SIZE, WEIGHT, BATCH_SIZE", params.N_LAYER,
             params.EMBEDDING_SIZE, params.WEIGHT_ZERO, params.BATCH_SIZE))
        self.logger.infoAll(("NCHANELS, DK, MAX DRUG: ", params.N_CHANEL,
                             params.DK, params.MAX_N_DRUG))
Example #4
    def generate_tfrecords(self):
        utils.ensure_dir(self.outputDir)
        utils.ensure_dir(self.outputImagePathTR)
        utils.ensure_dir(self.outputImagePathVL)
        utils.ensure_dir(self.outputimagePathTS)

        
        num_of_images_for_train_and_val = self.PATIENT_END_TRAINING - self.PATIENT_START_TRAINING + 1
        images_for_validation = int(utils.percentage(self.percent_for_validation, num_of_images_for_train_and_val))
        # Get a random list of patient ids for validation.
        validation_list = random.sample(range(self.PATIENT_START_TRAINING, self.PATIENT_END_TRAINING + 1), images_for_validation)

        # train and validation
        for j in range(self.PATIENT_START_TRAINING, self.PATIENT_END_TRAINING + 1):
            InputFileVolume = os.path.join(self.imagePathTR, "BRATS_%03d.nii.gz" % j)
            InputFileLabel = os.path.join(self.labelPathTR, "BRATS_%03d.nii.gz" % j)
            if os.path.isfile(InputFileVolume) and os.path.isfile(InputFileLabel):
                if j in validation_list:
                    self.generate_tfrecord_from_patient(InputFileVolume, InputFileLabel, False, True, False)
                else:
                    self.generate_tfrecord_from_patient(InputFileVolume, InputFileLabel, True, False, False)

        # test: no label file exists here, so only the volume file is checked
        for j in range(self.PATIENT_START_TEST, self.PATIENT_END_TEST + 1):
            InputFileVolume = os.path.join(self.imagePathTS, "BRATS_%03d.nii.gz" % j)
            if os.path.isfile(InputFileVolume):
                self.generate_tfrecord_from_patient(InputFileVolume, "", False, False, True)
Example #5
def init_logger(log_name, log_dir):
    """
    日志模块
    1. 同时将日志打印到屏幕跟文件中
    2. 默认值保留近30天日志文件
    """
    ensure_dir(log_dir)
    if log_name not in Logger.manager.loggerDict:
        logger = logging.getLogger(log_name)
        logger.setLevel(logging.DEBUG)
        handler = TimedRotatingFileHandler(
            filename=os.path.join(log_dir, "%s.log" % log_name),
            when="D",
            backupCount=30,
        )
        datefmt = "%Y-%m-%d %H:%M:%S"
        format_str = "[%(asctime)s]: %(name)s %(filename)s[line:%(lineno)s] %(levelname)s  %(message)s"
        formatter = logging.Formatter(format_str, datefmt)
        handler.setFormatter(formatter)
        handler.setLevel(logging.INFO)
        logger.addHandler(handler)
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        console.setFormatter(formatter)
        logger.addHandler(console)

        handler = TimedRotatingFileHandler(
            filename=os.path.join(log_dir, "ERROR.log"),
            when="D",
            backupCount=30,
        )
        datefmt = "%Y-%m-%d %H:%M:%S"
        format_str = "[%(asctime)s]: %(name)s %(filename)s[line:%(lineno)s] %(levelname)s  %(message)s"
        formatter = logging.Formatter(format_str, datefmt)
        handler.setFormatter(formatter)
        handler.setLevel(logging.ERROR)
        logger.addHandler(handler)
    logger = logging.getLogger(log_name)
    return logger
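
A minimal usage sketch of init_logger, with placeholder names rather than values taken from the original project:

# Hypothetical call site; 'myapp' and './logs' are illustrative.
logger = init_logger('myapp', './logs')
logger.info('service started')      # written to ./logs/myapp.log and echoed to the console
logger.error('something failed')    # additionally captured in ./logs/ERROR.log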
Example #6
def filterg2():
    dir2 = "%s/FSUBTEST/2" % params.JADER_OUT
    utils.ensure_dir(dir2)

    dDrug1Se = dict()
    fin = open("%s/FSUBTEST/1/1.txt" % params.JADER_OUT)
    while True:
        line = fin.readline()
        if line == "":
            break
        parts = line.strip().split("\t")
        drug = parts[0]
        ses = set(parts[1].split(","))
        dDrug1Se[drug] = ses
    fin.close()
    fin = open("%s/SUB/G2" % params.JADER_OUT)
    fout = open("%s/SUB/GF2" % params.JADER_OUT, "w")
    while True:
        line = fin.readline()
        if line == "":
            break
        line = line.strip()
        parts = line.split("$")
        dDrug = parts[0].split(",")
        ses = parts[1].split(",")
        invalidSes = set()
        for drug in dDrug:
            sD = utils.get_dict(dDrug1Se, drug, set())
            for s in sD:
                invalidSes.add(s)
        validSes = []
        for se in ses:
            if se not in invalidSes:
                validSes.append(se)
        fout.write("%s$%s\n" % (parts[0], ",".join(validSes)))
    fout.close()
Example #7
    def __init__(
        self,
        checkpoint_dir,
        monitor,
        logger,
        arch,
        save_best_only=True,
        best_model_name=None,
        epoch_model_name=None,
        mode="min",
        epoch_freq=1,
        best=None,
    ):
        self.monitor = monitor
        self.checkpoint_dir = checkpoint_dir
        self.save_best_only = save_best_only
        self.epoch_freq = epoch_freq
        self.arch = arch
        self.logger = logger
        self.best_model_name = best_model_name
        self.epoch_model_name = epoch_model_name
        self.use = "on_epoch_end"
        self.default_model_name = "pytorch_model.bin"

        if mode == "min":
            self.monitor_op = np.less
            self.best = np.inf

        elif mode == "max":
            self.monitor_op = np.greater
            self.best = -np.inf

        if best:
            self.best = best

        ensure_dir(self.checkpoint_dir)
Example #8
    pin_memory=True
)

test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=10,
    shuffle=False, num_workers=4,
    worker_init_fn=lambda id: utils.set_seed(seed+id)
)

# %%
if args.arch not in ['resnet18', 'resnet50']:
    print(f'Unknown arch {args.arch}')
    exit(1)

name = f'{args.arch}-unprocessed/{args.pretrain}/{args.train}'
utils.ensure_dir(f'logs/{vars.corda_version}/{name}')
utils.ensure_dir(f'models/{vars.corda_version}/{args.arch}-unprocessed/{args.pretrain}')

train_df.to_csv(f'logs/{vars.corda_version}/{name}/train.csv', index=False)
val_df.to_csv(f'logs/{vars.corda_version}/{name}/val.csv', index=False)
test_df.to_csv(f'logs/{vars.corda_version}/{name}/test.csv', index=False)

with open(f'logs/{vars.corda_version}/{name}/stats.txt', 'w') as f:
    f.write(f'Mean, std: {mean}, {std}\n')
    f.write(f'LR: {args.lr}, epochs: {args.epochs}\n')
    f.write(f'CORDA dataset size: {len(corda_df)} \n\n')

    train_cov_size = [
        len(train_df[train_df.covid == 0]),
        len(train_df[train_df.covid == 1])
    ]
Example #9
def ensureDIR():
    utils.ensure_dir("%s/FSUBTEST" % params.JADER_OUT)
    utils.ensure_dir("%s/FSUBTEST/1" % params.JADER_OUT)
    utils.ensure_dir("%s/FSUBTEST/2" % params.JADER_OUT)

    utils.ensure_dir("%s/SUB" % params.JADER_OUT)
Example #10
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(
                        jaccard(seg_list, cmp_list) > args.threshold
                        for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
Example #11
def main(argv=None):
    config = SafeConfigParser()
    config.read(cmd_args.config_path)
    if cmd_args.restore_checkpoint:
        print('Skipping training phase, loading model checkpoint from: ', 
            config.get('main', 'checkpoint_path'))

    # Get the data.
    train_data_filename = utils.maybe_download(config, 
        config.get('data', 'train_data_filename'))
    train_labels_filename = utils.maybe_download(config, 
        config.get('data', 'train_labels_filename'))
    test_data_filename = utils.maybe_download(config, 
        config.get('data', 'test_data_filename'))
    test_labels_filename = utils.maybe_download(config, 
        config.get('data', 'test_labels_filename'))

    # Extract it into np arrays.
    train_data = utils.extract_data(config, train_data_filename, 60000)
    train_labels = utils.extract_labels(train_labels_filename, 60000)
    test_data = utils.extract_data(config, test_data_filename, 10000)
    test_labels = utils.extract_labels(test_labels_filename, 10000)

    validation_size = config.getint('main', 'validation_size')
    num_epochs = config.getint('main', 'num_epochs')

    # Generate a validation set.
    validation_data = train_data[:validation_size, ...]
    validation_labels = train_labels[:validation_size]
    train_data = train_data[validation_size:, ...]
    train_labels = train_labels[validation_size:]
    num_epochs = num_epochs
    train_size = train_labels.shape[0]

    lenet5 = LeNet5(config)

    x, y_ = lenet5.train_input_placeholders()
    y_conv, logits, keep_prob, param_dict = lenet5.model(x)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_))

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(param_dict['fc1_W']) 
                  + tf.nn.l2_loss(param_dict['fc1_b']) 
                  + tf.nn.l2_loss(param_dict['fc2_W']) 
                  + tf.nn.l2_loss(param_dict['fc2_b']))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer: set up a variable that's incremented once 
    # per batch and controls the learning rate decay.
    batch = tf.Variable(0, dtype=tf.float32)

    # Decay once per epoch, using an exponential schedule starting at 0.01.
    learning_rate = tf.train.exponential_decay(
        0.01,
        batch * config.getint('main', 'batch_size'),
        train_size,
        0.95,
        staircase=True)

    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) \
        .minimize(loss, global_step=batch)

    input_dict = {
        "x": x,
        "y_": y_,
        "y_conv": y_conv,
        "keep_prob": keep_prob,
        "train_data": train_data,
        "train_labels": train_labels,
        "test_data": test_data,
        "test_labels": test_labels,
        "validation_data": validation_data,
        "validation_labels": validation_labels,
        "num_epochs": num_epochs,
        "train_size": train_size
    }

    saver = tf.train.Saver(tf.all_variables())

    evaluator = Evaluator(cmd_args, config, optimizer, 
        learning_rate, loss, saver)
    evaluator.run(input_dict)

    fastgradientsign_advgen = FastGradientSign_AdvGen(cmd_args, [1, 28, 28, 1], saver, config)
    adv_out_df = fastgradientsign_advgen.run(input_dict)

    pkl_path = config.get('main', 'pickle_filepath')
    utils.ensure_dir(os.path.dirname(pkl_path))
    with open(pkl_path, "wb") as pkl:
        pickle.dump(adv_out_df, pkl)
Example #12
import argparse
import logging
import os

parser = argparse.ArgumentParser()
parser.add_argument('--n_epochs', type=int, default=50)
parser.add_argument('--save_dir', type=str, default='saved/')
parser.add_argument('--batch', type=int, default=128)
parser.add_argument('--nemb', type=int, default=22)
parser.add_argument('--method', type=str, default='acgan')
parser.add_argument('--glr', type=float, default=0.0002)
parser.add_argument('--dlr', type=float, default=0.0002)
parser.add_argument('--nf', type=int, default=64)
parser.add_argument('--checkpoint', type=str, default=None)

opt = parser.parse_args()
utils.ensure_dir(opt.save_dir)

handlers = [
    logging.FileHandler(os.path.join(opt.save_dir, 'output.log'), mode='w'),
    logging.StreamHandler()
]
logging.basicConfig(handlers=handlers, level=logging.INFO, format='')
logger = logging.getLogger()

NOISE_DIM = 100
NF = opt.nf
N_EMB = opt.nemb

if __name__ == '__main__':
    L = DataLoader(data_dir='data/',
                   n_emb=N_EMB,
Example #13
def main():
    global connection, cursor
    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))
    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load tokenizer

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()
    n = 10000  # chunk size: how many lines go into each sub-list
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))
    r = []
    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz, )))
    pool.close()
    pool.join()

    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache.keys():
                jieba_cache[z] = tmp[z]
            else:
                print(z)
    for st in stop_words:
        stop_words_cache[st] = 1

    r.clear()
    r = None

    all_lines = len(jieba_cache)
    print("开始执行 总 {} 行".format(all_lines))
    print("缓存成功jieba {}".format(len(jieba_cache)))
    print("缓存成功停用词 {}".format(len(stop_words_cache)))
    all_data = jieba_cache.keys()
    for inline in all_data:
        if inline == '太原去贵阳怎么走':  # leftover debugging hook for one specific query
            print("")
        i = i + 1
        print("当前第 {} 行----总 {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]
        llll = []
        if stop_words:
            for mmmm in seg_list:
                if mmmm not in stop_words_cache.keys():
                    llll.append(mmmm)
            seg_list = llll
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        llll = []
                        for mmmm in sen:
                            if mmmm not in stop_words_cache.keys():
                                llll.append(mmmm)
                        filt_selected.append(llll)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(
                        jaccard(seg_list, cmp_list) > args.threshold
                        for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
                # print("{} jaccard耗时 {}".format( inline, endtime - starttime))
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_array = [line]
            all_bucked[bucket_name] = bucket_array
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74',
                                     user='******',
                                     password='******',
                                     db='august',
                                     port=33306)
        cursor = connection.cursor()

        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("当前批次  {} 共 {}".format(batch_size, len(all_bucked)))
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            (all_bucked_data))
        connection.commit()
        cursor.close()
        connection.close()

    print('All is well')
Example #14
    def run(self, questions):
        args = self._get_parser()

        # preliminary work
        ensure_dir(args.output)

        if args.name_len_update:
            line_cnt = line_counter(args.infile)
            args.name_len = len(str(line_cnt)) + 1

        clean_dir(args.output, args.name_len)
        # end preliminary work

        p_bucket = defaultdict(list)
        save_idx = 0
        id_name = '{0:0' + str(args.name_len) + 'd}'
        # load stop words
        stop_words = get_stop_words(args.stop_words) if os.path.exists(
            args.stop_words) else list()
        # load tokenizer
        seg = Segmentor(args)

        print('Splitting sentence into different clusters ...')
        infile = questions
        for inline in tqdm(infile):
            inline = inline.rstrip()
            line = inline.split(':::')[0]
            is_match = False
            seg_list = list(seg.cut(line))
            if stop_words:
                seg_list = list(filter(lambda x: x not in stop_words,
                                       seg_list))
            for wd in seg_list:
                if is_match:
                    break
                w_bucket = p_bucket[wd]
                for bucket in w_bucket:
                    bucket_path = os.path.join(args.output, bucket)
                    check_file(bucket_path)
                    selected = sample_file(bucket_path, args.sample_number)
                    selected = list(map(lambda x: x.split(':::')[0], selected))
                    selected = list(map(lambda x: list(seg.cut(x)), selected))
                    # remove stop words
                    if stop_words:
                        filt_selected = list()
                        for sen in selected:
                            sen = list(
                                filter(lambda x: x not in stop_words, sen))
                            filt_selected.append(sen)
                        selected = filt_selected
                    # calculate similarity with each bucket
                    if all(
                            jaccard(seg_list, cmp_list) > args.threshold
                            for cmp_list in selected):
                        is_match = True
                        with open(bucket_path, 'a',
                                  encoding='utf-8') as outfile:
                            outfile.write(line + '\n')
                        for w in seg_list:
                            if bucket not in p_bucket[w]:
                                p_bucket[w].append(bucket)
                        break
            if not is_match:
                bucket_name = ('tmp' + id_name).format(save_idx)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line + '\n')
                for w in seg_list:
                    p_bucket[w].append(bucket_name)
                save_idx += 1

        # sort and rename file
        file_list = os.listdir(args.output)
        file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
        cnt = dict()
        for file in file_list:
            file_path = os.path.join(args.output, file)
            cnt[file] = line_counter(file_path)

        sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        name_map = dict()
        for idx, (file_name, times) in enumerate(sorted_cnt):
            origin_path = os.path.join(args.output, file_name)
            new_name = id_name.format(idx)
            new_path = os.path.join(args.output, new_name)
            os.rename(origin_path, new_path)
            name_map[file_name] = new_name

        for k, v in p_bucket.items():
            p_bucket[k] = list(map(lambda x: name_map[x], v))

        # merge the per-cluster files into a single output file
        output_file = os.path.join(args.output, 'all_cluster.txt')
        try:
            if os.path.isfile(output_file):
                os.unlink(output_file)
        except Exception as e:
            print(e)
        file_list = os.listdir(args.output)
        fw = open(output_file, 'w+')
        for file in file_list:
            with open(os.path.join(args.output, file)) as f:
                for line in f.readlines():
                    fw.write(str(int(file)) + ',' + line)
        fw.close()
        df = pd.read_csv(output_file, names=['id', 'text'])
        df.columns = ['cluster_id', 'ques']
        print('All is well')
        # json.dumps(dict(ques=ques))
        df_dict = df.set_index('cluster_id').T.to_dict('records')[0]

        # reshape the dataframe:
        # df 0 aa
        #    0 aaa                   => aa  [aaa]
        #    1 bb                       bb  []
        # df_dict = {0: aa, 1: bb}
        print(df_dict)
        result_dict = {}
        for cluster_id, ques in df_dict.items():
            li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
            # if(ques in li): li.remove(ques)
            result_dict[ques] = li

        my_list = [result_dict]
        my_df = pd.DataFrame(my_list).T
        my_df = my_df.reset_index()
        my_df.columns = ['ques', 'info']
        print(my_df)
        return my_df.to_json(orient="records", force_ascii=False)
Example #15
def main(argv=None):
    config = SafeConfigParser()
    config.read(cmd_args.config_path)
    if cmd_args.restore_checkpoint:
        print('Skipping training phase, loading model checkpoint from: ',
              config.get('main', 'checkpoint_path'))

    # Get the data.
    train_data_filename = utils.maybe_download(
        config, config.get('data', 'train_data_filename'))
    train_labels_filename = utils.maybe_download(
        config, config.get('data', 'train_labels_filename'))
    test_data_filename = utils.maybe_download(
        config, config.get('data', 'test_data_filename'))
    test_labels_filename = utils.maybe_download(
        config, config.get('data', 'test_labels_filename'))

    # Extract it into np arrays.
    train_data = utils.extract_data(config, train_data_filename, 60000)
    train_labels = utils.extract_labels(train_labels_filename, 60000)
    test_data = utils.extract_data(config, test_data_filename, 10000)
    test_labels = utils.extract_labels(test_labels_filename, 10000)

    validation_size = config.getint('main', 'validation_size')
    num_epochs = config.getint('main', 'num_epochs')

    # Generate a validation set.
    validation_data = train_data[:validation_size, ...]
    validation_labels = train_labels[:validation_size]
    train_data = train_data[validation_size:, ...]
    train_labels = train_labels[validation_size:]
    num_epochs = num_epochs
    train_size = train_labels.shape[0]

    lenet5 = LeNet5(config)

    x, y_ = lenet5.train_input_placeholders()
    y_conv, logits, keep_prob, param_dict = lenet5.model(x)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_))

    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(param_dict['fc1_W']) +
                    tf.nn.l2_loss(param_dict['fc1_b']) +
                    tf.nn.l2_loss(param_dict['fc2_W']) +
                    tf.nn.l2_loss(param_dict['fc2_b']))
    # Add the regularization term to the loss.
    loss += 5e-4 * regularizers

    # Optimizer: set up a variable that's incremented once
    # per batch and controls the learning rate decay.
    batch = tf.Variable(0, dtype=tf.float32)

    # Decay once per epoch, using an exponential schedule starting at 0.01.
    learning_rate = tf.train.exponential_decay(
        0.01,
        batch * config.getint('main', 'batch_size'),
        train_size,
        0.95,
        staircase=True)

    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9) \
        .minimize(loss, global_step=batch)

    input_dict = {
        "x": x,
        "y_": y_,
        "y_conv": y_conv,
        "keep_prob": keep_prob,
        "train_data": train_data,
        "train_labels": train_labels,
        "test_data": test_data,
        "test_labels": test_labels,
        "validation_data": validation_data,
        "validation_labels": validation_labels,
        "num_epochs": num_epochs,
        "train_size": train_size
    }

    saver = tf.train.Saver(tf.all_variables())

    evaluator = Evaluator(cmd_args, config, optimizer, learning_rate, loss,
                          saver)
    evaluator.run(input_dict)

    fastgradientsign_advgen = FastGradientSign_AdvGen(cmd_args, [1, 28, 28, 1],
                                                      saver, config)
    adv_out_df = fastgradientsign_advgen.run(input_dict)

    pkl_path = config.get('main', 'pickle_filepath')
    utils.ensure_dir(os.path.dirname(pkl_path))
    with open(pkl_path, "wb") as pkl:
        pickle.dump(adv_out_df, pkl)
Example #16
    def __init__(self,
                 model,
                 loss,
                 metrics,
                 resume,
                 config,
                 train_logger=None):
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.model = model
        self.loss = loss
        self.metrics = metrics
        self.name = config['name']
        self.epochs = config['trainer']['epochs']
        self.save_freq = config['trainer']['save_freq']
        self.verbosity = config['trainer']['verbosity']
        self.summaryWriter = SummaryWriter()

        if tf.test.is_gpu_available():
            if config['cuda']:
                self.with_cuda = True
                self.gpus = {
                    i: item
                    for i, item in enumerate(self.config['gpus'])
                }
                device = 'cuda'
            else:
                self.with_cuda = False
                device = 'cpu'
        else:
            self.logger.warning(
                'Warning: There\'s no CUDA support on this machine, training is performed on CPU.'
            )
            self.with_cuda = False
            device = 'cpu'

        self.device = tf.device(device)
        self.model.to(self.device)

        self.logger.debug('Model is initialized.')
        self._log_memory_usage()

        self.train_logger = train_logger

        self.optimizer = self.model.optimize(config['optimizer_type'],
                                             config['optimizer'])

        self.lr_scheduler = getattr(keras.callbacks.LearningRateScheduler,
                                    config['lr_scheduler_type'], None)
        if self.lr_scheduler:
            self.lr_scheduler = self.lr_scheduler(self.optimizer,
                                                  **config['lr_scheduler'])
            self.lr_scheduler_freq = config['lr_scheduler_freq']
        self.monitor = config['trainer']['monitor']
        self.monitor_mode = config['trainer']['monitor_mode']
        assert self.monitor_mode == 'min' or self.monitor_mode == 'max'
        self.monitor_best = math.inf if self.monitor_mode == 'min' else -math.inf
        self.start_epoch = 1
        self.checkpoint_dir = os.path.join(config['trainer']['save_dir'],
                                           self.name)
        ensure_dir(self.checkpoint_dir)
        json.dump(config,
                  open(os.path.join(self.checkpoint_dir, 'config.json'), 'w'),
                  indent=4,
                  sort_keys=False)
        if resume:
            self._resume_checkpoint(resume)
Example #17
    def run(self, input_dict):
        x = input_dict["x"]
        y_ = input_dict["y_"]
        y_conv = input_dict["y_conv"]
        keep_prob = input_dict["keep_prob"]
        train_data = input_dict["train_data"]
        train_labels = input_dict["train_labels"]
        test_data = input_dict["test_data"]
        test_labels = input_dict["test_labels"]
        validation_data = input_dict["validation_data"]
        validation_labels = input_dict["validation_labels"]
        num_epochs = input_dict["num_epochs"]
        train_size = input_dict["train_size"]

        batch_size = self.config.getint('main', 'batch_size')
        checkpoint_path = self.config.get('main', 'checkpoint_path')
        num_classes = self.config.getint('main', 'num_classes')
        eval_frequency = self.config.getint('main', 'eval_frequency')

        utils.ensure_dir(os.path.dirname(checkpoint_path))
        start_time = time.time()

        with tf.Session() as sess:
            tf.initialize_all_variables().run()
            print('Initialized!')

            if not self.cmd_args.restore_checkpoint:
                print('No checkpoint to load, training model from scratch...')

                if self.cmd_args.test:
                    iter_range = xrange(1)
                else:
                    iter_range = xrange(int(num_epochs * train_size) // batch_size)

                for step in iter_range:
                    offset = (step * batch_size) % (train_size - batch_size)
                    batch_data = train_data[offset:(offset + batch_size), ...]
                    batch_labels = train_labels[offset:(offset + batch_size)]

                    feed_dict = {
                        x: batch_data, 
                        y_: batch_labels,
                        keep_prob: 0.5
                    }

                    _, l, lr, predictions = sess.run(
                        [self.optimizer, self.loss, self.learning_rate, y_conv], feed_dict=feed_dict)

                    if step % eval_frequency == 0:
                        if not self.cmd_args.test:
                            path = self.saver.save(sess, checkpoint_path)
                            print("Saved model checkpoint to {}\n".format(path))
                        elapsed_time = time.time() - start_time
                        start_time = time.time()
                        print('Step %d (epoch %.2f), %.1f ms' %
                            (step, float(step) * batch_size / train_size,
                            1000 * elapsed_time / eval_frequency))
                        print('Minibatch loss: %.3f, learning rate: %.6f' % (l, lr))
                        print('Minibatch error: %.1f%%' % utils.error_rate(predictions, 
                                                                           batch_labels, 
                                                                           self.onehot_labels))
                        print('Validation error: %.1f%%' % utils.error_rate(
                            self.eval_in_batches(y_conv, 
                                                 x, 
                                                 keep_prob, 
                                                 validation_data, 
                                                 sess, 
                                                 batch_size, 
                                                 num_classes), validation_labels, self.onehot_labels))
                        sys.stdout.flush()
        
            # Finally print the result!
            test_error = utils.error_rate(self.eval_in_batches(y_conv, 
                                                               x, 
                                                               keep_prob, 
                                                               test_data, 
                                                               sess,
                                                               batch_size,
                                                               num_classes), test_labels, self.onehot_labels)
            print('Test error: %.1f%%' % test_error)
Example #18
def main(trainingdir, model, num_epochs, size_batch_test, logdir, logdir_w,
         perform_one_hot, binarize_labels):

    global_step = tf.get_variable('global_step',
                                  dtype=tf.int32,
                                  initializer=0,
                                  trainable=False)

    train_list, valid_list, test_list = get_file_lists(trainingdir)

    label_input_size, label_output_size = get_tensor_size(
        perform_one_hot, binarize_labels)

    test_dataset = create_dataset(filenames=test_list,
                                  mode="testing",
                                  num_epochs=1,
                                  batch_size=size_batch_test,
                                  perform_one_hot=perform_one_hot,
                                  binarize_labels=binarize_labels)
    test_iterator = test_dataset.make_initializable_iterator()

    # Feedable iterator assigns each iterator a unique string handle it is going to work on
    handle = tf.placeholder(tf.string, shape=[])
    iterator = tf.data.Iterator.from_string_handle(handle,
                                                   test_dataset.output_types,
                                                   test_dataset.output_shapes)
    x, _ = iterator.get_next()

    x.set_shape([None, 192, 192, 4])
    x = tf.cast(x, tf.float32)

    training_placeholder = tf.placeholder(dtype=tf.bool,
                                          shape=[],
                                          name='training_placeholder')

    if model == "unet_keras":
        from models import unet_keras as model
        logits, logits_soft = model.unet(x, training_placeholder,
                                         label_output_size)
    elif model == "unet_tensorflow":
        from models import unet_tensorflow as model
        logits, logits_soft = model.unet(x,
                                         training=training_placeholder,
                                         norm_option=False,
                                         drop_val=0.5,
                                         label_output_size=label_output_size)

    ######################################## SUMMARIES #########################################################

    tf.summary.image('input_0', tf.expand_dims(x[:, :, :, 0], axis=-1))
    if label_output_size == 1:
        tf.summary.image('prediction',
                         tf.expand_dims(logits_soft[:, :, :, 0], axis=-1))
    elif label_output_size > 1:
        tf.summary.image("prediction", logits_soft[:, :, :, 1:])

    summary_test = tf.summary.merge_all()

    # op to write logs to Tensorboard
    logdir_w = os.path.expanduser(logdir_w)
    utils.ensure_dir(logdir_w)
    writer = tf.summary.FileWriter(logdir_w, graph=tf.get_default_graph())

    # Weight saver
    model_checkpoint_path = os.path.join(logdir, 'Checkpoint')
    saver = tf.train.Saver()

    ######################################## RUN SESSION #########################################################

    with tf.Session() as sess:

        # Initialize Variables
        #restore_weights:
        saver.restore(sess, tf.train.latest_checkpoint(logdir))

        test_handle = sess.run(test_iterator.string_handle())

        sess.run(test_iterator.initializer)

        try:
            while True:
                summary_val, logits_test = sess.run(
                    [summary_test, logits_soft],
                    feed_dict={
                        handle: test_handle,
                        training_placeholder: False
                    })

                writer.add_summary(summary_val)

        except tf.errors.OutOfRangeError:
            pass

    return
Example #19
    pretrained=True).to(device)

# %%
criterion = functools.partial(F.cross_entropy, reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=0.001)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                          patience=15,
                                                          verbose=True)

# %%
tracked_metrics = [
    metrics.Accuracy(multiclass=True),
]

name = f'resnet18-pneumonia-classifier-s{seed}-3-classes-unprocessed'
utils.ensure_dir(f'logs/{vars.corda_version}/{name}')

# %%
best_model = trainer.fit(model=model,
                         train_dataloader=train_dataloader,
                         val_dataloader=val_dataloader,
                         test_dataloader=test_dataloader,
                         test_every=10,
                         criterion=criterion,
                         optimizer=optimizer,
                         scheduler=lr_scheduler,
                         metrics=tracked_metrics,
                         n_epochs=epochs,
                         name=name,
                         metric_choice=metric,
                         mode=mode,
Example #20
def main(trainingdir, model, num_epochs, size_batch_train, size_batch_valid,
         step_metrics, steps_saver, learning_rate, logdir, restore_weights,
         perform_one_hot, binarize_labels):

    global_step = tf.get_variable('global_step', dtype=tf.int32,
                                  initializer=0, trainable=False)

    train_list, valid_list, _ = get_file_lists(trainingdir)

    label_input_size,label_output_size=get_tensor_size(perform_one_hot,binarize_labels)

    train_dataset = create_dataset(filenames=train_list,mode="training", num_epochs=1, batch_size=size_batch_train, perform_one_hot=perform_one_hot, binarize_labels=binarize_labels)
    train_iterator = train_dataset.make_initializable_iterator()
    validation_dataset = create_dataset(filenames=valid_list,mode="validation", num_epochs=1, batch_size=size_batch_valid, perform_one_hot=perform_one_hot, binarize_labels=binarize_labels)
    validation_iterator = validation_dataset.make_initializable_iterator()


    # Feedable iterator assigns each iterator a unique string handle it is going to work on
    handle = tf.placeholder(tf.string, shape = [])
    iterator = tf.data.Iterator.from_string_handle(handle, train_dataset.output_types, train_dataset.output_shapes)
    x, y = iterator.get_next()

    x.set_shape([None, 192, 192, 4])
    x = tf.cast(x, tf.float32)

    training_placeholder = tf.placeholder(dtype=tf.bool, shape=[], name='training_placeholder')

    if model == "unet_keras":
        from models import unet_keras as model
        logits, logits_soft = model.unet(x,training_placeholder,label_output_size)
    elif model == "unet_tensorflow":
        from models import unet_tensorflow as model
        logits, logits_soft = model.unet(x, training=training_placeholder, norm_option=False,drop_val=0.5,label_output_size=label_output_size)


    y.set_shape([None, 192, 192, label_input_size])
    y = tf.cast(y, tf.int32)


    if label_input_size>1: #OneHotEncoding
        loss_op= tf.reduce_mean(tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits))
        #tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)
        #loss_op = tf.losses.get_total_loss(name='loss_op')
    else: #labelEncoding
        if label_output_size==1:
            loss_op=tf.reduce_mean(tf.losses.sigmoid_cross_entropy(multi_class_labels=y, logits=logits))
        else:
            loss_op= tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(labels=y, logits=logits))


    if label_input_size>1: #OneHotEncoding
        #Define the IoU metrics and update operations
        IoU_metrics, IoU_metrics_update = tf.metrics.mean_iou(labels=y, predictions=logits_soft, num_classes=label_input_size, name='my_metric_IoU')
        #Isolate the variables stored behind the scenes by the metric operation
        running_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope="my_metric_IoU")
        # Define initializer to initialize/reset running variables
        running_vars_initializer = tf.variables_initializer(var_list=running_vars)


    optimizer = tf.train.AdamOptimizer(learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group([update_ops, optimizer.minimize(loss_op,global_step=global_step)])

    # Weight saver
    model_checkpoint_path = os.path.join(logdir, 'Checkpoint')
    saver = tf.train.Saver()


    ######################################## SUMMARIES #########################################################

    tf.summary.image('input_0',tf.expand_dims(x[:,:,:,0],axis=-1))


    if label_input_size==1:
        tf.summary.image("labels",tf.cast(y,tf.float32))
    elif label_input_size>1:
        tf.summary.image('labels_0',tf.expand_dims(tf.cast(y,tf.float32)[:,:,:,0],axis=-1))
        tf.summary.image('labels_1',tf.expand_dims(tf.cast(y,tf.float32)[:,:,:,1],axis=-1))
        if label_input_size>2:
            tf.summary.image('labels_2',tf.expand_dims(tf.cast(y,tf.float32)[:,:,:,2],axis=-1))
            tf.summary.image('labels_3',tf.expand_dims(tf.cast(y,tf.float32)[:,:,:,3],axis=-1))


    if label_output_size==1:
        tf.summary.image('prediction',tf.expand_dims(logits_soft[:,:,:,0],axis=-1))
    elif label_output_size>1:
        tf.summary.image("prediction", logits_soft[:,:,:,1:])
        tf.summary.image('prediction_0',tf.expand_dims(logits_soft[:,:,:,0],axis=-1))
        tf.summary.image('prediction_1',tf.expand_dims(logits_soft[:,:,:,1],axis=-1))
        if label_output_size>2:
            tf.summary.image('prediction_2',tf.expand_dims(logits_soft[:,:,:,2],axis=-1))
            tf.summary.image('prediction_3',tf.expand_dims(logits_soft[:,:,:,3],axis=-1))

    #tf.summary.histogram("logits",logits)

    tf.summary.scalar("loss", loss_op)
    
    tf.summary.histogram("logits_soft",logits_soft)
    tf.summary.histogram("logits",logits)        

    summary_op=tf.summary.merge_all()

    # op to write logs to Tensorboard
    logdir = os.path.expanduser(logdir)
    utils.ensure_dir(logdir)
    writer = tf.summary.FileWriter(logdir, graph=tf.get_default_graph())
    writer_val =tf.summary.FileWriter(os.path.join(logdir, 'validation loss'), graph=tf.get_default_graph())


    ######################################## RUN SESSION #########################################################

    with tf.Session() as sess:

        # Initialize Variables
        if restore_weights:
            saver.restore(sess, tf.train.latest_checkpoint(logdir))
        else:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

        train_handle = sess.run(train_iterator.string_handle())
        validation_handle = sess.run(validation_iterator.string_handle())


        #training, validation and saving
        for epoch in range(num_epochs):
            sess.run(train_iterator.initializer)
            step=0
            try:
                while True:

                    #train
                    _,cost,summary_val,step_gl,logits_val,_ = sess.run([train_op,loss_op,summary_op,global_step,logits,logits_soft], feed_dict={handle: train_handle,training_placeholder: True})

                    writer.add_summary(summary_val,step_gl)

                    step += 1
                    print('\n Training step: Epoch {}, batch {} -- Loss: {:.3f}'.format(epoch+1, step, cost))



                    #validation
                    if step % step_metrics == 0:
                        total_validation_loss = [] #list where we will store the loss at each batch
                        sess.run(validation_iterator.initializer)
                        step_val=0
                        # initialize/reset the running variables of the IoU metrics
                        if label_input_size>1: #OneHotEncoding
                            sess.run(running_vars_initializer)

                        try:
                            print('\nPerforming validation')
                            while True:
                                if label_input_size>1: #OneHotEncoding
                                    cost_valid, _ = sess.run([loss_op, IoU_metrics_update], feed_dict={handle: validation_handle,training_placeholder: False})
                                else:
                                    cost_valid = sess.run([loss_op], feed_dict={handle: validation_handle,training_placeholder: False})
                                total_validation_loss.append(cost_valid)
                                step_val += 1
                                #print('\nValidation step: Epoch {}, batch {} -- Loss: {:.3f}'.format(epoch+1, step_val, cost_valid))
                        except tf.errors.OutOfRangeError:
                            pass
                        #loss
                        total_validation_loss = np.mean(total_validation_loss)
                        validation_loss_summary = tf.Summary(value=[tf.Summary.Value(tag="loss", simple_value=total_validation_loss)])
                        writer_val.add_summary(validation_loss_summary,step_gl)

                        #IoU metrics
                        if label_input_size>1: #OneHotEncoding
                            #IoU metrics
                            IoU_score = sess.run(IoU_metrics)
                            IoU_summary = tf.Summary(value=[tf.Summary.Value(tag="IoU_metrics", simple_value=IoU_score)])
                            writer.add_summary(IoU_summary,step_gl)
                            print('\n Epoch {} and training batch {} -- Validation loss {:.3f} and IoU metrics {:.3f}'.format(epoch+1, step,total_validation_loss, IoU_score))
                        else:
                            print('\n Epoch {} and training batch {} -- Validation loss {:.3f}'.format(epoch+1, step,total_validation_loss))

                    #saving
                    if step % steps_saver == 0:
                        print('\n Step {} Saving weights to {}'.format(step+1, model_checkpoint_path))
                        saver.save(sess, save_path=model_checkpoint_path,global_step=global_step)

            except tf.errors.OutOfRangeError:
                pass

    return