Example #1
def predict_line(param):
    # Initialize the logger
    logger = get_logger(param.test_log_file)
    tf_config = tf.ConfigProto()
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Build the model (parameters are restored from the checkpoint below)
    model = Model(param, mapping_dict)
    # Start testing
    with tf.Session(config=tf_config) as sess:
        # First check whether a model checkpoint exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # Check whether a trained model is available
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # If it exists, restore it
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        while True:
            # Repeatedly read sentences from stdin and predict
            line = input("Enter a test sentence: ")
            raw_inputs, model_inputs = input_from_line_with_feature(line)
            tag = model.evaluate_line(sess, model_inputs)
            result = result_to_json(raw_inputs, tag)
            result = js.dumps(result,
                              ensure_ascii=False,
                              indent=4,
                              separators=(',', ': '))
            with open('./result/result.json', 'w', encoding='utf-8') as f:
                f.write(result)
            print("预测结果为:{}".format(result))
Example #2
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load mutations
    mut_df = pd.read_csv(args.mutations_file, sep='\t')
    samples = list(OrderedDict.fromkeys(mut_df[PATIENT]))
    sample_index = dict(zip(samples, range(len(samples))))
    logger.info('- Loaded %s mutations from %s samples' %
                (len(mut_df), len(samples)))

    # Set up multiprocessing
    logger.info('[Classifying kataegis (%s+ mutations and IMD<=%s)]' %
                (KATAEGIS_MIN_MUT, KATAEGIS_IMD))
    from sklearn.externals.joblib import Parallel, delayed
    import multiprocessing
    available_cpus = multiprocessing.cpu_count()
    if args.n_cores == -1:
        n_cores = available_cpus
    else:
        n_cores = min(args.n_cores, available_cpus)

    # Classify kataegis
    def data_producer():
        for patient, patient_df in mut_df.groupby([PATIENT]):
            for chrom, chrom_df in patient_df.groupby([CHR]):
                imd = chrom_df[IMD].tolist()
                imd[0] = 0
                yield imd, patient, chrom

    def sort_patient_chrom(p, chrom):
        if chrom == 'X': return sample_index[p], 23
        elif chrom == 'Y': return sample_index[p], 24
        else: return sample_index[p], int(chrom)

    results = Parallel(n_jobs=n_cores,
                       verbose=10)(delayed(find_kataegis_loci)(imd, s, chrom)
                                   for imd, s, chrom in data_producer())
    results.sort(key=lambda k: sort_patient_chrom(k[1], k[2]))
    mut_df[KATAEGIS] = [
        c for sample_chrom_c, _, _ in results for c in sample_chrom_c
    ]

    logger.info('- Identified %s mutations participating in kataegis loci' %
                mut_df[KATAEGIS].sum())

    # Save updated mutations dataframe to file
    logger.info('[Outputting updated dataframe to file]')
    logger.info('- Writing table to %s' % args.output_file)
    mut_df.to_csv(args.output_file, sep='\t', index=False)
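A note on the parallel section above: sklearn.externals.joblib has been removed from recent scikit-learn releases, so the same fan-out is normally written against the standalone joblib package. A minimal sketch of the Parallel/delayed pattern, with a toy worker standing in for find_kataegis_loci:

# Minimal joblib sketch; square() is a toy stand-in for find_kataegis_loci.
from joblib import Parallel, delayed

def square(x):
    return x * x

results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(10))
# results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

Parallel returns results in the same order as the inputs, which is why the example above can re-align them with the original dataframe afterwards.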
Example #3
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load the signatures
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    categories = list(sig_df.columns)
    category_index = dict(zip(categories, range(len(categories))))

    logger.info('- Loaded %s x %s signature matrix' % sig_df.values.shape)

    # Load the mutations
    mut_df = pd.read_csv(args.mutations_file,
                         sep='\t',
                         usecols=[PATIENT, CATEGORY, MUT_DIST])
    samples = sorted(set(mut_df[PATIENT]))

    logger.info('- Loaded %s mutations in %s samples' %
                (len(mut_df), len(samples)))

    # Add the category index and create sequences of mutations
    logger.info('[Processing data into SigMa format]')
    mut_df[CATEGORY_IDX] = mut_df.apply(lambda r: category_index[r[CATEGORY]],
                                        axis='columns')

    sampleToSequence = dict((s, list(map(int, s_df[CATEGORY_IDX])))
                            for s, s_df in mut_df.groupby([PATIENT]))
    sampleToPrevMutDists = dict((s, list(map(float, s_df[MUT_DIST])))
                                for s, s_df in mut_df.groupby([PATIENT]))

    # Save to JSON
    logger.info('- Saving to file %s' % args.output_file)
    with open(args.output_file, 'w') as OUT:
        output = dict(sampleToSequence=sampleToSequence,
                      sampleToPrevMutDists=sampleToPrevMutDists,
                      samples=samples,
                      categories=categories,
                      params=vars(args))
        json.dump(output, OUT)
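The two dictionaries above use a common pandas idiom: group by patient and collect one column into a list per group. A self-contained toy version (the column names here are made up):

# Toy version of the groupby-to-dict idiom used for sampleToSequence above
# (the 'patient' / 'category_idx' column names are made up).
import pandas as pd

df = pd.DataFrame({'patient': ['P1', 'P1', 'P2'],
                   'category_idx': [3, 7, 1]})
sample_to_sequence = {s: list(map(int, s_df['category_idx']))
                      for s, s_df in df.groupby('patient')}
# {'P1': [3, 7], 'P2': [1]}

Note that grouping by a one-element list, as in mut_df.groupby([PATIENT]), makes the group key a 1-tuple in recent pandas versions; grouping by the bare column name avoids that.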
Example #4
def test(param):
    # Check parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # Get the batch manager
    test_manager = BatchManager(param.test_batch_size, name='test')
    number_dataset = test_manager.len_data
    print("total of number test data is {}".format(number_dataset))
    # 配置日志
    logger = get_logger(param.test_log_file)
    # 读取字典
    mapping_dict = get_dict(param.dict_file)
    # 搭建模型
    model = Model(param, mapping_dict)
    # 配置GPU参数
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # First check whether a model checkpoint exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # Check whether a trained model is available
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # If it exists, restore it
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        # Start evaluation
        evaluate(sess, param, model, "test", test_manager, logger)
        logger.info("The best_f1 on test_dataset is {:.2f}".format(
            model.best_test_f1.eval()))
        logger.info('Time to test with batch size {} is {:.2f} sec\n'.format(
            param.test_batch_size,
            time.time() - start))
Example #5
    def __init__(self, name, word_to_id, id_to_tag, parameters):

        self.logger = get_logger(name)
        self.params = parameters
        self.num_words = len(word_to_id)
        self.learning_rate = self.params.lr
        self.global_step = tf.Variable(0, trainable=False)
        self.initializer = tf.contrib.layers.xavier_initializer
        self.tags = [tag for i, tag in id_to_tag.items()]
        self.tag_num = len(self.tags)

        # add placeholders for the model
        self.inputs = tf.placeholder(dtype=tf.int32,
                                     shape=[None, self.params.word_max_len],
                                     name="Inputs")
        self.labels = tf.placeholder(dtype=tf.int32,
                                     shape=[None, self.params.word_max_len],
                                     name="Labels")
        # actual sequence length of each input
        self.lengths = tf.placeholder(dtype=tf.int32,
                                      shape=[None],
                                      name="Lengths")
        if self.params.feature_dim:
            self.features = tf.placeholder(dtype=tf.float32,
                                           shape=[
                                               None, self.params.word_max_len,
                                               self.params.feature_dim
                                           ],
                                           name="Features")
        self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")

        # get embedding of input sequence
        embedding = self.get_embedding(self.inputs, word_to_id)
        # apply dropout on embedding
        rnn_inputs = tf.nn.dropout(embedding, self.dropout)

        # concat extra features with embedding
        if self.params.feature_dim:
            rnn_inputs = tf.concat([rnn_inputs, self.features], 2)
            # rnn_inputs = tf.concat(2, [rnn_inputs, self.features])

        # extract features
        rnn_features = self.bilstm_layer(rnn_inputs)

        # projection layer
        self.scores = self.project_layer(rnn_features, self.tag_num)

        # calculate loss of crf layer
        self.trans, self.loss = self.loss_layer(self.scores, self.tag_num)

        # optimizer of the model
        self.opt = tf.train.AdamOptimizer(self.learning_rate)

        # apply grad clip to avoid gradient explosion
        grads_vars = self.opt.compute_gradients(self.loss)

        capped_grads_vars = [(tf.clip_by_value(g, -self.params.clip,
                                               self.params.clip), v)
                             for g, v in grads_vars]  # gradient clipping
        self.train_op = self.opt.apply_gradients(capped_grads_vars,
                                                 self.global_step)
        self.saver = tf.train.Saver(tf.global_variables())
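The constructor above clips each gradient elementwise with tf.clip_by_value. A common alternative for recurrent models is global-norm clipping; a small self-contained TF1-style sketch for comparison (an alternative, not what the class above does):

# Sketch of global-norm gradient clipping as an alternative to elementwise clipping.
import tensorflow as tf  # TF1.x graph-mode API, as in the class above

w = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(w))
global_step = tf.Variable(0, trainable=False)
opt = tf.train.AdamOptimizer(0.001)

grads, variables = zip(*opt.compute_gradients(loss))
clipped, _ = tf.clip_by_global_norm(grads, clip_norm=5.0)
train_op = opt.apply_gradients(zip(clipped, variables), global_step=global_step)

Global-norm clipping rescales all gradients together, so their relative directions are preserved, whereas clip_by_value caps each element independently.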
Example #6
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load the signatures
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    categories = list(sig_df.columns)
    category_index = dict(zip(categories, range(len(categories))))

    logger.info('- Loaded %s x %s signature matrix' % sig_df.values.shape)

    # Load the mutations
    use_strand_cols = list(set(STRAND_COLUMNS) - {SAME_ALLELE, SAME_STRAND})
    mut_df = pd.read_csv(args.mutations_file,
                         sep='\t',
                         usecols=[PATIENT, CATEGORY, REF, CHR, KATAEGIS] +
                         use_strand_cols)
    samples = sorted(set(mut_df[PATIENT]))

    logger.info('- Loaded %s mutations in %s samples' %
                (len(mut_df), len(samples)))

    # Add the category index and create sequences of mutations
    logger.info('[Processing data into sMMM format]')
    mut_df[CATEGORY_IDX] = mut_df.apply(lambda r: category_index[r[CATEGORY]],
                                        axis='columns')
    mut_df[SAME_ALLELE] = mut_df[REF]
    #mut_df[SAME_STRAND] = mut_df[REF].replace({REF: {'A': 0, 'C': 1, 'G': 0, 'T': 1}})
    mut_df[SAME_STRAND] = mut_df[REF]
    mut_df = mut_df.replace({SAME_STRAND: {'A': 0, 'C': 1, 'G': 0, 'T': 1}})

    sampleToStrandMatch = {s: {} for s in samples}
    sampleToKataegis = {}
    for s, sample_df in mut_df.groupby([PATIENT]):
        # Go sample by sample to figure out when adjacent mutations match
        sampleToKataegis[s] = sample_df[KATAEGIS].tolist()
        for col in STRAND_COLUMNS:
            # Compare adjacent mutations
            xs = np.array(sample_df[col].tolist())
            if col in {SAME_ALLELE, SAME_STRAND}:
                matched = xs == np.roll(xs, 1)
            else:
                # Convert NaNs to 0 because otherwise they could
                # break the boolean comparison
                xs[np.isnan(xs)] = 0
                xs = xs.astype(bool)
                matched = xs & np.roll(xs, 1)

            # Finally, set the first position to False (to be safe)
            matched[0] = False

            sampleToStrandMatch[s][col] = matched.astype(int).tolist()

        sampleToStrandMatch[s][NO_STRAND] = [0] + [1] * (len(sample_df) - 1)

    sampleToSequence = dict((s, list(map(int, s_df[CATEGORY_IDX])))
                            for s, s_df in mut_df.groupby([PATIENT]))
    chromosomeMap = dict(
        (s, list(s_df[CHR])) for s, s_df in mut_df.groupby([PATIENT]))

    # Save to JSON
    logger.info('- Saving to file %s' % args.output_file)
    with open(args.output_file, 'w') as OUT:
        output = dict(sampleToSequence=sampleToSequence,
                      sampleToStrandMatch=sampleToStrandMatch,
                      sampleToKataegis=sampleToKataegis,
                      chromosomeMap=chromosomeMap,
                      samples=samples,
                      categories=categories,
                      params=vars(args))
        json.dump(output, OUT)
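The strand-matching loop above flags mutations that match the previous mutation by comparing each array with a copy rolled by one position. A toy illustration of that trick:

# Toy illustration of the np.roll trick used above: flag positions whose value
# matches the previous position; index 0 wraps around, so it is forced to False.
import numpy as np

xs = np.array([1, 1, 0, 0, 1])
matched = xs == np.roll(xs, 1)   # compare each element with its predecessor
matched[0] = False               # the first element has no predecessor
# matched -> [False, True, False, True, False]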
Example #7
def main(args):
    """
    The main function for reproducing the paper's results
    :param command: 'loo' for leave-one-out, 'viterbi' for Viterbi decoding
    :param model: 'sigma' or 'mmm'
    :param batch: takes indices from batch * batch_size to (batch + 1) * batch_size
    :param batch_size: see batch
    :param threshold: maximal distance (in bp) within clouds; use 0 to not split (i.e. use whole chromosomes)
    :param max_iterations: maximal number of iterations for training the model
    :param epsilon: minimum improvement per iteration; if the improvement is lower, training stops
    :param random_state: Random state to initialize the models
    :param out_dir: where to save all the files
    :return:
    """
    # Create simple logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Get the list of samples we're going to run on
    with open(args.mutations_file, 'r') as IN:
        obj = json.load(IN)
        samples = obj.get('samples')
        categories = obj.get('categories')

    if len(args.sample_names) == 0:
        sample_indices = range(len(samples))
    else:
        sample_indices = [samples.index(s) for s in args.sample_names]

    logger.info('- Loading data for %s samples' % len(sample_indices))

    # Load the emissions matrix
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    emissions = sig_df.values
    if len(args.active_signatures) > 0:
        emissions = emissions[np.array(args.active_signatures) - 1]

    assert (list(sig_df.columns) == categories)

    logger.info('- Loaded %s x %s emissions matrix' % emissions.shape)
    # if threshold <= 0:
    #     out_dir_for_file = os.path.join(out_dir, model)
    #     threshold = 1e99
    # else:
    #     out_dir_for_file = os.path.join(out_dir, model + '_' + str(threshold))

    experiment_tuples = get_split_sequences_by_threshold(
        args.mutations_file, args.cloud_threshold, sample_indices)

    # Perform the experiments
    logger.info('[Performing experiments]')
    if args.cross_validation_mode:
        logger.info('- Cross-validation mode')
        func = leave_one_out
    else:
        logger.info('- Viterbi mode')
        func = get_viterbi

    for experiment_tuple in tqdm(experiment_tuples,
                                 total=len(sample_indices),
                                 ncols=80):
        sample = experiment_tuple[0]
        out_file = '%s/%s-%s' % (args.output_directory, args.model_name,
                                 sample)
        dict_to_save = func(experiment_tuple, args.model_name, emissions,
                            args.max_iter, args.tolerance, args.random_state)
        to_json(out_file, dict_to_save)

    logger.info('- Done')
Example #8
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load the signatures
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    categories = list(sig_df.columns)
    category_index = dict(zip(categories, range(len(categories))))

    logger.info('- Loaded %s x %s signature matrix' % sig_df.values.shape)

    # Load the mutations
    mut_df = pd.read_csv(
        args.mutations_file,
        sep='\t',
        usecols=[PATIENT, CATEGORY, MUT_DIST, CHROMOSOME, START_POS],
        dtype={
            PATIENT: str,
            CATEGORY: str,
            MUT_DIST: float,
            CHROMOSOME: str,
            START_POS: int
        })
    samples = sorted(set(mut_df[PATIENT]))

    logger.info('- Loaded %s mutations in %s samples' %
                (len(mut_df), len(samples)))

    # If a mappability blacklist is provided, use it to remove mutations
    if args.mappability_blacklist_file is not None:
        # Load the dataframe and process into a dictionary
        mappability_df = pd.read_csv(args.mappability_blacklist_file, sep=',')
        chrom_idx, start_idx, stop_idx = mappability_df.columns[:3]

        map_blacklist = defaultdict(list)
        unmappable_bases = 0
        for _, r in mappability_df.iterrows():
            chrom = r[chrom_idx][3:]
            map_blacklist[chrom].append((int(r[start_idx]), int(r[stop_idx])))
            unmappable_bases += map_blacklist[chrom][-1][1] - map_blacklist[
                chrom][-1][0]

        logger.info(
            '- Loaded unmappable regions spanning %s bases in %s chromosomes' %
            (unmappable_bases, len(map_blacklist)))

        # Remove mutations that fall in a blacklisted region
        logger.info('[Removing unmappable mutations]')

        def mappable(r):
            for start, stop in map_blacklist[r[CHROMOSOME]]:
                if start <= r[START_POS] <= stop:
                    return False
            return True

        n_muts = len(mut_df)
        mut_df = mut_df[mut_df.apply(mappable, axis='columns')]

        n_unmappable = n_muts - len(mut_df)
        logger.info('\t-> Removed %s mutations that were not mappable' %
                    n_unmappable)

    # Add the category index and create sequences of mutations
    logger.info('[Processing data into SigMa format]')
    mut_df[CATEGORY_IDX] = mut_df.apply(lambda r: category_index[r[CATEGORY]],
                                        axis='columns')

    sampleToSequence = dict((s, list(map(int, s_df[CATEGORY_IDX])))
                            for s, s_df in mut_df.groupby([PATIENT]))
    sampleToPrevMutDists = dict((s, list(map(float, s_df[MUT_DIST])))
                                for s, s_df in mut_df.groupby([PATIENT]))

    # Save to JSON
    logger.info('- Saving to file %s' % args.output_file)
    with open(args.output_file, 'w') as OUT:
        output = dict(sampleToSequence=sampleToSequence,
                      sampleToPrevMutDists=sampleToPrevMutDists,
                      samples=samples,
                      categories=categories,
                      params=vars(args))
        json.dump(output, OUT)
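The mappable check above scans every blacklisted interval on a chromosome for each mutation, which is fine for short blacklists. For larger ones, a common alternative is to sort the intervals and binary-search the start positions; a rough sketch of that idea (an alternative, not part of the script above, assuming non-overlapping intervals):

# Sketch of an interval-blacklist lookup via binary search over sorted interval starts.
from bisect import bisect_right

def build_index(intervals):
    intervals = sorted(intervals)
    starts = [s for s, _ in intervals]
    return starts, intervals

def in_blacklist(pos, starts, intervals):
    i = bisect_right(starts, pos) - 1          # last interval starting at or before pos
    return i >= 0 and intervals[i][0] <= pos <= intervals[i][1]

starts, intervals = build_index([(100, 200), (500, 800)])
# in_blacklist(150, starts, intervals) -> True; in_blacklist(300, starts, intervals) -> False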
Example #9
def train(param):
    # Check parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate must be between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"
    # Prepare the data
    train_manager = BatchManager(param.batch_size, name='train')
    number_dataset = train_manager.len_data
    print("total number of training examples is {}".format(number_dataset))
    # Create the required directories
    make_path(param)
    # Set up logging
    logger = get_logger(param.train_log_file)
    # Load the dictionary
    mapping_dict = get_dict(param.dict_file)
    # Load senc_tag in preparation for loading the word vectors
    senc_tag = get_sent_tag(param.sent_tag_file)
    # Load the pretrained embeddings
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        mapping_dict['word'][2].copy(), param.emb_file,
        list(
            itertools.chain.from_iterable([[w[0] for w in s]
                                           for s in senc_tag])))
    # Get the number of training steps per epoch
    steps_per_epoch = train_manager.len_data
    # Configure GPU options
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        # Initialize the model
        model = creat_model(sess,
                            Model,
                            param.ckpt_path,
                            load_word2vec,
                            param,
                            id_to_char,
                            logger,
                            map_all=mapping_dict)
        for i in range(param.max_epoch):
            loss = []
            total_loss = 0
            # Start the epoch timer
            start = time.time()
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, batch)
                # Track batch losses for the running average
                loss.append(batch_loss)
                # Accumulate the total loss for the epoch-level average
                total_loss += batch_loss
                if step % 5 == 0:
                    logger.info(
                        "epoch:{}, step:{}/{}, avg_loss:{:>9.4f}".format(
                            i + 1, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
            # Save the model
            model.save_model(sess, logger, i)
            logger.info('Epoch {}, total Loss {:.4f}'.format(
                i + 1, total_loss / train_manager.len_data))
            logger.info(
                'Time taken for one epoch {:.4f} min, about {:.2f} h for the remaining epochs\n'
                .format((time.time() - start) / 60,
                        ((param.max_epoch - i - 1) *
                         (time.time() - start)) / 3600))
Example #10
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load strand information and process into dictionaries
    strand_df = pd.read_csv(args.strand_info_file, sep='\t')
    strand_keys = list(zip(strand_df['tChr'], strand_df['tPos0']))
    is_left = dict(zip(strand_keys, map(bool, strand_df['tIsLeft'])))
    is_right = dict(zip(strand_keys, map(bool, strand_df['tIsRight'])))
    is_tx_plus = dict(zip(strand_keys, map(bool, strand_df['tTxPlus'])))
    is_tx_minus = dict(zip(strand_keys, map(bool, strand_df['tTxMinus'])))

    logger.info('- Loaded strand information in %s bins' % len(strand_df))

    # Define the main categorization by strand function. We return six categories:
    # 1) Lagging strand
    # 2) Leading strand
    # 3) Template (genic regions only)
    # 4) Non template
    # 5) Transcription and replication in the same direction (genic regions only)
    # 6) Transcription and replication in opposite directions
    def categorize_by_strand(mut):
        # Get the chromosome and bin start positions
        chrom = 'chr%s' % mut[CHR]
        pos = mut[POS_START]
        bin_start = np.floor(pos / 20000) * 20000

        strand_key = sk = (chrom, bin_start)

        is_ref_plus = mut[REF] in {'C', 'T'}
        is_ref_minus = not is_ref_plus

        # Classify leading/lagging
        if not (is_left[sk] or is_right[sk]):
            leading = np.nan
            lagging = np.nan
        elif (is_ref_plus and is_left[sk]) or (is_ref_minus and is_right[sk]):
            lagging = 0
            leading = 1
        else:
            lagging = 1
            leading = 0

        # Classify template/non-template
        if not (is_tx_plus[sk] or is_tx_minus[sk]):
            template = np.nan
            non_template = np.nan
        elif (is_ref_plus and is_tx_plus[sk]) or (is_ref_minus
                                                  and is_tx_minus[sk]):
            template = 0
            non_template = 1
        else:
            template = 1
            non_template = 0

        # Classify tx/rep as same/opposite
        if not (is_left[sk] or is_right[sk]) or not (is_tx_plus[sk]
                                                     or is_tx_minus[sk]):
            rep_tx_same = np.nan
            rep_tx_opposite = np.nan
        elif (is_right[sk] and is_tx_plus[sk]) or (is_left[sk]
                                                   and is_tx_minus[sk]):
            rep_tx_same = 1
            rep_tx_opposite = 0
        else:
            rep_tx_same = 0
            rep_tx_opposite = 1

        return leading, lagging, template, non_template, rep_tx_same, rep_tx_opposite

    # Load mutations
    mut_df = pd.read_csv(args.mutations_file, sep='\t')
    columns = mut_df.columns
    samples = set(mut_df[PATIENT])
    logger.info('- Loaded %s mutations from %s samples' %
                (len(mut_df), len(samples)))

    # Add strand information
    logger.info('[Adding strand information]')
    strand_categories = list(
        zip(*[categorize_by_strand(m) for _, m in mut_df.iterrows()]))
    strand_category_names = [
        LEADING, LAGGING, TEMPLATE, NON_TEMPLATE, REP_TX_SAME_DIR,
        REP_TX_OPP_DIR
    ]
    for i, strand_cat in enumerate(strand_category_names):
        mut_df[strand_cat] = strand_categories[i]
        logger.info('- %s mutations "%s"' %
                    (np.nansum(strand_categories[i]), strand_cat))

    # Save updated mutations dataframe to file
    logger.info('[Outputting updated dataframe to file]')
    logger.info('- Writing table to %s' % args.output_file)
    mut_df = mut_df[columns.tolist() + strand_category_names]
    mut_df.to_csv(args.output_file, sep='\t', index=False)
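The strand lookup above keys each mutation by the start of its 20 kb bin, computed with np.floor(pos / 20000) * 20000. A quick illustration of what that produces:

# Quick check of the 20 kb binning used to build the strand lookup keys above.
import numpy as np

for pos in (0, 19999, 20000, 123456):
    print(pos, np.floor(pos / 20000) * 20000)
# 0 -> 0.0, 19999 -> 0.0, 20000 -> 20000.0, 123456 -> 120000.0

The bin start comes out as a float, which still matches integer bin starts in the strand dictionaries because Python treats numerically equal ints and floats as the same dictionary key.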
Example #11
# FIXME
# 1. change save_dir working
# 2. load_emb : increase vocab size
# 3. include out_predict

args = utils.parse_args()

LOG_FILE = args.save_dir + "/log_file"

# set up logging
# logger will output to both sys.stdout and
# to log_file in save_dir
logger = utils.get_logger("Bi-LSTM_CNN_CRF", LOG_FILE)
logger.info("#" * 50)
data_utils.get_logger(LOG_FILE)
collection.get_log_file_path(LOG_FILE)

# generate data set for training
data = data_utils.load_data(args)
train_char_x = data["char"][0]
train_word_x = data["train_word"][0]
train_word_y = np.expand_dims(data["train_word"][1], -1)
train_orth_char_x = data["orth_char"][0]
train_orth_x = data["train_orth"]
n_train_examples = len(train_word_x)

dev_char_x = data["char"][1]
dev_word_x = data["dev_word"][0]
dev_word_y = data["dev_word"][1]
dev_mask = data["dev_word"][2]
Example #12
from __future__ import print_function, division
import os

import time
import argparse
from glob import glob

from tensorflow import keras
import numpy as np

from image_similarity_measures.quality_metrics import psnr, uiq, sam, sre

from data_utils import get_logger
from patches import recompose_images, OpenDataFilesTest, OpenDataFiles

logger = get_logger(__name__)

SCALE = 2000
MODEL_PATH = "../models/"


def write_final_dict(metric, metric_dict):
    # Create a directory to save the text file containing the evaluation values.
    predict_path = "val_predict/"
    if not os.path.exists(predict_path):
        os.makedirs(predict_path)

    with open(os.path.join(predict_path, metric + '.txt'), 'w') as f:
        f.writelines('{}:{}\n'.format(k, v) for k, v in metric_dict.items())
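For context, a call to write_final_dict defined above might look like this (the metric values and file names here are made up):

# Hypothetical call to write_final_dict; the image names and scores are made up.
psnr_values = {"scene_01.tif": 31.2, "scene_02.tif": 29.8}
write_final_dict("psnr", psnr_values)
# -> writes val_predict/psnr.txt with one "name:value" line per image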

Example #13
def test():
    """
    Function to test the models
    :return:
    """
    models = list()
    for i in range(utils.num_folds):
        models.append(
            load_model(model_utils.model_filepath.format(i),
                       custom_objects={'mywloss': mywloss}))

    import time
    start = time.time()
    remain_df = None

    def the_unique(x):
        # Order-preserving de-duplication of consecutive values
        # (rows for each object_id arrive grouped together).
        return [x[i] for i in range(len(x)) if x[i] != x[i - 1]]

    for i_c, df in enumerate(
            pd.read_csv(utils.test_data_filepath,
                        chunksize=utils.test_chunksize,
                        iterator=True)):
        unique_ids = the_unique(df['object_id'].tolist())
        new_remain_df = df.loc[df['object_id'] == unique_ids[-1]].copy()

        if remain_df is None:
            df = df.loc[df['object_id'].isin(unique_ids[:-1])].copy()
        else:
            df = pd.concat(
                [remain_df, df.loc[df['object_id'].isin(unique_ids[:-1])]],
                axis=0)

        # Create remaining samples df
        remain_df = new_remain_df

        preds_np_arr = predict_chunk(df_=df, models=models)
        preds_df = pd.DataFrame(data=preds_np_arr)
        print('Shape of predictions: {}'.format(preds_np_arr.shape))

        if i_c == 0:
            preds_df.to_csv(utils.predictions_file,
                            header=False,
                            index=False,
                            float_format='%.6f')
        else:
            preds_df.to_csv(utils.predictions_file,
                            header=False,
                            mode='a',
                            index=False,
                            float_format='%.6f')

        del preds_np_arr, preds_df
        gc.collect()

        if (i_c + 1) % 10 == 0:
            utils.get_logger().info('%15d done in %5.1f' %
                                    (utils.test_chunksize * (i_c + 1),
                                     (time.time() - start) / 60))
            print('%15d done in %5.1f' % (utils.test_chunksize * (i_c + 1),
                                          (time.time() - start) / 60))
    # Compute last object in remain_df

    preds_np_arr = predict_chunk(df_=remain_df, models=models)
    preds_df = pd.DataFrame(data=preds_np_arr)
    preds_df.to_csv(utils.predictions_file,
                    header=False,
                    mode='a',
                    index=False,
                    float_format='%.6f')
    z = pd.read_csv(utils.predictions_file)
    z = z.groupby('object_id').mean()
    z.to_csv(utils.final_predictions_file, index=True, float_format='%.6f')
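The loop above reads the test set in chunks and carries the rows of the last, possibly incomplete, object over to the next chunk. A stripped-down sketch of that carry-over pattern (toy file and column names, no model code):

# Toy sketch of the chunked-read carry-over pattern used in test() above.
# 'objects.csv' is a made-up file name; process() stands in for predict_chunk.
import pandas as pd

def process(df):
    pass  # stand-in for predict_chunk

remain_df = None
for chunk in pd.read_csv('objects.csv', chunksize=1000):
    ids = chunk['object_id'].tolist()
    last_id = ids[-1]
    carry = chunk[chunk['object_id'] == last_id].copy()
    done = chunk[chunk['object_id'] != last_id]
    if remain_df is not None:
        done = pd.concat([remain_df, done], axis=0)
    process(done)        # all complete objects in this chunk
    remain_df = carry    # the last object may continue in the next chunk
if remain_df is not None:
    process(remain_df)   # flush the final object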
Example #14
        gc.collect()

        if (i_c + 1) % 10 == 0:
            utils.get_logger().info('%15d done in %5.1f' %
                                    (utils.test_chunksize * (i_c + 1),
                                     (time.time() - start) / 60))
            print('%15d done in %5.1f' % (utils.test_chunksize * (i_c + 1),
                                          (time.time() - start) / 60))
    # Compute last object in remain_df

    preds_np_arr = predict_chunk(df_=remain_df, models=models)
    preds_df = pd.DataFrame(data=preds_np_arr)
    preds_df.to_csv(utils.predictions_file,
                    header=False,
                    mode='a',
                    index=False,
                    float_format='%.6f')
    z = pd.read_csv(utils.predictions_file)
    z = z.groupby('object_id').mean()
    z.to_csv(utils.final_predictions_file, index=True, float_format='%.6f')


if __name__ == '__main__':
    gc.enable()
    utils.create_logger()
    try:
        test()
    except Exception:
        utils.get_logger().exception('Unexpected Exception Occurred')
        raise