def predict_line(param):
    # Initialize the logger
    logger = get_logger(param.test_log_file)
    tf_config = tf.ConfigProto()
    # Load the mapping dictionary
    mapping_dict = get_dict(param.dict_file)
    # Build the model (parameters are restored from the saved checkpoint below)
    model = Model(param, mapping_dict)
    # Start prediction
    with tf.Session(config=tf_config) as sess:
        # First check whether a trained model exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # If a trained checkpoint exists, restore it
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # Reload the saved parameters
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        while True:
            # Repeatedly read sentences from stdin and predict
            line = input("Please enter a test sentence: ")
            raw_inputs, model_inputs = input_from_line_with_feature(line)
            tag = model.evaluate_line(sess, model_inputs)
            result = result_to_json(raw_inputs, tag)
            result = js.dumps(result,
                              ensure_ascii=False,
                              indent=4,
                              separators=(',', ': '))
            with open('./result/result.json', 'w', encoding='utf-8') as f:
                f.write(result)
            print("Prediction result: {}".format(result))

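
# Hedged usage sketch (not part of the original module): how predict_line
# might be driven from a small argparse-style namespace.  The attribute
# names mirror the ones referenced above (test_log_file, dict_file,
# ckpt_path); the paths are illustrative, and the real Model class likely
# requires additional hyperparameter attributes on `param`.
if __name__ == '__main__':
    from argparse import Namespace

    demo_param = Namespace(
        test_log_file='./log/test.log',   # assumed log location
        dict_file='./data/dict.pkl',      # assumed mapping-dict file
        ckpt_path='./ckpt/',              # directory holding the trained checkpoint
    )
    # predict_line loops forever, reading sentences from stdin and writing
    # each prediction to ./result/result.json before printing it.
    predict_line(demo_param)
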
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load mutations
    mut_df = pd.read_csv(args.mutations_file, sep='\t')
    samples = list(OrderedDict.fromkeys(mut_df[PATIENT]))
    sample_index = dict(zip(samples, range(len(samples))))
    logger.info('- Loaded %s mutations from %s samples' %
                (len(mut_df), len(samples)))

    # Set up multiprocessing
    logger.info('[Classifying kataegis (%s+ mutations and IMD<=%s)]' %
                (KATAEGIS_MIN_MUT, KATAEGIS_IMD))
    from sklearn.externals.joblib import Parallel, delayed
    import multiprocessing

    available_cpus = multiprocessing.cpu_count()
    if args.n_cores == -1:
        n_cores = available_cpus
    else:
        n_cores = min(args.n_cores, available_cpus)

    # Classify kataegis
    def data_producer():
        for patient, patient_df in mut_df.groupby([PATIENT]):
            for chrom, chrom_df in patient_df.groupby([CHR]):
                imd = chrom_df[IMD].tolist()
                imd[0] = 0
                yield imd, patient, chrom

    def sort_patient_chrom(p, chrom):
        if chrom == 'X':
            return sample_index[p], 23
        elif chrom == 'Y':
            return sample_index[p], 24
        else:
            return sample_index[p], int(chrom)

    results = Parallel(n_jobs=n_cores, verbose=10)(
        delayed(find_kataegis_loci)(imd, s, chrom)
        for imd, s, chrom in data_producer())
    results.sort(key=lambda k: sort_patient_chrom(k[1], k[2]))
    mut_df[KATAEGIS] = [
        c for sample_chrom_c, _, _ in results for c in sample_chrom_c
    ]
    logger.info('- Identified %s mutations participating in kataegis loci' %
                mut_df[KATAEGIS].sum())

    # Save updated mutations dataframe to file
    logger.info('[Outputting updated dataframe to file]')
    logger.info('- Writing table to %s' % args.output_file)
    mut_df.to_csv(args.output_file, sep='\t', index=False)

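
# A minimal sketch of the interface assumed for `find_kataegis_loci`: it is
# called with a per-chromosome list of inter-mutation distances (IMD) plus
# the patient and chromosome labels, and must return (flags, patient, chrom)
# so the caller above can sort and flatten the results.  The run-detection
# logic below is an illustration (runs of at least min_mut mutations whose
# IMD is <= max_imd), not the project's actual implementation; in practice
# the module constants KATAEGIS_MIN_MUT and KATAEGIS_IMD would be used.
def find_kataegis_loci_sketch(imd, patient, chrom, min_mut=6, max_imd=1000):
    flags = [0] * len(imd)
    run_start = 0
    for i in range(1, len(imd) + 1):
        # A run breaks when the distance to the previous mutation is too
        # large, or when we reach the end of the chromosome.
        if i == len(imd) or imd[i] > max_imd:
            if i - run_start >= min_mut:
                for j in range(run_start, i):
                    flags[j] = 1
            run_start = i
    return flags, patient, chrom
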
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load the signatures
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    categories = list(sig_df.columns)
    category_index = dict(zip(categories, range(len(categories))))
    logger.info('- Loaded %s x %s signature matrix' % sig_df.values.shape)

    # Load the mutations
    mut_df = pd.read_csv(args.mutations_file, sep='\t',
                         usecols=[PATIENT, CATEGORY, MUT_DIST])
    samples = sorted(set(mut_df[PATIENT]))
    logger.info('- Loaded %s mutations in %s samples' %
                (len(mut_df), len(samples)))

    # Add the category index and create sequences of mutations
    logger.info('[Processing data into SigMa format]')
    mut_df[CATEGORY_IDX] = mut_df.apply(lambda r: category_index[r[CATEGORY]],
                                        axis='columns')
    sampleToSequence = dict((s, list(map(int, s_df[CATEGORY_IDX])))
                            for s, s_df in mut_df.groupby([PATIENT]))
    sampleToPrevMutDists = dict((s, list(map(float, s_df[MUT_DIST])))
                                for s, s_df in mut_df.groupby([PATIENT]))

    # Save to JSON
    logger.info('- Saving to file %s' % args.output_file)
    with open(args.output_file, 'w') as OUT:
        output = dict(sampleToSequence=sampleToSequence,
                      sampleToPrevMutDists=sampleToPrevMutDists,
                      samples=samples,
                      categories=categories,
                      params=vars(args))
        json.dump(output, OUT)

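
# Hedged round-trip example: how downstream code might read the JSON written
# by run() above and rebuild the per-sample sequences.  The keys match the
# `output` dict constructed above; the file path is a placeholder.
import json

def load_sigma_input(path):
    with open(path) as IN:
        obj = json.load(IN)
    # Each sample maps to its sequence of category indices and to the
    # distances from each mutation to the previous one.
    return (obj['samples'], obj['categories'],
            obj['sampleToSequence'], obj['sampleToPrevMutDists'])

# Example (assuming run() was called with output_file='sigma_input.json'):
# samples, categories, seqs, dists = load_sigma_input('sigma_input.json')
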
def test(param):
    # Check parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"

    # Get the batch manager for the test set
    test_manager = BatchManager(param.test_batch_size, name='test')
    number_dataset = test_manager.len_data
    print("total number of test data is {}".format(number_dataset))

    # Configure logging
    logger = get_logger(param.test_log_file)
    # Load the mapping dictionary
    mapping_dict = get_dict(param.dict_file)
    # Build the model
    model = Model(param, mapping_dict)

    # Configure GPU options
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        logger.info("start testing...")
        start = time.time()
        # First check whether a trained model exists
        ckpt_path = param.ckpt_path
        ckpt = tf.train.get_checkpoint_state(ckpt_path)
        # If a trained checkpoint exists, restore it
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            logger.info("Reading model parameters from {}".format(
                ckpt.model_checkpoint_path))
            # Reload the saved parameters
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            logger.info("Cannot find the ckpt files!")
        # Run the evaluation
        evaluate(sess, param, model, "test", test_manager, logger)
        logger.info("The best_f1 on test_dataset is {:.2f}".format(
            model.best_test_f1.eval()))
        logger.info('Time test for {:.2f} batch is {:.2f} sec\n'.format(
            param.test_batch_size, time.time() - start))

def __init__(self, name, word_to_id, id_to_tag, parameters):
    self.logger = get_logger(name)
    self.params = parameters
    self.num_words = len(word_to_id)
    self.learning_rate = self.params.lr
    self.global_step = tf.Variable(0, trainable=False)
    self.initializer = tf.contrib.layers.xavier_initializer
    self.tags = [tag for i, tag in id_to_tag.items()]
    self.tag_num = len(self.tags)

    # add placeholders for the model
    self.inputs = tf.placeholder(dtype=tf.int32,
                                 shape=[None, self.params.word_max_len],
                                 name="Inputs")
    self.labels = tf.placeholder(dtype=tf.int32,
                                 shape=[None, self.params.word_max_len],
                                 name="Labels")
    # seq-len
    self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name="Lengths")
    if self.params.feature_dim:
        self.features = tf.placeholder(
            dtype=tf.float32,
            shape=[None, self.params.word_max_len, self.params.feature_dim],
            name="Features")
    self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")

    # get embedding of input sequence
    embedding = self.get_embedding(self.inputs, word_to_id)
    # apply dropout on embedding
    rnn_inputs = tf.nn.dropout(embedding, self.dropout)
    # concat extra features with embedding
    if self.params.feature_dim:
        rnn_inputs = tf.concat([rnn_inputs, self.features], 2)
        # rnn_inputs = tf.concat(2, [rnn_inputs, self.features])
    # extract features
    rnn_features = self.bilstm_layer(rnn_inputs)
    # projection layer
    self.scores = self.project_layer(rnn_features, self.tag_num)
    # calculate loss of crf layer
    self.trans, self.loss = self.loss_layer(self.scores, self.tag_num)

    # optimizer of the model
    self.opt = tf.train.AdamOptimizer(self.learning_rate)
    # apply grad clip to avoid gradient explosion
    grads_vars = self.opt.compute_gradients(self.loss)
    capped_grads_vars = [(tf.clip_by_value(g, -self.params.clip,
                                           self.params.clip), v)
                         for g, v in grads_vars]  # gradient capping
    self.train_op = self.opt.apply_gradients(capped_grads_vars,
                                             self.global_step)
    self.saver = tf.train.Saver(tf.global_variables())

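
# A minimal sketch (assumption, not the original implementation) of the
# `loss_layer` called above: a linear-chain CRF negative log-likelihood over
# the projected scores, returning the transition matrix and the loss so that
# `self.trans, self.loss = self.loss_layer(...)` unpacks correctly.  Written
# as a method body; it relies on the placeholders defined in __init__.
def loss_layer_sketch(self, scores, tag_num):
    with tf.variable_scope("crf_loss"):
        log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=scores,                  # [batch, max_len, tag_num]
            tag_indices=self.labels,        # [batch, max_len]
            sequence_lengths=self.lengths)  # [batch]
        loss = tf.reduce_mean(-log_likelihood)
    return trans, loss
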
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load the signatures
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    categories = list(sig_df.columns)
    category_index = dict(zip(categories, range(len(categories))))
    logger.info('- Loaded %s x %s signature matrix' % sig_df.values.shape)

    # Load the mutations
    use_strand_cols = list(set(STRAND_COLUMNS) - {SAME_ALLELE, SAME_STRAND})
    mut_df = pd.read_csv(args.mutations_file, sep='\t',
                         usecols=[PATIENT, CATEGORY, REF, CHR, KATAEGIS] +
                         use_strand_cols)
    samples = sorted(set(mut_df[PATIENT]))
    logger.info('- Loaded %s mutations in %s samples' %
                (len(mut_df), len(samples)))

    # Add the category index and create sequences of mutations
    logger.info('[Processing data into sMMM format]')
    mut_df[CATEGORY_IDX] = mut_df.apply(lambda r: category_index[r[CATEGORY]],
                                        axis='columns')
    mut_df[SAME_ALLELE] = mut_df[REF]
    #mut_df[SAME_STRAND] = mut_df[REF].replace({REF: {'A': 0, 'C': 1, 'G': 0, 'T': 1}})
    mut_df[SAME_STRAND] = mut_df[REF]
    mut_df = mut_df.replace({SAME_STRAND: {'A': 0, 'C': 1, 'G': 0, 'T': 1}})

    sampleToStrandMatch = {s: {} for s in samples}
    sampleToKataegis = {}
    for s, sample_df in mut_df.groupby([PATIENT]):
        # Go sample by sample to figure out when adjacent mutations match
        sampleToKataegis[s] = sample_df[KATAEGIS].tolist()
        for col in STRAND_COLUMNS:
            # Compare adjacent mutations
            xs = np.array(sample_df[col].tolist())
            if col in {SAME_ALLELE, SAME_STRAND}:
                matched = xs == np.roll(xs, 1)
            else:
                # Convert NaNs to 0 (False) because otherwise they could
                # break the comparison
                xs[np.isnan(xs)] = 0
                xs = xs.astype(bool)
                matched = xs & np.roll(xs, 1)
            # Finally, set the first position to False (to be safe)
            matched[0] = False
            sampleToStrandMatch[s][col] = matched.astype(int).tolist()
        sampleToStrandMatch[s][NO_STRAND] = [0] + [1] * (len(sample_df) - 1)

    sampleToSequence = dict((s, list(map(int, s_df[CATEGORY_IDX])))
                            for s, s_df in mut_df.groupby([PATIENT]))
    chromosomeMap = dict((s, list(s_df[CHR]))
                         for s, s_df in mut_df.groupby([PATIENT]))

    # Save to JSON
    logger.info('- Saving to file %s' % args.output_file)
    with open(args.output_file, 'w') as OUT:
        output = dict(sampleToSequence=sampleToSequence,
                      sampleToStrandMatch=sampleToStrandMatch,
                      sampleToKataegis=sampleToKataegis,
                      chromosomeMap=chromosomeMap,
                      samples=samples,
                      categories=categories,
                      params=vars(args))
        json.dump(output, OUT)

def main(args):
    """
    The main function for reproducing the paper's results
    :param command: 'loo' for leave-one-out, 'viterbi' for Viterbi
    :param model: 'sigma' or 'mmm'
    :param batch: takes indices from batch * batch_size to (batch + 1) * batch_size
    :param batch_size: see batch
    :param threshold: maximal distance (in bp) within clouds; use 0 to not
        split (i.e. use whole chromosomes)
    :param max_iterations: maximal number of iterations for training the model
    :param epsilon: minimum improvement per training iteration; if the
        improvement is lower, stop training
    :param random_state: random state used to initialize the models
    :param out_dir: where to save all the files
    :return:
    """
    # Create simple logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Get the list of samples we're going to run on
    with open(args.mutations_file, 'r') as IN:
        obj = json.load(IN)
        samples = obj.get('samples')
        categories = obj.get('categories')

    if len(args.sample_names) == 0:
        sample_indices = range(len(samples))
    else:
        sample_indices = [samples.index(s) for s in args.sample_names]
    logger.info('- Loading data for %s samples' % len(sample_indices))

    # Load the emissions matrix
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    emissions = sig_df.values
    if len(args.active_signatures) > 0:
        emissions = emissions[np.array(args.active_signatures) - 1]
    assert (list(sig_df.columns) == categories)
    logger.info('- Loaded %s x %s emissions matrix' % emissions.shape)

    # if threshold <= 0:
    #     out_dir_for_file = os.path.join(out_dir, model)
    #     threshold = 1e99
    # else:
    #     out_dir_for_file = os.path.join(out_dir, model + '_' + str(threshold))

    experiment_tuples = get_split_sequences_by_threshold(
        args.mutations_file, args.cloud_threshold, sample_indices)

    # Perform the experiments
    logger.info('[Performing experiments]')
    if args.cross_validation_mode:
        logger.info('- Cross-validation mode')
        func = leave_one_out
    else:
        logger.info('- Viterbi mode')
        func = get_viterbi

    for experiment_tuple in tqdm(experiment_tuples,
                                 total=len(sample_indices), ncols=80):
        sample = experiment_tuple[0]
        out_file = '%s/%s-%s' % (args.output_directory, args.model_name, sample)
        dict_to_save = func(experiment_tuple, args.model_name, emissions,
                            args.max_iter, args.tolerance, args.random_state)
        to_json(out_file, dict_to_save)

    logger.info('- Done')

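
# A minimal sketch (assumption) of the `to_json` helper used above: it
# serializes the per-sample result dict to disk.  The exact file naming
# (whether a '.json' suffix is appended to out_file) and any numpy-type
# conversion in the real helper are assumptions.
import json

def to_json_sketch(out_file, dict_to_save):
    with open(out_file + '.json', 'w') as OUT:
        json.dump(dict_to_save, OUT)
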
def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load the signatures
    sig_df = pd.read_csv(args.signatures_file, sep='\t', index_col=0)
    categories = list(sig_df.columns)
    category_index = dict(zip(categories, range(len(categories))))
    logger.info('- Loaded %s x %s signature matrix' % sig_df.values.shape)

    # Load the mutations
    mut_df = pd.read_csv(
        args.mutations_file, sep='\t',
        usecols=[PATIENT, CATEGORY, MUT_DIST, CHROMOSOME, START_POS],
        dtype={
            PATIENT: str,
            CATEGORY: str,
            MUT_DIST: np.float,
            CHROMOSOME: str,
            START_POS: int
        })
    samples = sorted(set(mut_df[PATIENT]))
    logger.info('- Loaded %s mutations in %s samples' %
                (len(mut_df), len(samples)))

    # If a mappability blacklist is provided, use it to remove mutations
    if args.mappability_blacklist_file is not None:
        # Load the dataframe and process into a dictionary
        mappability_df = pd.read_csv(args.mappability_blacklist_file, sep=',')
        chrom_idx, start_idx, stop_idx = mappability_df.columns[:3]
        map_blacklist = defaultdict(list)
        unmappable_bases = 0
        for _, r in mappability_df.iterrows():
            chrom = r[chrom_idx][3:]
            map_blacklist[chrom].append((int(r[start_idx]), int(r[stop_idx])))
            unmappable_bases += (map_blacklist[chrom][-1][1] -
                                 map_blacklist[chrom][-1][0])
        logger.info(
            '- Loaded unmappable regions spanning %s bases in %s chromosomes'
            % (unmappable_bases, len(map_blacklist)))

        # Remove mutations that fall in a blacklisted region
        logger.info('[Removing unmappable mutations]')

        def mappable(r):
            for start, stop in map_blacklist[r[CHROMOSOME]]:
                if start <= r[START_POS] <= stop:
                    return False
            return True

        n_muts = len(mut_df)
        mut_df = mut_df[mut_df.apply(mappable, axis='columns')]
        n_unmappable = n_muts - len(mut_df)
        logger.info('\t-> Removed %s mutations that were not mappable' %
                    n_unmappable)

    # Add the category index and create sequences of mutations
    logger.info('[Processing data into SigMa format]')
    mut_df[CATEGORY_IDX] = mut_df.apply(lambda r: category_index[r[CATEGORY]],
                                        axis='columns')
    sampleToSequence = dict((s, list(map(int, s_df[CATEGORY_IDX])))
                            for s, s_df in mut_df.groupby([PATIENT]))
    sampleToPrevMutDists = dict((s, list(map(float, s_df[MUT_DIST])))
                                for s, s_df in mut_df.groupby([PATIENT]))

    # Save to JSON
    logger.info('- Saving to file %s' % args.output_file)
    with open(args.output_file, 'w') as OUT:
        output = dict(sampleToSequence=sampleToSequence,
                      sampleToPrevMutDists=sampleToPrevMutDists,
                      samples=samples,
                      categories=categories,
                      params=vars(args))
        json.dump(output, OUT)

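
# Hedged illustration of the mappability-blacklist format assumed above: a
# comma-separated file whose first three columns are chromosome (with a
# 'chr' prefix, which r[chrom_idx][3:] strips), start, and stop.  The column
# names and coordinates below are made up for demonstration.
import io
import pandas as pd

demo_blacklist = io.StringIO(
    "chrom,start,stop\n"
    "chr1,120000,125000\n"
    "chr2,300000,301000\n"
)
demo_df = pd.read_csv(demo_blacklist, sep=',')
chrom_col, start_col, stop_col = demo_df.columns[:3]
# 'chr1' -> '1', matching the CHROMOSOME values used for the mutations
print(demo_df[chrom_col].str[3:].tolist())  # ['1', '2']
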
def train(param):
    # Check parameters
    assert param.clip < 5.1, "gradient clip shouldn't be too large"
    assert 0 <= param.dropout < 1, "dropout rate between 0 and 1"
    assert param.lr > 0, "learning rate must be larger than zero"

    # Prepare the data
    train_manager = BatchManager(param.batch_size, name='train')
    number_dataset = train_manager.len_data
    print("total number of train data is {}".format(number_dataset))

    # Create the required directories
    make_path(param)
    # Configure logging
    logger = get_logger(param.train_log_file)
    # Load the mapping dictionary
    mapping_dict = get_dict(param.dict_file)
    # Load senc_tag in preparation for loading the word vectors later
    senc_tag = get_sent_tag(param.sent_tag_file)
    # Load the pretrained embeddings
    dico_chars, char_to_id, id_to_char = augment_with_pretrained(
        mapping_dict['word'][2].copy(), param.emb_file,
        list(itertools.chain.from_iterable([[w[0] for w in s]
                                            for s in senc_tag])))

    # Total number of training batches (steps per epoch)
    steps_per_epoch = train_manager.len_data

    # Configure GPU options
    gpu_config = tf.ConfigProto()
    with tf.Session(config=gpu_config) as sess:
        # Initialize the model
        model = creat_model(sess, Model, param.ckpt_path, load_word2vec, param,
                            id_to_char, logger, map_all=mapping_dict)
        for i in range(param.max_epoch):
            loss = []
            total_loss = 0
            # Record the epoch start time
            start = time.time()
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, batch)
                # Track the running average loss
                loss.append(batch_loss)
                # Accumulate the total loss; the epoch average is computed later
                total_loss += batch_loss
                if step % 5 == 0:
                    logger.info(
                        "epoch:{}, step:{}/{}, avg_loss:{:>9.4f}".format(
                            i + 1, step % steps_per_epoch, steps_per_epoch,
                            np.mean(loss)))
            # Save the model
            model.save_model(sess, logger, i)
            logger.info('Epoch {}, total Loss {:.4f}'.format(
                i + 1, total_loss / train_manager.len_data))
            logger.info(
                'Time taken for one epoch {:.4f} min, take {:.2f} h for rest of epochs\n'
                .format((time.time() - start) / 60,
                        ((param.max_epoch - i - 1) * (time.time() - start)) / 3600))

def run(args):
    # Set up logger
    logger = get_logger(args.verbosity)
    logger.info('[Loading input data]')

    # Load strand information and process into dictionaries
    strand_df = pd.read_csv(args.strand_info_file, sep='\t')
    strand_keys = list(zip(strand_df['tChr'], strand_df['tPos0']))
    is_left = dict(zip(strand_keys, map(bool, strand_df['tIsLeft'])))
    is_right = dict(zip(strand_keys, map(bool, strand_df['tIsRight'])))
    is_tx_plus = dict(zip(strand_keys, map(bool, strand_df['tTxPlus'])))
    is_tx_minus = dict(zip(strand_keys, map(bool, strand_df['tTxMinus'])))
    logger.info('- Loaded strand information in %s bins' % len(strand_df))

    # Define the main categorization-by-strand function. We return six categories:
    # 1) Lagging strand
    # 2) Leading strand
    # 3) Template (genic regions only)
    # 4) Non-template
    # 5) Transcription and replication in the same direction (genic regions only)
    # 6) Transcription and replication in opposite directions
    def categorize_by_strand(mut):
        # Get the chromosome and bin start positions
        chrom = 'chr%s' % mut[CHR]
        pos = mut[POS_START]
        bin_start = np.floor(pos / 20000) * 20000
        strand_key = sk = (chrom, bin_start)
        is_ref_plus = mut[REF] in {'C', 'T'}
        is_ref_minus = not is_ref_plus

        # Classify leading/lagging
        if not (is_left[sk] or is_right[sk]):
            leading = np.nan
            lagging = np.nan
        elif (is_ref_plus and is_left[sk]) or (is_ref_minus and is_right[sk]):
            lagging = 0
            leading = 1
        else:
            lagging = 1
            leading = 0

        # Classify template/non-template
        if not (is_tx_plus[sk] or is_tx_minus[sk]):
            template = np.nan
            non_template = np.nan
        elif (is_ref_plus and is_tx_plus[sk]) or (is_ref_minus and is_tx_minus[sk]):
            template = 0
            non_template = 1
        else:
            template = 1
            non_template = 0

        # Classify tx/rep as same/opposite
        if not (is_left[sk] or is_right[sk]) or not (is_tx_plus[sk] or is_tx_minus[sk]):
            rep_tx_same = np.nan
            rep_tx_opposite = np.nan
        elif (is_right[sk] and is_tx_plus[sk]) or (is_left[sk] and is_tx_minus[sk]):
            rep_tx_same = 1
            rep_tx_opposite = 0
        else:
            rep_tx_same = 0
            rep_tx_opposite = 1

        return leading, lagging, template, non_template, rep_tx_same, rep_tx_opposite

    # Load mutations
    mut_df = pd.read_csv(args.mutations_file, sep='\t')
    columns = mut_df.columns
    samples = set(mut_df[PATIENT])
    logger.info('- Loaded %s mutations from %s samples' %
                (len(mut_df), len(samples)))

    # Add strand information
    logger.info('[Adding strand information]')
    strand_categories = list(
        zip(*[categorize_by_strand(m) for _, m in mut_df.iterrows()]))
    strand_category_names = [
        LEADING, LAGGING, TEMPLATE, NON_TEMPLATE, REP_TX_SAME_DIR, REP_TX_OPP_DIR
    ]
    for i, strand_cat in enumerate(strand_category_names):
        mut_df[strand_cat] = strand_categories[i]
        logger.info('- %s mutations "%s"' %
                    (np.nansum(strand_categories[i]), strand_cat))

    # Save updated mutations dataframe to file
    logger.info('[Outputting updated dataframe to file]')
    logger.info('- Writing table to %s' % args.output_file)
    mut_df = mut_df[columns.tolist() + strand_category_names]
    mut_df.to_csv(args.output_file, sep='\t', index=False)

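
# Hedged worked example of the bin lookup used in categorize_by_strand: each
# mutation is assigned to the 20 kb bin containing its position, and the
# (chromosome, bin start) pair indexes the strand dictionaries built from
# the strand-info table.  The position and chromosome below are illustrative.
import numpy as np

pos = 4123456
bin_start = np.floor(pos / 20000) * 20000   # -> 4120000.0
strand_key = ('chr1', bin_start)
# is_left[strand_key], is_tx_plus[strand_key], ... are then looked up to
# decide leading/lagging, template/non-template, and same/opposite direction.
print(strand_key)  # ('chr1', 4120000.0)
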
# FIXME
# 1. change save_dir working
# 2. load_emb : increase vocab size
# 3. include out_predict

args = utils.parse_args()
LOG_FILE = args.save_dir + "/log_file"

# set up logging
# logger will output to both sys.stdout and
# to log_file in save_dir
logger = utils.get_logger("Bi-LSTM_CNN_CRF", LOG_FILE)
logger.info("#" * 50)
data_utils.get_logger(LOG_FILE)
collection.get_log_file_path(LOG_FILE)

# generate data set for training
data = data_utils.load_data(args)

train_char_x = data["char"][0]
train_word_x = data["train_word"][0]
train_word_y = np.expand_dims(data["train_word"][1], -1)
train_orth_char_x = data["orth_char"][0]
train_orth_x = data["train_orth"]
n_train_examples = len(train_word_x)

dev_char_x = data["char"][1]
dev_word_x = data["dev_word"][0]
dev_word_y = data["dev_word"][1]
dev_mask = data["dev_word"][2]

from __future__ import print_function, division
import os
import time
import argparse
from glob import glob

from tensorflow import keras
import numpy as np
from image_similarity_measures.quality_metrics import psnr, uiq, sam, sre

from data_utils import get_logger
from patches import recompose_images, OpenDataFilesTest, OpenDataFiles

logger = get_logger(__name__)

SCALE = 2000
MODEL_PATH = "../models/"


def write_final_dict(metric, metric_dict):
    # Create a directory for the text file holding the evaluation values.
    predict_path = "val_predict/"
    if not os.path.exists(predict_path):
        os.makedirs(predict_path)
    with open(os.path.join(predict_path, metric + '.txt'), 'w') as f:
        f.writelines('{}:{}\n'.format(k, v) for k, v in metric_dict.items())

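
# Hedged usage example of write_final_dict: the metric name becomes the file
# name under val_predict/ and each key/value pair becomes one "key:value"
# line.  The identifiers and scores below are made up.
demo_scores = {'tile_0001': 31.7, 'tile_0002': 29.4}  # illustrative PSNR values
write_final_dict('psnr', demo_scores)
# -> val_predict/psnr.txt containing "tile_0001:31.7" and "tile_0002:29.4"
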
def test():
    """
    Function to test the models
    :return:
    """
    models = list()
    for i in range(utils.num_folds):
        models.append(
            load_model(model_utils.model_filepath.format(i),
                       custom_objects={'mywloss': mywloss}))

    import time
    start = time.time()
    remain_df = None

    def the_unique(x):
        # Drop consecutive duplicate ids while preserving order
        return [x[i] for i in range(len(x)) if x[i] != x[i - 1]]

    for i_c, df in enumerate(
            pd.read_csv(utils.test_data_filepath,
                        chunksize=utils.test_chunksize,
                        iterator=True)):
        unique_ids = the_unique(df['object_id'].tolist())
        new_remain_df = df.loc[df['object_id'] == unique_ids[-1]].copy()
        if remain_df is None:
            df = df.loc[df['object_id'].isin(unique_ids[:-1])].copy()
        else:
            df = pd.concat(
                [remain_df, df.loc[df['object_id'].isin(unique_ids[:-1])]],
                axis=0)
        # Create remaining samples df
        remain_df = new_remain_df

        preds_np_arr = predict_chunk(df_=df, models=models)
        preds_df = pd.DataFrame(data=preds_np_arr)
        print('Shape of predictions: {}'.format(preds_np_arr.shape))
        if i_c == 0:
            preds_df.to_csv(utils.predictions_file,
                            header=False,
                            index=False,
                            float_format='%.6f')
        else:
            preds_df.to_csv(utils.predictions_file,
                            header=False,
                            mode='a',
                            index=False,
                            float_format='%.6f')
        del preds_np_arr, preds_df
        gc.collect()

        if (i_c + 1) % 10 == 0:
            utils.get_logger().info('%15d done in %5.1f' %
                                    (utils.test_chunksize * (i_c + 1),
                                     (time.time() - start) / 60))
            print('%15d done in %5.1f' % (utils.test_chunksize * (i_c + 1),
                                          (time.time() - start) / 60))

    # Compute last object in remain_df
    preds_np_arr = predict_chunk(df_=remain_df, models=models)
    preds_df = pd.DataFrame(data=preds_np_arr)
    preds_df.to_csv(utils.predictions_file,
                    header=False,
                    mode='a',
                    index=False,
                    float_format='%.6f')

    z = pd.read_csv(utils.predictions_file)
    z = z.groupby('object_id').mean()
    z.to_csv(utils.final_predictions_file, index=True, float_format='%.6f')

if __name__ == '__main__':
    gc.enable()
    utils.create_logger()
    try:
        test()
    except Exception:
        utils.get_logger().exception('Unexpected Exception Occurred')
        raise

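
# Hedged toy demo of the chunk-boundary bookkeeping used in test() above:
# the rows for the last object_id in each chunk are held back in remain_df
# so that an object split across two chunks is always scored in one piece.
# The data below is made up, and predict_chunk is replaced by a print.
import pandas as pd

chunks = [
    pd.DataFrame({'object_id': [1, 1, 2, 2, 3]}),   # object 3 continues ...
    pd.DataFrame({'object_id': [3, 3, 4]}),          # ... into this chunk
]
remain = None
for chunk in chunks:
    ids = chunk['object_id'].drop_duplicates().tolist()
    held_back = chunk[chunk['object_id'] == ids[-1]]
    ready = chunk[chunk['object_id'].isin(ids[:-1])]
    if remain is not None:
        ready = pd.concat([remain, ready], axis=0)
    remain = held_back
    print('scoring objects', sorted(ready['object_id'].unique()))
# Finally score the held-back rows, as test() does with remain_df
print('scoring objects', sorted(remain['object_id'].unique()))
# -> [1, 2], then [3], then [4]
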